diff --git a/.gitignore b/.gitignore
index ab61416d..2e7ef6bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,3 +183,4 @@ dmypy.json
 tests/benchmarks/profiling/*.speedscope.json
 tests/benchmarks/profiling/*.memray.bin
 tests/benchmarks/profiling/*.flamegraph.html
+tests/benchmarks/profiling/*.perf.data
diff --git a/CLAUDE.md b/CLAUDE.md
index 50ce5fd5..42ca5a1b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -168,7 +168,9 @@ pixi run -e dev typecheck
 pixi run -e docs doc
 ```
 
-The build system uses Maturin (Rust + Python). Rust code is compiled automatically when running tests via pixi.
+The build system uses Maturin (Rust + Python).
+
+**IMPORTANT — rebuild Rust before testing Rust changes:** `pixi run -e dev pytest` (and `pixi run -e dev test`) do **not** rebuild the Rust extension. After editing anything in `src/`, run `pixi run -e dev maturin develop --release` first, or pytest silently imports the *stale* compiled extension — parity/integration tests then pass or fail against the old binary, not your change. (`cargo test`/`cargo-test` compile from source and are unaffected; this only bites the Python tests that import the extension.)
 
 **Before pushing a change that renames/removes a public symbol or touches shared code, run the full tree** (`pixi run -e dev pytest tests -q`, or the full `pixi run -e dev test`). Scoped runs like `pytest tests/dataset` skip `tests/unit/` (e.g. `tests/unit/dataset/test_build_reconstructor.py`), so a stale reference there fails only in CI.
 
diff --git a/Cargo.toml b/Cargo.toml
index 66a7242f..431165cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,3 +29,11 @@ features = ["abi3-py310"]
 
 [dev-dependencies]
 rstest = "0.26.1"
+
+# Perf call-graph attribution only (`perf report --children`). Inherits release
+# codegen and adds line tables + frame pointers. NEVER the gate artifact — all
+# throughput/asm gate numbers come from the plain `--release` build.
+[profile.profiling]
+inherits = "release"
+debug = "line-tables-only"
+force-frame-pointers = true
diff --git a/docs/handoffs/2026-06-25-phase5-getitem-optimization.md b/docs/handoffs/2026-06-25-phase5-getitem-optimization.md
new file mode 100644
index 00000000..4401d1c6
--- /dev/null
+++ b/docs/handoffs/2026-06-25-phase5-getitem-optimization.md
@@ -0,0 +1,326 @@
+# Handoff: Phase 5 — fully optimize `Dataset.__getitem__` (targets 5, 6, 7 + rayon)
+
+**Date:** 2026-06-25
+**Status:** Not started. Four parallel-ready workstreams.
+**Audience:** GenVarLoader maintainers / per-workstream sessions.
+**Roadmap:** `docs/roadmaps/rust-migration.md` — Phase 5 ⬜, "Optimization targets — round 2" (targets 5/6/7).
+**Base branch:** `zero-copy-scale-safe-readpath` (format 2.0 SoA + zero-copy FFI + sub-linear cache + uninit buffers; PR TBD). All four workstreams branch from here.
+
+## TL;DR
+
+Phase 3 profiling (de-noised `test_e2e.py` benchmark + `perf` on the Python process) left three
+single-thread deficits on the read path, then rayon batch parallelism as the capstone:
+
+| # | Workstream | What | Kind | Parallel? |
+|---|---|---|---|---|
+| **5** | tracks-only ndarray slicing | hoist `out.as_slice_mut()` in `intervals_to_tracks`, drop per-interval `SliceInfo` | rust-only, **byte-identical** | now |
+| **6** | strand reverse-complement | fold RC into **all** reconstruct/track kernels (incl. splice); delete `reverse_complement_ragged` | parity-gated (strand=-1) | now |
+| **7** | variant-windows assembly | replace the per-batch `_FlatWindow`/`_FlatAlleles` object graph with **one Rust call** returning flat `(data, offsets)` | parity-gated | now |
+| **rayon** | batch parallelism | `par_iter` over disjoint per-query slices in the fused kernels | parity-trivial (disjoint) | **after 5/6/7 merge** |
+
+**Run 5, 6, 7 concurrently. Rayon is blocked until 5+6+7 land** — the roadmap is explicit that
+parallelizing before the single-thread work just scales the numpy RC pass (6) and the ndarray
+slicing (5). Each workstream is its own branch + its own parity-gated PR.
+
+The measured starting point (branch `zero-copy-scale-safe-readpath`, `chr22_geuv.gvl`, `with_len(16384)`,
+BATCH=32, `NUMBA_NUM_THREADS=1`, Carter EPYC 7543), **min rust ÷ min numba** ms/batch:
+
+| Mode | rust ÷ numba | note |
+|---|---|---|
+| tracks-only | **0.63×** (rust slower) | target 5 fixes this |
+| tracks (seqs + read-depth) | 0.95× | shares the target-5 kernel |
+| haplotypes | 0.94× | target 6 is its biggest sink (~19% self / 28% incl RC) |
+| annotated | **1.68×** (rust faster) | already a win post-format-2.0 |
+
+---
+
+## Shared context (every session reads this first)
+
+### Where this sits
+
+Phases 0–3 ported the read path to Rust behind a per-kernel dispatch registry
+(`python/genvarloader/_dispatch.py`, default `rust`, `GVL_BACKEND=numba` override). The numba
+kernels are **retained as registered parity oracles** (deleted wholesale later in Phase 5 — NOT in
+these workstreams). The read path is fused: `__getitem__` → `QueryView.recon(...)` → one of the
+fused FFI kernels in `src/ffi/mod.rs`.
+
+### How to measure (use this, not py-spy `--native`)
+
+py-spy `--native` slows the deep-stack haplotype paths ~10× and times out. Use `perf` on the Python
+process — no sudo on Carter (`perf_event_paranoid=2`), near-zero overhead, resolves
+`genvarloader.abi3.so` Rust symbols:
+
+```bash
+NUMBA_NUM_THREADS=1 perf record -F 999 -o p.data -- .pixi/envs/dev/bin/python \
+    tests/benchmarks/profiling/profile.py --mode <mode> --n-batches 12000
+perf report --stdio --no-children -i p.data        # flat self-time, Rust symbols resolved
+```
+
+`profile.py --mode {haplotypes,annotated,tracks,tracks-seqs,variants,variant-windows}`. Run 8–25k
+batches so steady state drowns import/JIT. For the rust↔numba ratio use the de-noised
+`pytest-benchmark` harness in `tests/benchmarks/test_e2e.py`: `_bench_indexing` uses
+`benchmark.pedantic(iterations=10, rounds=50)` so per-batch OS jitter averages out — compare the
+**min** (cleanest CPU-bound estimate), not the mean. Build release first:
+`pixi run -e dev maturin develop --release`.
+
+### Parity (the landing gate)
+
+Every workstream lands only when output stays **byte-identical** to the numba oracle. The harness is
+`tests/parity/` (`_harness.py` run-both-assert-byte-identical, return-value + in-place variants) plus
+hypothesis property generators. The dataset-level backstop (`tests/parity/test_dataset_parity.py`)
+spies on the kernel to prove it actually runs on the live `__getitem__` path (guards against vacuous
+passes). Targets 5/7 are byte-identical by construction; target 6 is gated on **strand=-1** datasets
+(see its section). Run both backends:
+
+```bash
+pixi run -e dev pytest tests/parity -q                      # rust default
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q    # oracle
+pixi run -e dev cargo-test                                  # rust unit tests
+```
+
+### Before pushing
+
+Per `CLAUDE.md`: run the **full tree** on both backends before any push that touches shared code
+(`pixi run -e dev pytest tests -q`, then `GVL_BACKEND=numba …`) — scoped runs skip `tests/unit/`.
+Lint/format/typecheck: `pixi run -e dev ruff check python/ tests/ && ruff format … && typecheck`.
+Update `docs/roadmaps/rust-migration.md` (tick the target, record the re-measured ratio, set the PR
+link) as part of the work.
+
+### Parallel-session coordination
+
+- **One branch per workstream**, all off `zero-copy-scale-safe-readpath`. Use a git worktree per
+  session to avoid stepping on each other's working tree.
+- **File-overlap map** (plan rebases around these):
+  - Target 5: `src/intervals.rs` only (+ its cargo tests). **No overlap** with 6/7.
+  - Target 6: `src/intervals.rs` (track reverse), `src/ffi/mod.rs` + the reconstruct/track cores
+    under `src/{reconstruct,tracks,intervals}/`, `python/genvarloader/_dataset/_query.py`,
+    `_reconstruct.py`. **Overlaps target 5 in `intervals.rs`** and target 7 in `_query.py` — see below.
+  - Target 7: `python/genvarloader/_dataset/_flat_variants.py`, `_flat_flanks.py`, new
+    `src/variants/` code + `src/ffi/mod.rs`. **Overlaps target 6 in `src/ffi/mod.rs`** (additive — new
+    pyfunctions, low conflict risk).
+- **Merge order:** 5 first (smallest, rust-only), then 6 and 7 in either order; rebase the later ones.
+  Rayon last, after all three are on the base branch.
+- **HPC gotcha:** dataset tests need pytest's tmp on the same filesystem as `tests/data`
+  (`--basetemp=$(pwd)/.pytest_tmp`) or the write path's `os.link` hardlink fails cross-device (Errno 18).
+
+### Don't regress the format-2.0 read path
+
+The base branch replaced per-batch `np.ascontiguousarray` on per-sample-scale memmaps with `_ffi_array`
+(cross zero-copy or raise loudly) and caches sub-linear per-variant arrays on `Haps.ffi_static`
+(`_HapsFfiStatic`). `tests/integration/test_scale_guard.py` fails if any per-batch
+`np.ascontiguousarray` materializes a sample-scale memmap. Keep that test green — do **not** reintroduce
+`ascontiguousarray` on `geno_v_idxs` / `itv_*` / genotype memmaps.
+
+---
+
+## Target 5 — tracks-only ndarray slicing (rust-only, byte-identical)
+
+**Goal:** close the **0.63×** tracks-only deficit — the one read path where rust is clearly slower than
+numba — and get rust ahead single-threaded on the cheapest read.
+
+**Evidence (`perf` flat self-time, tracks-only path):** `intervals_to_tracks` 31% + `ndarray::slice_mut`
+**11%** + `ndarray::do_slice` **9.5%** ≈ **20.5%** in ndarray slice machinery. Source: the per-interval
+`out.slice_mut(s![a..b]).fill(value)` and the `out.fill(0.0)` prelude in
+`src/intervals.rs:66` / `:27`. numba compiles `out[a:b] = value` to a direct memset and pays none of this.
+tracks-only is the cheapest path (~1.1–1.7 ms) so this fixed per-interval cost dominates with no
+sequence work to amortize it.
+
+**Fix:** the `out` buffer is contiguous. Hoist `let out_slice = out.as_slice_mut().unwrap();` once at the
+top, then write `out_slice[out_s + s as usize .. out_s + e as usize].fill(value)` and
+`out_slice.fill(0.0)` on the raw `&mut [f32]` — dropping per-interval `SliceInfo` construction +
+bounds-check. Keep the exact clamp/break semantics (start clamped ≥0, end ≤length, break on
+`start >= length`, no-op when `e <= s`) — see the docstring at `src/intervals.rs:3-15`. This kernel is
+shared by the combined **tracks** path too, so that improves with it.
+
+**Files:** `src/intervals.rs` (`intervals_to_tracks` + its cargo tests). Nothing Python-side changes.
+
+**Parity:** **byte-identical by construction** — same arithmetic, same write order, just a different way to
+address the contiguous buffer. The 8 existing cargo unit tests (`src/intervals.rs:72+`) plus the
+`intervals_to_tracks` hypothesis parity gate and the tracks dataset backstop must stay green. No oracle
+change.
+
+**Perf gate:** re-measure tracks-only via `test_e2e.py`; target rust ÷ numba ≥ 1.0 (was 0.63×). Record in
+the roadmap's re-measurement block.
+
+**Start your session here:**
+1. Branch `opt/target-5-intervals-slice` off `zero-copy-scale-safe-readpath`.
+2. Read `src/intervals.rs` end-to-end (it's ~220 lines).
+3. TDD: the cargo tests already pin the contract — refactor under them, then add a profiling re-measure.
+4. Gate: `cargo-test` + `pytest tests/parity -q` (both backends) + tracks-only `test_e2e` re-measure.
+
+---
+
+## Target 6 — fold strand reverse-complement into the kernels (delete the numpy post-pass)
+
+**Goal:** delete the `reverse_complement_ragged` post-pass entirely (incl. the spliced per-element path)
+by emitting negative-strand regions already reverse-complemented from the Rust kernels. This is the
+**largest single-thread throughput lever** left and it is **backend-agnostic** (numba pays it too) — it
+must go before rayon, else we parallelize a numpy pass.
+
+**Evidence (py-spy, no `--native`, self-time):** RC post-pass is haplotypes **~19% self / ~28% inclusive**,
+variants **~15% / ~16%**, tracks-only **~10%**. Every negative-strand region triggers a Python/numpy RC
+pass *after* reconstruction.
+
+**Current state:** `python/genvarloader/_dataset/_query.py`
+- unspliced: `_getitem_unspliced` computes `to_rc = view.full_regions[r_idx, 3] == -1` and does
+  `recon = tuple(reverse_complement_ragged(r, to_rc) for r in recon)` (~line 188–190).
+- spliced: `_getitem_spliced` builds a **permuted per-element** mask `to_rc_per_elem` via
+  `plan.permutation` (the spliced kernel writes pre-spliced bytes in permuted order) and applies the same
+  call (~line 259–280).
+- `reverse_complement_ragged` (~line 352–410) dispatches by output kind.
+
+**RC semantics per output kind (the contract to reproduce in-kernel):**
+
+| Output kind | Python today | In-kernel behavior |
+|---|---|---|
+| haplotypes `_Flat` (S1) | `reverse_masked(to_rc, comp=_COMP)` | reverse bytes **and** complement |
+| reference `_Flat` (S1) | same | reverse + complement |
+| annotated `_FlatAnnotatedHaps` | `reverse_masked(to_rc, _COMP)` | reverse+complement bytes **and reverse** the parallel `var_idxs`/`ref_coords` arrays (no complement on those — order only) |
+| tracks `_Flat` (f32) | `reverse_masked(to_rc, comp=None)` | **reverse only**, no complement |
+| variants `RaggedVariants` | `rc_(to_rc)` | reverse allele order within each row **and** complement allele bytes (ragged) |
+| variant-windows | no-op (returns unchanged) | **skip** — reference-oriented |
+| intervals | no-op | **skip** |
+
+`_COMP` is the complement LUT (find it in `_query.py` / seqpro). Confirm exact mapping (incl. `N`,
+IUPAC, lowercase if any) and reproduce it in Rust.
+
+**Kernels to thread a per-query `to_rc: &[bool]` through** (`src/ffi/mod.rs`):
+- `reconstruct_haplotypes_fused` (`:393`) — haplotypes
+- `reconstruct_annotated_haplotypes_fused` (`:604`) — bytes + parallel arrays
+- `reconstruct_haplotypes_spliced_fused` (`:521`) — **the hard one**, see below
+- `intervals_and_realign_track_fused` (`:848`) — tracks (reverse only)
+- `get_reference` (`:728`) — reference
+- the variants allele-gather path (`gather_alleles` in `src/variants/`) — `RaggedVariants` RC
+
+**Approach:** each kernel takes the per-query mask; when `to_rc[query]` is set, write that query's output
+slice **back-to-front** with complemented bytes (seqs) or plain reversed values (tracks). For annotated,
+reverse the parallel `var_idxs`/`ref_coords` slices in lockstep. Do the RC as the kernel writes (or as a
+final in-place pass over each query's just-written slice — simpler to get byte-identical first, optimize
+second). Mind the interaction with **insertion-fill** and **trailing-fill**: RC must apply to the final
+post-fill bytes (same as today, where RC runs after reconstruction completes).
+
+**The splice sub-case:** `reconstruct_haplotypes_spliced_fused` writes pre-spliced bytes in
+**permuted** order (`plan.permutation`), and today RC is applied per spliced **element** with
+`to_rc_per_elem`. In-kernel, pass the already-permuted per-element `to_rc` and reverse-complement each
+spliced element's byte range as it is finalized. Verify the element boundaries you reverse match
+`plan.group_offsets`. This is the part most likely to need careful TDD — start from the existing spliced
+parity fixtures and add strand=-1 coverage.
+
+**Delete after parity holds:** the `reverse_complement_ragged` calls in `_getitem_unspliced` /
+`_getitem_spliced`, the function itself, and the now-dead `to_rc` plumbing in `_query.py`. Confirm no other
+caller (`grep -rn reverse_complement_ragged python/`).
+
+**Parity:** byte-identical vs the current post-pass. The default parity fixtures use `max_jitter=0` and may
+be strand-agnostic — **add strand=-1 datasets** (mix of + and − regions) to the dataset parity backstop
+for every output kind incl. annotated and spliced. Gate both backends. This is the workstream where a
+vacuous pass is easiest, so assert the RC actually fires (regions with strand −1 produce RC'd bytes ≠ the
++ strand).
+
+**Perf gate:** re-measure haplotypes/variants/tracks via `test_e2e`; expect the RC self-time gone and the
+ratios up. Record in the roadmap.
+
+**Start your session here:**
+1. Branch `opt/target-6-kernel-rc` off `zero-copy-scale-safe-readpath`.
+2. Read `_query.py:152-410` (both getitem paths + `reverse_complement_ragged` + the `_COMP` LUT), then the
+   six kernels in `src/ffi/mod.rs` and their cores.
+3. TDD order: reference (simplest, no fill) → haplotypes → tracks (reverse-only) → variants → annotated →
+   **splice last**. Land each kind's in-kernel RC behind parity before deleting its post-pass branch.
+4. Gate: `cargo-test` + `pytest tests/parity -q` (both backends, with new strand=-1 fixtures) + full tree.
+
+---
+
+## Target 7 — variant-windows assembly in one Rust call
+
+**Goal:** kill the per-batch object churn on the `variant-windows` (and `variants`) flat-output path by
+assembling the token/window buffers in **one Rust call returning flat arrays**, eliminating the per-batch
+Python object graph. (This is the larger of the three; it effectively starts the windows half of the
+deferred single-big-kernel rewrite.)
+
+**Evidence (`perf` flat self-time, variant-windows):** no dominant Rust kernel — the cost is interpreter +
+allocator: `_PyEval_EvalFrameDefault` ~8.5%, GC (`gc_collect_main` + `deduce_unreachable` +
+`visit_reachable` + `dict_traverse`) **~14% combined**, dict/attr lookups, dynamic-symbol lookup
+(ctypes/cffi binding) ~2.3%. The flat-windows assembly allocates many small objects per batch
+(`_FlatWindow` / `_FlatVariants` / `_FlatAlleles` / scalar-field dataclasses).
+
+**Current state:** trace `profile.py --mode variant-windows` and `--mode variants` into
+`python/genvarloader/_dataset/_flat_variants.py` (`_FlatWindow` `:189`, `_FlatVariantWindows` `:270`,
+`_FlatVariants` `:344`) and `_flat_flanks.py` (`_make_window` / ref+alt window builders `:116–220`). These
+rebuild dicts of wrapper dataclasses, gather/fill via the `*_i32`/`*_f32` rust cores, and re-wrap, **every
+batch**. The Phase-2 rust gather/fill kernels already exist (`src/variants/`,
+`gather_rows`/`gather_alleles`/`compact_keep`/`fill_empty_*`) — the win here is collapsing the
+**orchestration** that allocates Python objects around them.
+
+**Approach:** add one (or a few) Rust pyfunction(s) in `src/ffi/mod.rs` that take the raw inputs the
+windows path needs (gathered v_idxs / alleles / scalar fields + flank/tokenize/LUT params) and return the
+final flat `(data, offsets)` token buffers directly — so the Python side constructs **one** `_Flat`/result
+wrapper instead of a graph of `_FlatWindow`/`_FlatAlleles`. Reuse the existing `src/variants/` cores
+internally. Inventory exactly which fields/windows the consumer actually reads downstream (in
+`_query.py` reshape/pad and the flat-output assembly) so the Rust call returns precisely those, no more.
+
+**Files:** new code in `src/variants/` + `src/ffi/mod.rs`; rewrite the assembly in
+`_dataset/_flat_variants.py` / `_flat_flanks.py` to call it; keep the public output type
+(`_FlatVariants` / `_FlatVariantWindows`) identical from the caller's view.
+
+**Parity:** byte-identical token buffers + offsets vs the current Python assembly, for both `variants` and
+`variant-windows`, incl. the flank-tokenize ride-along (`flank_tokens`), the empty-group fill
+(`fill_empty_groups` / `DummyVariant`), and the unknown-token path. Note `test_e2e_variants` is a
+**pre-existing xfail** (`_FlatVariants.to_fixed` missing) — don't conflate it with a regression; check it
+xfails identically at the base before you start.
+
+**Perf gate:** re-measure `variant-windows` and `variants` via `test_e2e`; expect the GC/eval self-time to
+drop. Record in the roadmap.
+
+**Start your session here:**
+1. Branch `opt/target-7-windows-rust-assembly` off `zero-copy-scale-safe-readpath`.
+2. `perf record` the `variant-windows` mode and read the assembly in `_flat_variants.py` / `_flat_flanks.py`
+   top-to-bottom; map every per-batch allocation.
+3. TDD: pin the current flat-buffer output (data+offsets) for `variants` and `variant-windows` as the
+   oracle, then build the Rust call under it.
+4. Gate: `cargo-test` + `pytest tests/parity tests/unit -q` (both backends) + `variant-windows` re-measure.
+
+---
+
+## Rayon — batch parallelism (BLOCKED: start only after 5/6/7 are merged)
+
+**Goal:** parallelize the fused kernels' per-query loops with rayon, now that single-thread rust is ahead.
+
+**Why blocked:** the roadmap is explicit — "Only after (5)+(6) put rust ahead single-threaded do we add
+rayon batch parallelism — parallelizing first would just scale the numpy RC pass and the ndarray slicing."
+Do not start until target 5, 6, and 7 are on the base branch.
+
+**Approach:** the batch drivers are currently serial by deliberate design — per-`(query, hap)` output
+slices are **disjoint**, which is exactly why they're embarrassingly parallel and why the serial result
+already equals numba's `prange`. Convert the per-query loops in the fused kernels
+(`reconstruct_haplotypes_fused`, `intervals_and_realign_track_fused`, the annotated/spliced variants) to
+`rayon::par_iter` (or `par_chunks` over disjoint output slices — use `split_at_mut` / `ndarray`
+`axis_chunks_iter_mut` to hand each thread a non-overlapping `&mut` slice). Expose a thread-count control
+(env var or arg) so benchmarks can pin it; default to rayon's global pool.
+
+**Parity:** **trivial** — disjoint slices, deterministic per-slice work, so output is identical regardless
+of thread count. Run the existing parity suite at >1 thread.
+
+**Perf gate:** throughput scaling vs thread count on `test_e2e`. **Re-baseline the whole read path here**
+(the roadmap's Phase 5 checkpoint). Note the `NUMBA_NUM_THREADS=1` caveat — for an honest comparison, set
+numba threads to match, or report both single- and multi-thread numbers explicitly.
+
+**Start your session here (once unblocked):**
+1. Branch off the merged base (with 5/6/7 in).
+2. Confirm each fused kernel's per-query output slices are provably disjoint before parallelizing.
+3. Gate: `cargo-test` + full parity suite at N>1 threads + a thread-scaling sweep recorded in the roadmap.
+
+---
+
+## Pointer table
+
+| Need | Where |
+|---|---|
+| Roadmap + targets 5/6/7 detail | `docs/roadmaps/rust-migration.md` (round-2 optimization block) |
+| Fused FFI kernels | `src/ffi/mod.rs` (`:66`, `:393`, `:521`, `:604`, `:728`, `:848`) |
+| tracks slice kernel | `src/intervals.rs` |
+| RC post-pass to delete | `python/genvarloader/_dataset/_query.py` (`reverse_complement_ragged`, getitem paths) |
+| windows assembly | `python/genvarloader/_dataset/_flat_variants.py`, `_flat_flanks.py` |
+| Phase-2 variant cores (reuse) | `src/variants/` |
+| Dispatch registry | `python/genvarloader/_dispatch.py` (`GVL_BACKEND`) |
+| Parity harness | `tests/parity/` |
+| Perf benchmark | `tests/benchmarks/test_e2e.py`, `tests/benchmarks/profiling/profile.py` |
+| Scale guard (don't regress) | `tests/integration/test_scale_guard.py` |
diff --git a/docs/handoffs/2026-06-27-rust-migration-w5.md b/docs/handoffs/2026-06-27-rust-migration-w5.md
new file mode 100644
index 00000000..adf17a47
--- /dev/null
+++ b/docs/handoffs/2026-06-27-rust-migration-w5.md
@@ -0,0 +1,78 @@
+# Handoff — Rust Migration Phase 5 W5 (consolidation PR)
+
+**Written:** 2026-06-27, mid-execution. **Branch:** `phase-5-w5` (off `rust-migration @ efb87ea`, in the MAIN repo, not a worktree).
+**Current point:** Stage C (rayon) task **C1 just landed (`4cde9b9`)**; controller-verify + review of C1 is the immediate next step.
+
+## What W5 is
+
+The consolidation PR of the rust migration. One PR (`phase-5-w5` → `rust-migration`), three staged commit-boundaries:
+- **Stage A — snapshot** (DONE): froze the numba-oracle parity suites to committed `.npz` goldens; rewrote all parity tests to assert `rust == golden` (importing rust callables directly, never `_dispatch`).
+- **Stage B — delete numba** (DONE): removed dispatch layer, backend conditionals, all `@njit`, deps.
+- **Stage C — rayon** (IN PROGRESS): add `parallel:bool` batch parallelism to read kernels, gated `serial==parallel==golden`.
+
+## The 3 user decisions (binding)
+
+1. Goldens = **frozen seeded-sample `.npz`** (deterministic hypothesis draw, frozen inputs+outputs).
+2. **One PR, staged commits** (not split PRs).
+3. Rayon gating = **`parallel:bool` + `RAYON_NUM_THREADS`**, copying the `get_reference` idiom (`src/reference/mod.rs:82-106`: `split_at_mut` chain → `Vec<&mut [_]>` → `into_par_iter`). Serial branch is the byte-identity reference. **Never put raw `*mut` in a rayon closure (not `Send`) — carve `&mut [_]` slices.**
+4. (2026-06-27) **seqpro transitively imports numba** → B4 guard RELAXED to "genvarloader's OWN code is numba-free" (source scan); a seqpro follow-up tracks the eager import.
+
+## How to work this (subagent-driven-development)
+
+- **The authoritative records:** the plan `docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md` and the durable ledger `.superpowers/sdd/progress.md` (read this FIRST on resume — it has the blow-by-blow, every commit, every Minor finding, all pending items). Task briefs/reports live in `.superpowers/sdd/task-<ID>-{brief,report}.md`.
+- **Per task:** extract brief → dispatch a **Sonnet** implementer (global CLAUDE.md mandates Sonnet for impl) → generate review package → dispatch a **Sonnet** task-reviewer (spec + quality verdicts) → fix Critical/Important → mark complete in the ledger.
+- **Brief extraction** (the SDD `task-brief` script only matches numeric `Task N`; our IDs are A1/B1/C1):
+  ```bash
+  PLAN=docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md
+  DIR=.superpowers/sdd
+  awk '/^### Task C2:/ {grab=1} grab && /^### Task C3:/ {exit} grab {print}' "$PLAN" > "$DIR/task-C2-brief.md"
+  ```
+- **Review package:** `/carter/users/dlaub/.claude/plugins/cache/claude-plugins-official/superpowers/6.0.3/skills/subagent-driven-development/scripts/review-package BASE HEAD` (BASE = commit before the implementer ran; current next BASE = `4cde9b9`).
+
+## ⚠️ THE LOAD-BEARING LESSON
+
+**Subagent self-reported test/env results are UNRELIABLE — the controller MUST re-run every load-bearing gate.** This stage, 3 of 4 B-stage reports didn't hold up: B2 claimed "686 passed" hiding a real failure; B3 claimed "clean import passed" (false — seqpro pulls numba); B4 claimed "687 passed" but had silently BROKEN the env (removed conda numba pin → broken PyPI llvmlite → `import genvarloader` failed at collection). Each was caught by the controller re-running the gate. **Keep doing this for C1/C2/C3.** Gates take ~4 min (run `run_in_background: true`; foreground sleeps are blocked).
+
+Standing gate command (after any `src/` edit, MUST `maturin develop --release` first or pytest imports the stale `.so`):
+```bash
+pixi run -e dev maturin develop --release && \
+pixi run -e dev pytest tests/parity tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp
+```
+Healthy full-tree baseline: **687 passed, 35 skipped, 2 xfailed** (the +1 over 686 is the B4 import-guard). All pytest needs `--basetemp=$(pwd)/.pytest_tmp` (os.link Errno 18 on Carter).
+
+## Commit log (phase-5-w5)
+
+A: `494ede6`(A1) `058b7a1`(A2) `e31075c`(A3) `b8f52c2`(A4) `2513aa2`(A5) + plan amends `6033984`/`f7b3c72`/`29a2a4e`.
+B: `2ee677a`+`8133cd2`(B1) · `f85ae47`+`5b386e5`(B2) · `fb4b1a9`+`70a3f8a`+`06c0963`(B3) · `98f3ee5`+`dd7c2ef`(B4).
+C: `4cde9b9`(C1 — rayon for `reconstruct_haplotypes_from_sparse`).
+Plan itself committed at `f048b53`.
+
+## RESUME MAP (do these in order)
+
+1. **Verify + review C1 (`4cde9b9`)** — controller gate was launched at handoff time (bg task `broitb5yt`, output under the session tasks dir); confirm it's `687 passed / 35 skipped / 2 xfailed`. Then review: `review-package dd7c2ef 4cde9b9`, dispatch a Sonnet reviewer focused on: the 3-buffer `split_at_mut` chunk-carve correctness (Optional annot buffers — the `match` on the 4 presence combos), no raw `*mut` in the rayon closure, the `parallel:bool` threaded through all 5 FFI entries (`src/ffi/mod.rs:481/546/689/782/891`) + 5 Python call sites (`_genotypes.py` + 4 in `_haps.py`), and that `_golden.RUST_KERNELS["reconstruct_haplotypes_from_sparse"]`'s `parallel`-default shim didn't weaken the golden replay. C1 added `tests/parity/test_rayon_equivalence.py`.
+2. **C2** — parallelize the track kernels: `shift_and_realign_tracks_sparse` (`src/tracks/mod.rs:470`, outer-query loop) and `tracks_to_intervals` (two-pass @569/@615 — parallelize each pass, keep the cumsum serial). Also thread `parallel` through `intervals_and_realign_track_fused`. Extend `test_rayon_equivalence.py`.
+3. **C3** — parallelize `get_diffs_sparse` (`src/genotypes/mod.rs:27`) + `intervals_to_tracks` (`src/intervals.rs:45`). (`get_reference` is ALREADY parallel — no work.) Extend the equivalence test.
+4. **C4** — finalize `docs/roadmaps/rust-migration.md` (the W5 entry exists ~line 799 but is partial; correct it to reflect snapshot+delete+rayon, Phase 5 stays 🚧 — W6/PR6 is measure-and-merge); run the full Stage-C gate (full tree + `cargo test --release` + ruff + `cargo clippy` + typecheck + serial==parallel across ALL kernels).
+5. **Final whole-branch review** — dispatch the most capable model on `review-package $(git merge-base rust-migration HEAD) HEAD` (merge-base = `efb87ea`). Triage the Minor findings list in the ledger.
+6. **superpowers:finishing-a-development-branch** — verify tests, then offer the 4 options. Land into `rust-migration` (NO squash, per the no-squash-merges memory).
+
+## PENDING / must-do at finishing
+
+- **File the seqpro issue** (user authorized): seqpro 0.20.0 eagerly imports numba (`seqpro/_numba.py`, `transforms/tmm.py`) at `import seqpro` → blocks the W6 ~3.2 GB JIT-RSS drop. **`mcvickerlab/seqpro` 404s — ASK the user for the repo** (likely `d-laub/seqpro` or personal). The roadmap currently says "filed as a seqpro follow-up" — correct that wording once actually filed.
+- **Optional cleanup (final-review call):** B3 kept *plain-Python shadows* of rust kernels (decorators removed, bodies kept) because `tests/unit/` references them: `reconstruct_haplotype_from_sparse`, `_get_reference_row/_ser/_par`, `_xorshift64`/`_hash4`, `shift_and_realign_track(s)_sparse`, `_gather_v_idxs_ss_numba` (misleading `_numba` suffix). These + their unit tests are redundant with rust (validated by parity goldens) — candidate for deletion, but its own scoped decision.
+- **Bench conftest staleness** (non-gated): B2 removed `reconstruct_haplotypes_from_sparse` from `_haps`; `tests/benchmarks/conftest.py:50` still targets `(_haps, "reconstruct_haplotypes_from_sparse")` — fix the capture target (now the fused kernel / `_genotypes`). Benchmarks are opt-in, don't block the gate.
+
+## Plan amendments made during execution (all committed, in the plan file)
+
+- B3 Step 2b: **replace (not delete) 4 numba dtype-fallbacks with numpy** — `_gather_rows`/`_compact_keep`/`_fill_empty_scalar`/`_fill_empty_fixed` in `_flat_variants.py` fall back to numba for arbitrary dtypes (custom VCF FORMAT fields, **issue #231**); these are LIVE production code. Done in B3; gated by the 4 dtype-regression tests in `test_flat_variants_parity.py`.
+- B1 Step 2b: rewrote `_golden.py::make_kernel_spy` to monkeypatch the direct rust symbol (registry mutation went inert post-dispatch-deletion).
+- B1 Step 2: also deleted dead `tests/parity/_harness.py` + `test_harness_tuple.py` (superseded by `_golden.py`).
+- B4: relaxed import-guard to own-code source scan (seqpro decision above).
+
+## Key locations
+
+- Plan: `docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md`
+- Ledger (READ FIRST): `.superpowers/sdd/progress.md`
+- Goldens: `tests/parity/golden/*.npz`; infra `tests/parity/_golden.py`; regen `tests/parity/generate_goldens.py` (+ `GVL_GEN_GOLDENS=1 pytest tests/parity/test_gen_dataset_goldens.py` for dataset goldens).
+- Rust read kernels: `src/reconstruct/mod.rs`, `src/tracks/mod.rs`, `src/genotypes/mod.rs`, `src/intervals.rs`, `src/reference/mod.rs` (rayon reference idiom). FFI: `src/ffi/mod.rs`.
+- Master Phase-5 plan (PR5/PR6 scope): `docs/superpowers/plans/2026-06-26-rust-migration-phase-5.md`.
diff --git a/docs/roadmaps/phase-3-getitem-glue-audit.md b/docs/roadmaps/phase-3-getitem-glue-audit.md
new file mode 100644
index 00000000..c16e573b
--- /dev/null
+++ b/docs/roadmaps/phase-3-getitem-glue-audit.md
@@ -0,0 +1,435 @@
+# Phase 3 `__getitem__` Glue Audit — Haps + Tracks Fusion Seams
+
+**Purpose:** Task 12 of Phase 3 Rust migration (sub-unit 3d).  
+Identifies every `np.ascontiguousarray` / boundary crossing / intermediate numpy
+allocation on the two live read paths and proposes the minimal single-FFI-entry
+fusion seams for Tasks 13 (fused haps) and 14 (fused tracks).
+
+---
+
+## 1. Haplotypes Path — Coercion / Crossing Inventory
+
+Call chain:  
+`Haps.__call__` → `Haps.get_haps_and_shifts` → `Haps._prepare_request` →  
+`_haplotype_ilens` → `get_diffs_sparse` → (FFI #1)  
+then back in `get_haps_and_shifts` → `_reconstruct_haplotypes` →  
+`reconstruct_haplotypes_from_sparse` → (FFI #2)
+
+### `_haplotype_ilens` / `_prepare_request`
+(in `python/genvarloader/_dataset/_haps.py`)
+
+| # | File:Line | Operation | Arrays coerced |
+|---|-----------|-----------|----------------|
+| H1 | `_haps.py:694` | `.astype(np.int32, copy=False)` on `regions` | `regions (b,3)` |
+
+Note: `geno_offset_idx` is freshly computed (already `np.intp`) via
+`np.ravel_multi_index` at `_haps.py:713–715`.  No allocation worth flagging —
+it is required output.  `out_offsets = lengths_to_offsets(out_lengths)` at
+`_haps.py:687` is also a required allocation (sizes the output buffer).
+
+### `get_diffs_sparse` wrapper — FFI crossing #1
+(in `python/genvarloader/_dataset/_genotypes.py`)
+
+| # | File:Line | Operation | Arrays coerced |
+|---|-----------|-----------|----------------|
+| H2 | `_genotypes.py:149` | `np.ascontiguousarray(geno_offset_idx, np.int64)` | `(b,p)` |
+| H3 | `_genotypes.py:150` | `np.ascontiguousarray(geno_v_idxs, np.int32)` | `(r*s*p*v)` — the full memmap |
+| H4 | `_genotypes.py:151` | `_as_starts_stops(geno_offsets)` → `np.ascontiguousarray(np.stack([o[:-1], o[1:]]), np.int64)` | `(2, r*s*p)` — 2× alloc |
+| H5 | `_genotypes.py:152` | `np.ascontiguousarray(ilens, np.int32)` | `(tot_v)` |
+| H6 | `_genotypes.py:153` | `np.ascontiguousarray(keep, np.bool_)` (optional) | `(b*p*v)` |
+| H7 | `_genotypes.py:154` | `np.ascontiguousarray(keep_offsets, np.int64)` (optional) | `(b*p+1)` |
+| H8 | `_genotypes.py:155–157` | 3× `np.ascontiguousarray` for `q_starts`, `q_ends`, `v_starts` | `(b)`, `(b)`, `(tot_v)` |
+
+**FFI crossing:** one Python→Rust boundary crossing into `_get_diffs_sparse_rust`.
+
+Returns `diffs` shape `(b*p,)` — reshaped to `(b,p)` at `_haps.py:488` (view, no copy).
+
+### `reconstruct_haplotypes_from_sparse` wrapper — FFI crossing #2
+(in `python/genvarloader/_dataset/_genotypes.py`)
+
+| # | File:Line | Operation | Arrays coerced |
+|---|-----------|-----------|----------------|
+| H9  | `_genotypes.py:316` | `np.ascontiguousarray(out_offsets, np.int64)` | `(b*p+1)` |
+| H10 | `_genotypes.py:317` | `np.ascontiguousarray(regions, np.int32)` | `(b,3)` — already int32 from H1, still runs |
+| H11 | `_genotypes.py:318` | `np.ascontiguousarray(shifts, np.int32)` | `(b,p)` |
+| H12 | `_genotypes.py:319` | `np.ascontiguousarray(geno_offset_idx, np.int64)` | `(b,p)` — same array as H2 |
+| H13 | `_genotypes.py:320` | `_as_starts_stops(geno_offsets)` again | `(2, r*s*p)` — **duplicate** of H4 |
+| H14 | `_genotypes.py:321` | `np.ascontiguousarray(geno_v_idxs, np.int32)` | **duplicate** of H3 |
+| H15 | `_genotypes.py:322` | `np.ascontiguousarray(v_starts, np.int32)` | **duplicate** of H8 |
+| H16 | `_genotypes.py:323` | `np.ascontiguousarray(ilens, np.int32)` | **duplicate** of H5 |
+| H17 | `_genotypes.py:324` | `np.ascontiguousarray(alt_alleles, np.uint8)` | `(tot_alt_bytes)` — memmap view |
+| H18 | `_genotypes.py:325` | `np.ascontiguousarray(alt_offsets, np.int64)` | `(tot_v+1)` |
+| H19 | `_genotypes.py:326` | `np.ascontiguousarray(ref, np.uint8)` | whole contig bytes — **large** |
+| H20 | `_genotypes.py:327` | `np.ascontiguousarray(ref_offsets, np.int64)` | `(n_contigs+1)` |
+| H21 | `_genotypes.py:329–330` | `None if keep is None else np.ascontiguousarray(keep, np.bool_)` | duplicate of H6 |
+| H22 | `_genotypes.py:330` | same for `keep_offsets` | duplicate of H7 |
+
+**Pre-kernel intermediate allocation:**  
+`_haps.py:765`: `out_data = np.empty(req.out_offsets[-1], np.uint8)` — the output buffer.  
+`_haps.py:766`: `out_offsets = np.asarray(req.out_offsets, np.int64)` — another dtype cast/view.
+
+**FFI crossing:** one Python→Rust boundary crossing into `_reconstruct_haplotypes_from_sparse_rust`.
+
+**Annotated haps path** adds two more pre-kernel allocations:  
+`_haps.py:844`: `annot_v_data = np.empty(req.out_offsets[-1], V_IDX_TYPE)`  
+`_haps.py:845`: `annot_pos_data = np.empty(req.out_offsets[-1], np.int32)`  
+These are required outputs, not avoidable coercions.
+
+### Summary — haplotypes path
+- **2 FFI boundary crossings** (one per kernel)
+- **~22 `np.ascontiguousarray` / `np.asarray` calls**, of which at least 8 are
+  exact duplicates (H12–H16, H21–H22) because both wrapper functions independently
+  normalize the same underlying arrays.
+- **Key structural waste:** `_as_starts_stops(geno_offsets)` allocates a `(2, n)`
+  int64 array twice — once per kernel crossing.  `geno_v_idxs`, `ilens`, `v_starts`,
+  `keep`, `keep_offsets` are all re-coerced at the second crossing even though their
+  dtypes are already correct after the first crossing.
+
+---
+
+## 2. Tracks Path — Coercion / Crossing Inventory
+
+Call chain (HapsTracks mode, RaggedTracks output):  
+`HapsTracks.__call__` → `get_haps_and_shifts` (same as above, 2 FFI crossings)  
+then in the per-track loop:  
+→ `intervals_to_tracks` → (FFI #3 per track)  
+→ `_dispatch_get("shift_and_realign_tracks_sparse")` → (FFI #4 per track)
+
+### Pre-loop allocations
+(in `python/genvarloader/_dataset/_reconstruct.py`)
+
+| # | File:Line | Operation |
+|---|-----------|-----------|
+| T1 | `_reconstruct.py:161` | `out = np.empty(n_tracks * n_per_track, np.float32)` — full fused output buffer |
+| T2 | `_reconstruct.py:192` | `_tracks = np.empty(track_ofsts_per_t[-1], np.float32)` — **per-track intermediate** buffer, allocated inside the loop |
+
+T2 is the key intermediate: it holds one track's reference-coordinate data before
+realignment, then is discarded each iteration.  `n_tracks` loop iterations → `n_tracks`
+temporary allocations + `n_tracks` FFI crossing pairs.
+
+### `intervals_to_tracks` wrapper — FFI crossing #3 (×n_tracks)
+(in `python/genvarloader/_dataset/_intervals.py`)
+
+| # | File:Line | Operation | Arrays coerced |
+|---|-----------|-----------|----------------|
+| T3 | `_intervals.py:110` | `np.ascontiguousarray(offset_idxs, dtype=np.int64)` | `(b)` |
+| T4 | `_intervals.py:111` | `np.ascontiguousarray(starts, dtype=np.int32)` | `(b)` |
+| T5 | `_intervals.py:112` | `np.ascontiguousarray(itv_starts, dtype=np.int32)` | `(n_intervals)` — memmap |
+| T6 | `_intervals.py:113` | `np.ascontiguousarray(itv_ends, dtype=np.int32)` | `(n_intervals)` — memmap |
+| T7 | `_intervals.py:114` | `np.ascontiguousarray(itv_values, dtype=np.float32)` | `(n_intervals)` — memmap |
+| T8 | `_intervals.py:115` | `np.ascontiguousarray(itv_offsets, dtype=np.int64)` | `(n_samples*n_regions+1)` |
+| T9 | `_intervals.py:116` | `np.ascontiguousarray(out_offsets, dtype=np.int64)` | `(b+1)` |
+
+**FFI crossing:** one Python→Rust boundary into `_intervals_to_tracks_rust`.  Writes
+into `_tracks` (the per-track temp buffer).
+
+### `shift_and_realign_tracks_sparse` wrapper — FFI crossing #4 (×n_tracks)
+(in `python/genvarloader/_dataset/_tracks.py`)
+
+| # | File:Line | Operation | Arrays coerced |
+|---|-----------|-----------|----------------|
+| T10 | `_tracks.py:433` | `_as_starts_stops(geno_offsets)` → `np.ascontiguousarray(np.stack(...), np.int64)` | `(2, r*s*p)` — duplicate of H4/H13, **again per track** |
+| T11 | `_tracks.py:436` | `np.asarray(out_offsets, dtype=np.int64)` | `(b*p+1)` |
+| T12 | `_tracks.py:437` | `np.asarray(regions, dtype=np.int32)` | `(b,3)` — already int32 |
+| T13 | `_tracks.py:438` | `np.asarray(shifts, dtype=np.int32)` | `(b,p)` — already int32 |
+| T14 | `_tracks.py:439` | `np.asarray(geno_offset_idx, dtype=np.int64)` | `(b,p)` |
+| T15 | `_tracks.py:440` | `np.asarray(geno_v_idxs, dtype=np.int32)` | `(r*s*p*v)` — full memmap |
+| T16 | `_tracks.py:442` | `np.asarray(v_starts, dtype=np.int32)` | `(tot_v)` |
+| T17 | `_tracks.py:443` | `np.asarray(ilens, dtype=np.int32)` | `(tot_v)` |
+| T18 | `_tracks.py:444` | `np.asarray(tracks, dtype=np.float32)` | `_tracks` intermediate |
+| T19 | `_tracks.py:445` | `np.asarray(track_offsets, dtype=np.int64)` | `(b+1)` |
+| T20 | `_tracks.py:446` | `np.asarray(params, dtype=np.float64)` | per-strategy params |
+| T21 | `_tracks.py:448` | `np.asarray(keep_offsets, dtype=np.int64)` (optional) | `(b*p+1)` |
+
+**FFI crossing:** one Python→Rust boundary into `_shift_and_realign_tracks_sparse_rust`.
+
+### Summary — tracks path (HapsTracks, n_tracks tracks)
+- **2 (haps) + 2×n_tracks (tracks)** FFI boundary crossings total per `__getitem__` call.
+- **~22 (haps) + n_tracks × ~19 (tracks)** `np.ascontiguousarray`/`np.asarray` calls total.
+- **Key structural waste:**
+  - `_as_starts_stops(geno_offsets)` is re-executed **n_tracks+2 times** per call
+    (once per haps kernel, once per track kernel pair). Each call allocates `(2, r*s*p)` int64.
+  - `geno_v_idxs`, `v_starts`, `ilens` (full variant arrays, potentially large) are
+    re-coerced **n_tracks+1 extra times** beyond the first.
+  - `_tracks` intermediate buffer (T2, `np.empty`) is allocated **n_tracks times**;
+    its data crosses the FFI twice (into `intervals_to_tracks` then read back by
+    `shift_and_realign_tracks_sparse`) before being discarded.
+
+---
+
+## 3. Live Profiling
+
+**Status: deferred.**
+
+A profiling harness exists at `tests/benchmarks/profiling/profile.py` targeting
+`tests/benchmarks/data/chr22_geuv.gvl`, and pre-existing speedscope profiles are
+present at `tests/benchmarks/profiling/haps.speedscope.json` and
+`tracks.speedscope.json`.  The chr22_geuv dataset and reference file are present
+under `tests/benchmarks/data/`.
+
+Live `cProfile` was not run during this audit because:
+1. The static trace is complete and sufficient for identifying the fusion seams.
+2. The pre-existing py-spy/memray profiles (generated before the Rust kernels were
+   fully ported) reflect the old numba hot path and would need to be re-run with
+   `GVL_BACKEND=rust` to measure the current Python glue share.
+3. Running the dataset under `cProfile` (not py-spy) during a non-interactive session
+   risks JIT warm-up noise and requires the pixi dev env.
+
+**Recommendation for Task 13/14:** after implementing the fused entries, re-run
+`pixi run -e dev profile-haps` and `profile-tracks` (py-spy) with `GVL_BACKEND=rust`
+and compare the new profiles to confirm coercion overhead is gone.  The Phase 0 claim
+(~62% glue) should be re-verified against the current Rust-kernel baseline.
+
+---
+
+## 4. Proposed Fused Entry Signatures
+
+### 4a. Fused Haplotypes Entry (Task 13)
+
+**Goal:** collapse FFI crossings H1 (get_diffs_sparse) and H2
+(reconstruct_haplotypes_from_sparse) into a single Rust `#[pyfunction]` that:
+1. Computes per-haplotype length diffs (`get_diffs_sparse` logic).
+2. Allocates the output buffer and offset array in Rust.
+3. Runs `reconstruct_haplotypes_from_sparse` logic.
+4. Returns `(out_data: Array1<u8>, out_offsets: Array1<i64>)` — the raw ragged buffers.
+
+The caller (Python `_reconstruct_haplotypes`) can then wrap them into a `_Flat`/`Ragged`
+with zero further coercions.
+
+```rust
+/// Fused: compute diffs → out_offsets → reconstruct haplotypes.
+/// Returns (out_data, out_offsets) as owned 1-D arrays.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotypes_fused<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,          // (b, 3)
+    geno_offset_idx: PyReadonlyArray2<i64>,  // (b, p)
+    geno_offsets: PyReadonlyArray2<i64>,     // (2, r*s*p)
+    geno_v_idxs: PyReadonlyArray1<i32>,      // (r*s*p*v) — full sparse store
+    v_starts: PyReadonlyArray1<i32>,          // (tot_v)
+    ilens: PyReadonlyArray1<i32>,             // (tot_v)
+    alt_alleles: PyReadonlyArray1<u8>,        // (tot_alt_bytes)
+    alt_offsets: PyReadonlyArray1<i64>,       // (tot_v + 1)
+    ref_: PyReadonlyArray1<u8>,               // whole contig bytes
+    ref_offsets: PyReadonlyArray1<i64>,       // (n_contigs + 1)
+    pad_char: u8,
+    output_length: i64,                       // -1 = ragged (hap length), else fixed
+    keep: Option<PyReadonlyArray1<bool>>,     // (b*p*v) optional exonic mask
+    keep_offsets: Option<PyReadonlyArray1<i64>>,  // (b*p + 1)
+    // Optional annotation output buffers (annotated-haps mode).
+    // When provided, filled in-place (caller pre-allocates based on returned out_offsets).
+    // Task 13 may ship annotation support as a follow-on; initial version returns None.
+    mut annot_v_idxs: Option<PyReadwriteArray1<i32>>,
+    mut annot_ref_pos: Option<PyReadwriteArray1<i32>>,
+) -> Bound<'py, PyTuple>   // (out_data: Array1<u8>, out_offsets: Array1<i64>)
+```
+
+**Rationale:**
+- All arrays that were coerced twice (H2–H8 and H12–H22) are passed once.
+- `_as_starts_stops` is done once in Rust (trivial row split of the `(2,n)` matrix).
+- The Rust side owns the output buffer allocation — Python never calls `np.empty`.
+- `output_length = -1` signals ragged mode; positive integer signals fixed-length
+  (current Python: `np.full(..., output_length, np.int32)` is replaced by a Rust-side
+  broadcast).
+- Annotation buffers: for `_reconstruct_annotated_haplotypes`, the caller needs
+  `out_offsets` before allocating them.  Two options: (a) two-call API (fused diffs +
+  offsets in one call, then annotated reconstruct), or (b) pass pre-allocated buffers
+  like the current Rust FFI does.  Option (b) is simpler and avoids a second crossing;
+  the caller reads `out_offsets[-1]` from the first return to size the buffers if
+  annotation is needed.
+
+**Python-side after fusion (sketch):**
+```python
+out_data, out_offsets = gvl_rust.reconstruct_haplotypes_fused(
+    regions=req.regions,
+    geno_offset_idx=req.geno_offset_idx,
+    geno_offsets=self.genotypes.offsets,   # already (2,n) or 1-D; Rust normalizes
+    geno_v_idxs=self.genotypes.data,
+    v_starts=self.variants.start,
+    ilens=self.variants.ilen,
+    alt_alleles=self.variants.alt.data.view(np.uint8),
+    alt_offsets=self.variants.alt.offsets,
+    ref_=self.reference.reference,
+    ref_offsets=self.reference.offsets,
+    pad_char=self.reference.pad_char,
+    output_length=output_length if isinstance(output_length, int) else -1,
+    keep=req.keep,
+    keep_offsets=req.keep_offsets,
+    annot_v_idxs=None,
+    annot_ref_pos=None,
+)
+# out_data, out_offsets are fresh owned arrays — no further coercion needed
+return _Flat.from_offsets(out_data, shape, out_offsets).view("S1")
+```
+
+**Risk — annotation path:** `_reconstruct_annotated_haplotypes` currently takes
+in-place mutable annotation buffers whose sizes depend on `out_offsets[-1]`.  If
+the fused entry returns `out_offsets` first and allocates buffers in a second step,
+the annotation path gets a second Python call but still only ONE FFI crossing
+(diffs+reconstruction in one shot).  Document this trade-off clearly in Task 13.
+
+---
+
+### 4b. Fused Tracks Entry (Task 14)
+
+**Goal:** collapse FFI crossings T3+T4 (`intervals_to_tracks`) and the per-track
+`shift_and_realign_tracks_sparse` crossing into a **single Rust entry per track** that:
+1. Converts intervals → reference-coordinate tracks (inline, no intermediate Python buffer).
+2. Shifts and realigns into the caller's pre-allocated `out` slice.
+
+The outer Python loop over `n_tracks` stays — it is bounded by track count (small,
+typically 1–10), not batch size — but each iteration drops from 2 FFI crossings + 1
+intermediate allocation to 1 FFI crossing + 0 intermediate allocation.
+
+```rust
+/// Fused per-track: intervals → reference tracks → shift/realign into out.
+/// Replaces the pair (intervals_to_tracks, shift_and_realign_tracks_sparse).
+/// `out` is the per-track slice of the caller's pre-allocated output buffer.
+/// `itv_offsets` is 1-D (n_samples*n_regions + 1) int64.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn intervals_and_realign_track_fused(
+    mut out: PyReadwriteArray1<f32>,          // (b*p*l) — caller's pre-alloc slice
+    out_offsets: PyReadonlyArray1<i64>,       // (b*p + 1)
+    regions: PyReadonlyArray2<i32>,           // (b, 3)
+    shifts: PyReadonlyArray2<i32>,            // (b, p)
+    geno_offset_idx: PyReadonlyArray2<i64>,   // (b, p)
+    geno_v_idxs: PyReadonlyArray1<i32>,       // (r*s*p*v)
+    geno_offsets: PyReadonlyArray2<i64>,      // (2, r*s*p)
+    v_starts: PyReadonlyArray1<i32>,           // (tot_v)
+    ilens: PyReadonlyArray1<i32>,              // (tot_v)
+    // intervals (reference-coordinate, for this track)
+    offset_idxs: PyReadonlyArray1<i64>,       // (b) — per-query index into itv_offsets
+    itv_starts: PyReadonlyArray1<i32>,         // (n_intervals)
+    itv_ends: PyReadonlyArray1<i32>,           // (n_intervals)
+    itv_values: PyReadonlyArray1<f32>,         // (n_intervals)
+    itv_offsets: PyReadonlyArray1<i64>,        // (n_samples*n_regions + 1)
+    // insertion-fill strategy
+    params: PyReadonlyArray1<f64>,
+    strategy_id: i64,
+    base_seed: u64,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+) -> PyResult<()>
+```
+
+**Rust internals:** allocate a stack/thread-local scratch buffer of size
+`max(track_lengths_for_batch)` instead of calling back to Python for the
+intermediate `_tracks` buffer.  The `intervals_to_tracks` logic fills the scratch;
+`shift_and_realign_track_sparse` reads from it and writes `out`.
+
+**Rationale:**
+- Removes the per-track `_tracks = np.empty(...)` intermediate allocation (T2).
+- Removes 7 `np.ascontiguousarray` calls per track (T3–T9) for the
+  `intervals_to_tracks` wrapper.
+- Removes ~12 `np.asarray` calls per track (T10–T21) for the
+  `shift_and_realign_tracks_sparse` wrapper.
+- `_as_starts_stops(geno_offsets)` is done once in Rust per call, not per track.
+- Net: from `2×n_tracks + 2` crossings to `n_tracks + 2` crossings per `__getitem__`.
+
+**Python-side after fusion (sketch):**
+```python
+for track_ofst, (name, tracktype) in enumerate(self.tracks.active_tracks.items()):
+    intervals = self.tracks.intervals[name]
+    o_idx = idx if tracktype is TrackType.SAMPLE else r_idx
+    _out = out[track_ofst * n_per_track : (track_ofst + 1) * n_per_track]
+    gvl_rust.intervals_and_realign_track_fused(
+        out=_out,
+        out_offsets=out_ofsts_per_t,
+        regions=regions,
+        shifts=shifts,
+        geno_offset_idx=geno_idx,
+        geno_v_idxs=self.haps.genotypes.data,
+        geno_offsets=self.haps.genotypes.offsets,
+        v_starts=self.haps.variants.start,
+        ilens=self.haps.variants.ilen,
+        offset_idxs=o_idx,
+        itv_starts=intervals.starts.data,
+        itv_ends=intervals.ends.data,
+        itv_values=intervals.values.data,
+        itv_offsets=intervals.starts.offsets,
+        params=strat_params[track_ofst],
+        strategy_id=int(strat_ids[track_ofst]),
+        base_seed=base_seed,
+        keep=keep,
+        keep_offsets=keep_offsets,
+    )
+```
+No `np.ascontiguousarray` / `np.empty` inside the loop.
+
+---
+
+## 5. Risks and Notes
+
+### 5a. Annotation buffers (haps path)
+
+`_reconstruct_annotated_haplotypes` pre-allocates `annot_v_data` and
+`annot_pos_data` at `_haps.py:844–845` **before** calling
+`reconstruct_haplotypes_from_sparse`, because their sizes equal
+`out_offsets[-1]` which is computed from `diffs`.  In the fused entry the caller
+cannot know `out_offsets[-1]` until after Rust returns — unless the fused entry
+accepts them as optional in/out parameters (like the existing FFI) or computes
+diffs in a pre-flight call.
+
+**Recommended approach for Task 13:** the fused entry accepts
+`annot_v_idxs: Option<PyReadwriteArray1<i32>>` and
+`annot_ref_pos: Option<PyReadwriteArray1<i32>>` as optional write buffers,
+mirroring the current `reconstruct_haplotypes_from_sparse` FFI.  The Python
+caller runs the non-annotated fused entry first when annotation is not needed
+(the common path), and uses a two-step approach (get offsets, alloc, call annotated
+variant) for the annotated path.  This keeps the common path at one crossing.
+
+### 5b. `intervals_to_tracks` contract bug (tracks path)
+
+**Filed bug mcvickerlab/GenVarLoader#242:**  
+`intervals_to_tracks` assumes `itv.start >= query_start` (documented in the numba
+source at `_intervals.py:73`).  For datasets with `max_jitter > 0`, jittered query
+start positions can be less than the stored interval starts, violating this
+contract. The numba backend silently returns wrong results; the Rust backend
+panics.
+
+**Task 14 scope:** the fused tracks entry REUSES the existing
+`intervals_to_tracks` core logic as-is.  It does NOT fix this bug.  The fix is
+deferred to a separate PR.
+
+**Consequence for parity testing:** Task 14's parity tests MUST use `max_jitter=0`
+datasets to stay within the contract.  This matches the current Task 11 parity test
+setup.
+
+### 5c. `_as_starts_stops` duplication
+
+The `_as_starts_stops` helper (`_genotypes.py:119–125`) converts 1-D offset arrays
+to `(2, n)` starts/stops.  It is called separately in:
+- `get_diffs_sparse` wrapper (H4)
+- `reconstruct_haplotypes_from_sparse` wrapper (H13)
+- `_shift_and_realign_tracks_sparse_rust_wrapper` (T10) — once per track
+
+After fusion, the Rust side can accept the offsets in either form and branch
+internally (the `(2,n)` row-split is a view, not a copy).  Alternatively, the
+Python caller can normalize once and pass the `(2,n)` array to all callers.
+
+### 5d. Splice plan path
+
+`_reconstruct_haplotypes` has a separate splice-plan branch
+(`_haps.py:793–829`) that calls `_permute_request_for_splice` and invokes
+`reconstruct_haplotypes_from_sparse` with reshuffled arrays.  The fused entry
+should accept an optional `permutation` array and perform the permutation in Rust,
+or alternatively the splice path can continue using the existing non-fused entry
+(since spliced reconstruction is already uncommon and correct).  Task 13 should
+explicitly decide this scope.
+
+---
+
+## 6. Files Affected by This Audit (no production changes)
+
+| File | Role |
+|------|------|
+| `python/genvarloader/_dataset/_haps.py` | haps path — `_prepare_request`, `_reconstruct_haplotypes`, `_reconstruct_annotated_haplotypes` |
+| `python/genvarloader/_dataset/_genotypes.py` | dispatch wrappers — `get_diffs_sparse`, `reconstruct_haplotypes_from_sparse` |
+| `python/genvarloader/_dataset/_reconstruct.py` | compound reconstructor — `HapsTracks.__call__` |
+| `python/genvarloader/_dataset/_tracks.py` | dispatch wrapper — `_shift_and_realign_tracks_sparse_rust_wrapper` |
+| `python/genvarloader/_dataset/_intervals.py` | dispatch wrapper — `intervals_to_tracks` |
+| `src/ffi/mod.rs` | current Rust `#[pyfunction]` entries (reference for Task 13/14 signatures) |
+| `src/reconstruct/mod.rs` | Rust `reconstruct_haplotypes_from_sparse` core |
+| `src/tracks/mod.rs` | Rust `shift_and_realign_tracks_sparse` core |
diff --git a/docs/roadmaps/phase-5-w4-final-ab.md b/docs/roadmaps/phase-5-w4-final-ab.md
new file mode 100644
index 00000000..fb8d5610
--- /dev/null
+++ b/docs/roadmaps/phase-5-w4-final-ab.md
@@ -0,0 +1,48 @@
+# Phase 5 W4 — Final single-thread numba-vs-rust `__getitem__` A/B
+
+**Date:** 2026-06-26 · **Branch measured:** `phase-5-w4` (≡ `rust-migration` + W3 fusion `phase-5-w3`; W2 is test-only and perf-neutral) · **Node:** shared Carter HPC, single-thread (`NUMBA_NUM_THREADS=1`; rust serial — rayon is W5).
+
+**Purpose:** the migration's final single-thread parity gate before the W5 consolidation (numba deletion + rayon). **Gate:** rust at parity-or-better single-thread across all `__getitem__` modes → proceed to consolidation. Benchmark-only; no code change.
+
+## Methodology (and why)
+
+The shared Carter node makes **absolute, cross-session wall-clock unreliable** — the same metric has drifted ≥2× between sessions minutes apart under variable load (round-3, PR #252). So this A/B follows the established rule: **measure rust AND numba in the SAME back-to-back session**, run twice to show within-session stability, and **pin the ratio direction explicitly** (here: `speedup = numba_ms / rust_ms`, higher ⇒ rust faster). The durable, trustworthy signal is **byte-identical numba/rust parity** (already gated across W1–W3 and the full parity suite) plus same-session improve-or-hold — not the absolute ms. The ms ratios below are reported as order-of-magnitude evidence, not precise constants.
+
+Two independent tools, both single-thread, both backends, one session:
+- `tests/benchmarks/test_e2e.py` — pytest-benchmark **pedantic min** (noise-robust per-call floor), seqlen 16384, batch 32, 50 rounds × 10 iterations, 5 warmup rounds.
+- `tests/benchmarks/profiling/profile.py` — steady-state **mean wall-clock throughput**, 1500 batches after burn-in, two passes.
+
+## Results
+
+### `test_e2e.py` pedantic-min (ms/batch; lower = faster)
+
+| Mode | rust min | numba min | speedup (numba÷rust) |
+|------|---------:|----------:|---------:|
+| haplotypes | 2.02 | 3.36 | **1.66×** |
+| annotated | 6.48 | 9.30 | **1.43×** |
+| tracks (haps+realigned tracks) | 2.01 | 3.34 | **1.66×** |
+| tracks_only (pure track path) | 1.04 | 1.11 | **1.07×** |
+| variants | — | — | xfail (pre-existing: `_FlatVariants.to_fixed` missing for `with_len`) |
+
+### `profile.py` steady-state throughput (ms/batch; pass 1 / pass 2)
+
+| Mode | rust | numba | speedup (pass1 / pass2) |
+|------|-----:|------:|---------:|
+| haplotypes | 2.27 / 2.02 | 3.63 / 3.34 | 1.60× / 1.65× |
+| annotated | 6.92 / 6.41 | 9.05 / 8.93 | 1.31× / 1.39× |
+| tracks (pure) | 1.08 / 1.08 | 1.13 / 1.12 | 1.05× / 1.04× |
+| tracks-seqs | 2.03 / 2.03 | 3.34 / 3.34 | 1.65× / 1.65× |
+| variants | 1.97 / 1.97 | 2.71 / 2.73 | 1.38× / 1.39× |
+| variant-windows | 0.78 / 0.78 | 3.57 / 3.57 | 4.58× / 4.58× |
+
+Both passes are tightly consistent (within-session stable), and the two tools agree.
+
+## Conclusion — GATE PASSED
+
+Rust is **parity-or-better single-thread on every mode**:
+- The pure **tracks-only** path is the tightest at ~1.04–1.07× — effectively parity, rust marginally ahead. This path is dominated by per-batch fixed cost (region indexing + interval memmap IO), not kernel compute, so the backend choice barely moves it; rust is never behind.
+- Every **compute-bound** path is clearly faster: haplotypes/tracks-seqs ~1.65×, annotated ~1.4×, variants ~1.4×, and **variant-windows ~4.6×** (fully rust-tokenized).
+
+Combined with byte-identical parity (W1–W3 + the full parity suite, both backends), there is no single-thread regression risk in removing numba. **→ Proceed to W5 (consolidation: golden-snapshot the numba-oracle parity suites, delete numba, add rayon batch parallelism gated byte-identical to the serial golden result).**
+
+Raw run logs: captured in-session (`profile.py` 6 modes × 2 backends × 2 passes; `test_e2e.py` 2 backends).
diff --git a/docs/roadmaps/phase-5-w6-perf-rebaseline.md b/docs/roadmaps/phase-5-w6-perf-rebaseline.md
new file mode 100644
index 00000000..1ca3482f
--- /dev/null
+++ b/docs/roadmaps/phase-5-w6-perf-rebaseline.md
@@ -0,0 +1,224 @@
+# Phase 5 W6 — Rayon serial-vs-multithread speedup re-baseline
+
+**Date:** 2026-06-27
+**Branch:** `phase-5-w6-wrapup`
+**HEAD:** `0968a0f5a3c2cbc34f3d4f358e30c3df8aecaa40`
+**Node:** shared Carter HPC, Intel Xeon E5-4650 v3 @ 2.10 GHz, 96 logical CPUs, linux-64
+**Corpus:** `tests/benchmarks/data/chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples, chr22, read-depth; `max_jitter=0`)
+**Build:** `pixi run -e dev maturin develop --release` (release profile, genvarloader v0.35.0)
+**Reference:** `tests/benchmarks/data/chr22.masked.fa.gz`
+
+---
+
+## Purpose
+
+After the W5 consolidation (numba deleted, rayon batch parallelism added, PR #260), this pass
+re-baselines the read path as a **same-session rayon serial-vs-multithread speedup curve** + peak-RSS
+deltas. There is no live numba A/B: numba was deleted in W5.
+
+For the final single-thread numba-vs-rust A/B (gate measured before W5), see:
+[`docs/roadmaps/phase-5-w4-final-ab.md`](phase-5-w4-final-ab.md)
+
+---
+
+## Node-noise caveat (IMPORTANT — read before comparing across sessions)
+
+The Carter HPC node is **shared**. Absolute wall-clock drifts ≥2× between sessions under
+variable load (documented across Phase 3 round-3, W4 A/B, and prior passes). Absolute ms/batch
+values are NOT comparable across sessions. The durable signal is:
+
+- **Same-session ratios** (thread-count N vs serial baseline, measured back-to-back).
+- **Deterministic correctness**: `serial == parallel == frozen golden` for all kernels
+  (`tests/parity/test_rayon_equivalence.py`, W5 gate).
+- **Instruction-count reductions** from round-3 tuning (documented in `rust-migration.md`).
+
+All tables in this document were captured in ONE continuous session on 2026-06-27.
+
+---
+
+## Methodology
+
+### e2e modes (haplotypes, annotated, tracks, tracks-only)
+
+Harness: `tests/benchmarks/test_e2e.py` via `pytest-benchmark` **pedantic min**.
+Configuration: `ROUNDS=50`, `ITERATIONS=10`, `WARMUP_ROUNDS=5`, `SEQLEN=16384`, `BATCH=32`.
+Each reported figure is `min` (ms/batch) — the most noise-robust estimate.
+
+```bash
+RAYON_NUM_THREADS=<N> GVL_NUM_THREADS=<N> pixi run -e dev pytest tests/benchmarks/test_e2e.py \
+    -q --benchmark-only --benchmark-disable-gc --benchmark-warmup-iterations=5
+```
+
+The `variants` e2e mode is `xfail` (pre-existing: `_FlatVariants.to_fixed` missing for `with_len`;
+predates this phase). Variants and variant-windows are measured via `profile.py` instead.
+
+### variants modes (variants, variant-windows)
+
+Harness: `tests/benchmarks/profiling/profile.py` **wall-clock average** (2000 batches, burn-in 5).
+
+```bash
+RAYON_NUM_THREADS=<N> GVL_NUM_THREADS=<N> pixi run -e dev python \
+    tests/benchmarks/profiling/profile.py --mode <mode> --n-batches 2000
+```
+
+### Peak-RSS
+
+Harness: `pixi run -e dev memray-tracks` / `memray-haps` + `python -m memray stats`.
+Default 2000 batches, no `RAYON_NUM_THREADS` / `GVL_NUM_THREADS` override for the "parallel"
+run; `RAYON_NUM_THREADS=1 GVL_NUM_THREADS=1` for the serial run.
+
+### Thread counts measured
+
+`RAYON_NUM_THREADS` (and `GVL_NUM_THREADS`) = **1** (serial baseline), **2**, **4**, **8**,
+**unset** (default = all available cores = 96 on this node).
+
+---
+
+## The `should_parallelize` threshold — why all modes stayed serial
+
+The `should_parallelize(total_bytes)` gate in `python/genvarloader/_threads.py` uses:
+
+```python
+_MIN_BYTES_PER_THREAD = 1 << 20  # 1 MiB
+return total_bytes >= num_threads() * _MIN_BYTES_PER_THREAD
+```
+
+`num_threads()` reads `GVL_NUM_THREADS` (or cgroup CPU count). The small benchmark corpus
+(BATCH=32, SEQLEN=16384) produces at most ~2 MiB of output per batch:
+
+**Batch composition:** Each batch is BATCH=32 (region, sample) index pairs (see `tests/benchmarks/_indices.py`).
+The corpus has 5 samples with ploidy 2 (diploid), so each region-sample pair yields 2 haplotype sequences.
+Output-byte figures are therefore:
+`n_pairs × haplotypes_per_sample × seqlen` for haplotypes, and
+`n_pairs × seqlen × bytes_per_element` for f32 tracks.
+
+| Mode | Output bytes per batch | Threshold at N threads | Parallel? |
+|------|----------------------|------------------------|-----------|
+| haplotypes (32 pairs × 2 haps/sample × 16384 bytes/hap) | 1,048,576 B (1 MiB) | N × 1 MiB | No at N≥2; borderline at N=1 |
+| tracks f32 (32 pairs × 16384 positions × 4 bytes/f32) | 2,097,152 B (2 MiB) | N × 1 MiB | Borderline at N=2 only |
+| annotated (haps + 2 × i32 arrays) | ~3 MiB | N × 1 MiB | No at N≥4 |
+| variants (ragged, variable) | ~few MiB | N × 1 MiB | No at N≥8 |
+
+**Conclusion: all modes ran serial for N≥4 and most modes ran serial at all N on this corpus.**
+This is correct behavior: the gate exists to prevent rayon spawn overhead from dominating short
+batches. **This is a finding, not a failure** — the parallelism gate is working as designed.
+
+> For production workloads at `SEQLEN≥131072` or `BATCH≥256`, most modes will cross the
+> threshold and rayon will engage. The gate's correctness (`serial == parallel == frozen golden`)
+> was already verified unconditionally in W5's `test_rayon_equivalence.py` parity suite.
+
+---
+
+## Results
+
+### e2e pedantic-min (ms/batch; lower = faster)
+
+Speedup = serial_min_ms / N_threads_min_ms (>1.0 means the multi-thread run was faster).
+All values are `min` (ms/batch) from pytest-benchmark pedantic runs.
+
+| Mode | T=1 (serial) | T=2 | T=4 | T=8 | T=all (96) | Note |
+|------|------------:|----:|----:|----:|----------:|------|
+| tracks-only | **1.0558** | 0.9559 | 1.0111 | 1.0122 | 0.9623 | All within session noise |
+| tracks (haps+realigned) | **2.0700** | 1.9484 | 2.0103 | 1.9521 | 1.9620 | All within session noise |
+| haplotypes | **2.0819** | 1.9722 | 2.0276 | 1.9661 | 1.9687 | All within session noise |
+| annotated | **6.6933** | 6.1536 | 6.2886 | 7.0523 | 6.1394 | All within session noise |
+
+Speedup vs serial (serial_min / thread_min; >1.0 = faster):
+
+| Mode | T=2 | T=4 | T=8 | T=all (96) |
+|------|----:|----:|----:|----------:|
+| tracks-only | 1.10× | 1.04× | 1.04× | 1.10× |
+| tracks | 1.06× | 1.03× | 1.06× | 1.06× |
+| haplotypes | 1.06× | 1.03× | 1.06× | 1.06× |
+| annotated | 1.09× | 1.06× | 0.95× | 1.09× |
+
+**All ratios are in the 0.95×–1.10× band — within shared-node noise. No mode shows a
+genuine rayon speedup, confirming that the threshold gate held serial execution throughout.**
+
+### variants modes wall-avg (ms/batch; lower = faster)
+
+| Mode | T=1 (serial) | T=2 | T=4 | T=8 | T=all (96) | Note |
+|------|------------:|----:|----:|----:|----------:|------|
+| variants | **2.085** | 2.129 | 2.019 | 2.036 | 2.054 | Within noise |
+| variant-windows | **0.798** | 0.794 | 0.812 | 0.806 | 0.802 | Within noise |
+
+Speedup vs serial:
+
+| Mode | T=2 | T=4 | T=8 | T=all (96) |
+|------|----:|----:|----:|----------:|
+| variants | 0.98× | 1.03× | 1.02× | 1.01× |
+| variant-windows | 1.01× | 0.98× | 0.99× | 1.00× |
+
+**All within noise. Serial execution confirmed for both variants modes at all thread counts.**
+
+### Summary: speedup never materialized on this corpus
+
+No mode crossed the `should_parallelize` threshold at N≥4 threads. At N=2, the tracks f32
+path sits exactly at the 2 MiB boundary but the measured ratio is still within session noise.
+
+The rayon parallelism gate functions correctly: it prevents spawn overhead from hurting small
+batches and yields identical output (proven by `test_rayon_equivalence.py`). The speedup curve
+for production-scale workloads is not measurable on this 32-batch / 16384-seqlen test corpus.
+
+---
+
+## Peak RSS
+
+Measured with memray (haps mode and tracks mode, serial vs parallel/unset):
+
+| Run | Mode | Serial (T=1) peak RSS | Parallel (unset) peak RSS | Δ |
+|-----|------|-----------------------|--------------------------|---|
+| memray-tracks | tracks | 3.525 GB | 3.525 GB | 0 |
+| memray-haps | haplotypes | 3.525 GB | 3.525 GB | 0 |
+
+Peak RSS is 3.525 GB in all cases, dominated by the seqpro/llvmlite JIT startup (~3.2 GB
+transitive via seqpro 0.20.0). Since the threshold gate held serial execution throughout,
+the rayon thread-pool overhead (stack allocations, worker threads) was never materialized.
+
+**GVL-attributable RSS delta: 0.** The ~3.2 GB floor is seqpro transitive numba, not
+gvl-own code. Removing numba from seqpro is explicitly out of scope for this migration
+(W5 seqpro caveat; user decision 2026-06-27).
+
+---
+
+## Numba A/B: unavailable (W5 deletion)
+
+Numba was deleted in W5 (PR #260). A live numba vs rust comparison is no longer possible on
+this branch. For the final single-thread numba-vs-rust speedup figures (all modes at
+parity-or-better), see:
+
+**[`docs/roadmaps/phase-5-w4-final-ab.md`](phase-5-w4-final-ab.md)**
+
+Summary of W4 final A/B (same-session, `phase-5-w4` branch, Carter HPC):
+
+| Mode | rust (ms/batch) | numba (ms/batch) | speedup (numba÷rust) |
+|------|----------------:|-----------------:|---------------------:|
+| haplotypes | 2.02 | 3.36 | **1.66×** |
+| annotated | 6.48 | 9.30 | **1.43×** |
+| tracks (haps+realigned) | 2.01 | 3.34 | **1.66×** |
+| tracks-only | 1.04 | 1.11 | **1.07×** |
+| variants | 1.97 | 2.71 | **1.38×** |
+| variant-windows | 0.78 | 3.57 | **4.58×** |
+
+---
+
+## GVL-attributable conclusion
+
+1. **Rayon implementation is correct.** `serial == parallel == frozen golden` for all kernels
+   (`test_rayon_equivalence.py`, W5 parity gate). No correctness regression.
+
+2. **Threshold gate works as designed.** On the small benchmark corpus (BATCH=32, SEQLEN=16384),
+   all modes ran serial at N≥4 because batch output bytes (~1–3 MiB) < N × 1 MiB threshold.
+   This is the expected and correct behavior.
+
+3. **Rayon speedup is not measurable on this corpus.** For production workloads at
+   `SEQLEN≥131072` or `BATCH≥256`, the threshold will be crossed and rayon will engage. The
+   correctness gate in `test_rayon_equivalence.py` covers those cases unconditionally.
+
+4. **Peak RSS is unchanged.** The gvl-attributable RSS delta is 0. The 3.525 GB process floor
+   is the seqpro transitive JIT, which is out of scope for this migration.
+
+5. **Single-thread headroom is already maximized.** W4 showed rust at parity-or-better on all
+   modes (up to 4.6× faster for variant-windows). The round-3 instruction-level tuning pass
+   (PR #252) confirmed deterministic instruction-count reductions across 7 hot kernels.
+   Rayon adds the future ability to scale throughput linearly with cores at production batch sizes.
diff --git a/docs/roadmaps/phase-5-w6-thin-shim-audit.md b/docs/roadmaps/phase-5-w6-thin-shim-audit.md
new file mode 100644
index 00000000..f4a29a79
--- /dev/null
+++ b/docs/roadmaps/phase-5-w6-thin-shim-audit.md
@@ -0,0 +1,265 @@
+# Phase 5 W6 — Thin-Shim Audit
+
+**Date:** 2026-06-27
+**Branch:** phase-5-w6-wrapup
+**Auditor:** Task 1 (automated, Claude)
+
+## Purpose
+
+Audit whether the Python layer over the PyO3 FFI surface is already a thin
+shim, or whether collapsible glue remains. This verdict determines whether
+Phase 5 "Collapse the PyO3 surface so Python is a true shim" can be ticked.
+
+---
+
+## Step 1 — Read-path call-chain inventory
+
+### `Dataset.__getitem__` (hot path, unspliced)
+
+```
+Dataset.__getitem__                          _impl.py:1743
+  → QueryView construction                  _impl.py:1776-1789   (indexing sugar — validated attr packing)
+  → getitem(view, idx)                      _query.py:66
+      → _getitem_unspliced(view, idx)        _query.py:154
+          parse_idx / jitter / to_rc         _query.py:162-175   (indexing sugar + numpy scalar ops)
+          → view.recon(...)                  _query.py:178       (dispatches to active Reconstructor)
+
+            BRANCH A: Haps.__call__
+              → Haps.get_haps_and_shifts     _haps.py:619
+                  → _prepare_request         _haps.py:675
+                      _get_geno_offset_idx   _haps.py:753        (np.unravel_index + np.ravel_multi_index)
+                      [optional] choose_exonic_variants          FFI: choose_exonic_variants
+                      → _haplotype_ilens     _haps.py:492
+                          → get_diffs_sparse                     FFI: get_diffs_sparse
+                      shift RNG              _haps.py:725-727    (numpy RNG call)
+                      lengths_to_offsets                         (seqpro utility, cumsum)
+                  → _reconstruct_haplotypes  _haps.py:809
+                      _out_per comparison    _haps.py:823-833    (ragged-vs-fixed detection, ~3 numpy ops)
+                      np.repeat(to_rc, p)    _haps.py:840        (to_rc expansion, batch-bounded)
+                      → reconstruct_haplotypes_fused             FFI: fused kernel (one crossing)
+                      _Flat.from_offsets     _haps.py:866        (zero-copy view wrap)
+
+            BRANCH B: Haps.__call__ (annotated kind)
+              same _prepare_request path as A, then:
+              → _reconstruct_annotated_haplotypes  _haps.py:919
+                  (same ragged-vs-fixed detection + to_rc expansion as A)
+                  → reconstruct_annotated_haplotypes_fused       FFI: fused kernel (one crossing)
+                  3× _Flat.from_offsets                          (zero-copy view wraps)
+
+            BRANCH C: HapsTracks.__call__
+              → haps.get_haps_and_shifts     (same as BRANCH A/B above)
+              per-track loop:
+                  out buffer allocation      _reconstruct.py:179  (np.empty, batch×ploidy×tracks f32)
+                  einops.repeat out_lengths  _reconstruct.py:180  (batch-bounded)
+                  lengths_to_offsets ×2      _reconstruct.py:183-184
+                  _lower_insertion_fills     _reconstruct.py:190  (strat list → id/params arrays)
+                  base_seed computation      _reconstruct.py:195-201 (np.bitwise_xor.reduce or rng.integers)
+                  _as_starts_stops once      _reconstruct.py:206  (offsets → (2,N) view)
+                  to_rc expansion (per-track) _reconstruct.py:235
+                  → intervals_and_realign_track_fused            FFI: fused kernel (one crossing per track)
+              _Flat.from_offsets             _reconstruct.py:280  (zero-copy wrap)
+
+            BRANCH D: Tracks.__call__  (reference-coordinate tracks, no haplotype re-alignment)
+              → _call_intervals              _tracks.py
+                  → intervals_to_tracks or realign FFI calls     (separate smaller kernels)
+
+            BRANCH E: Ref.__call__
+              → get_reference                                     FFI: get_reference (one crossing)
+
+          [optional] reverse_complement_ragged  _query.py:200   (variant types only, not byte/track data)
+          to_ragged / squeeze / reshape       _query.py:111-126  (output massaging — indexing sugar)
+```
+
+### `Dataset.__getitem__` (spliced path)
+
+The spliced path prepends a `build_recon_splice_plan` step (calls
+`haplotype_lengths_for_plan → get_diffs_sparse FFI`, plus `build_splice_plan`
+FFI) and passes the `SplicePlan` into the same `_reconstruct_haplotypes` /
+`_reconstruct_annotated_haplotypes` fused kernels, each of which then calls
+`_permute_request_for_splice` (Python permutation of per-element arrays, batch-bounded).
+
+---
+
+## Step 2 — FFI surface inventory
+
+`src/lib.rs` registers **33 entries** (32 `wrap_pyfunction!` + 1 `add_class`):
+
+| # | Symbol | Category |
+|---|--------|----------|
+| 1 | `count_intervals` | BigWig util |
+| 2 | `bigwig_intervals` | BigWig util |
+| 3 | `bigwig_write_track` | BigWig write |
+| 4 | `RustTable` (class) | Write path |
+| 5 | `ragged_to_padded` | Ragged util |
+| 6 | `intervals_to_tracks` | Track util |
+| 7 | `get_diffs_sparse` | Read-path helper |
+| 8 | `choose_exonic_variants` | Read-path helper |
+| 9 | `gather_rows_i32` | Genotype util |
+| 10 | `gather_rows_f32` | Genotype util |
+| 11 | `gather_alleles` | Genotype util |
+| 12 | `compact_keep_i32` | Genotype util |
+| 13 | `compact_keep_f32` | Genotype util |
+| 14 | `fill_empty_scalar_i32` | Genotype util |
+| 15 | `fill_empty_scalar_f32` | Genotype util |
+| 16 | `fill_empty_fixed_i32` | Genotype util |
+| 17 | `fill_empty_fixed_f32` | Genotype util |
+| 18 | `fill_empty_seq_u8` | Genotype util |
+| 19 | `fill_empty_seq_i32` | Genotype util |
+| 20 | `assemble_variant_buffers_u8` | Variant buffer |
+| 21 | `assemble_variant_buffers_i32` | Variant buffer |
+| 22 | `rc_alleles` | Allele RC |
+| 23 | `get_reference` | Read-path — reference sequences |
+| 24 | `reconstruct_haplotypes_from_sparse` | Read-path helper (non-fused) |
+| 25 | `reconstruct_haplotypes_fused` | **Fused `__getitem__` kernel** |
+| 26 | `reconstruct_annotated_haplotypes_fused` | **Fused `__getitem__` kernel** |
+| 27 | `reconstruct_haplotypes_spliced_fused` | **Fused `__getitem__` kernel** |
+| 28 | `reconstruct_annotated_haplotypes_spliced_fused` | **Fused `__getitem__` kernel** |
+| 29 | `shift_and_realign_tracks_sparse` | Track util (non-fused) |
+| 30 | `tracks_to_intervals` | Track util |
+| 31 | `intervals_and_realign_track_fused` | **Fused `__getitem__` kernel** |
+| 32 | `_debug_xorshift64` | Debug/parity (Task 7) |
+| 33 | `_debug_hash4` | Debug/parity (Task 7) |
+
+**Fused `__getitem__` kernels:** 5 (entries 25–28 + 31 = `reconstruct_haplotypes_fused`,
+`reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused`,
+`reconstruct_annotated_haplotypes_spliced_fused`, `intervals_and_realign_track_fused`).
+
+`assemble_variant_buffers_{u8,i32}` (entries 20–21) are used on the variant-windows and
+flat-variants path, not the primary `__getitem__` hot path for byte sequences or tracks.
+
+---
+
+## Step 3 — Dispatch layer check
+
+```
+$ ls python/genvarloader/_dispatch.py 2>&1
+No such file or directory
+```
+
+```
+$ grep -rn "GVL_BACKEND|_dispatch|import numba|from numba|nb\.njit|nb\.prange" python/genvarloader/ --include=*.py
+(zero matches)
+```
+
+**Result:** `_dispatch.py` does not exist. No `GVL_BACKEND`, `_dispatch`, or
+numba import found anywhere in `python/genvarloader/`. The dispatch layer is
+fully gone; Python calls Rust directly. Stale bytecode
+`__pycache__/_dispatch.cpython-*.pyc` was removed (no file existed to remove).
+
+---
+
+## Step 4 — Three-bucket classification
+
+### Bucket definitions
+
+- **Bucket 1 — Intentional shim:** Indexing sugar, torch/device handling,
+  validation, error messages, output massaging. Stays in Python by design.
+- **Bucket 2 — Remaining collapsible glue:** Per-batch coercion / allocation /
+  object churn worth a future kernel. Not negligible overhead today.
+- **Bucket 3 — Already-collapsed:** One FFI crossing, no material Python work.
+
+### Classification table
+
+| Python step | Location | Bucket | Justification |
+|-------------|----------|--------|---------------|
+| `QueryView` construction | `_impl.py:1776` | 1 | Attr packing; zero array work |
+| `parse_idx` / index validation | `_query.py:162` | 1 | Indexing sugar |
+| Jitter offset computation | `_query.py:168-171` | 1 | One `rng.integers` + 2 in-place scalar ops; batch-bounded |
+| `to_rc` derivation from strand column | `_query.py:174` | 1 | One boolean comparison on a slice |
+| `_get_geno_offset_idx` | `_haps.py:753` | 1 | Two `np.unravel_index` / `ravel_multi_index` over `(b,)` / `(b, p)` arrays; indexing sugar for genotype address translation |
+| `choose_exonic_variants` (optional) | `_haps.py:698` | 3 | Thin wrapper; one FFI crossing |
+| `get_diffs_sparse` | `_haps.py:518` | 3 | Thin wrapper; one FFI crossing |
+| Shift RNG call | `_haps.py:725` | 1 | One `rng.integers`; intentional Python-side random state |
+| `lengths_to_offsets` | `_haps.py:736` | 1 | Cumsum utility; negligible, batch-bounded |
+| Ragged-vs-fixed detection (`_out_per` comparison) | `_haps.py:823` | 1 | 3 numpy ops on `(b*p,)` arrays; determines kernel mode flag |
+| `np.repeat(to_rc, ploidy)` + `ascontiguousarray` | `_haps.py:840` | 1 | Expands `(b,)` → `(b*p,)` bool; batch-bounded, no alternative without a kernel API change |
+| `ascontiguousarray` coercions on `regions`, `shifts`, `geno_offset_idx`, `keep`, `keep_offsets` | `_haps.py:843-861` | 1 | All batch-bounded (b or b×p arrays); guard FFI typing; zero-copy when already contiguous (common case via `_prepare_request`) |
+| `_ffi_array` checks on `geno_v_idxs` | `_haps.py:847` | 1 | Zero-copy assertion guard; per-sample-scale memmap — correctly NOT coercing |
+| `reconstruct_haplotypes_fused` | `_haps.py:842` | 3 | **One FFI crossing** |
+| `_Flat.from_offsets` (post-kernel) | `_haps.py:866` | 1 | Zero-copy view wrap; no array work |
+| `reconstruct_annotated_haplotypes_fused` | `_haps.py:957` | 3 | **One FFI crossing** |
+| `reconstruct_haplotypes_spliced_fused` | `_haps.py:884` | 3 | **One FFI crossing** |
+| `reconstruct_annotated_haplotypes_spliced_fused` | `_haps.py:1015` | 3 | **One FFI crossing** |
+| `_permute_request_for_splice` | `_haps.py:1056` | 1 | Batch-bounded permutation of per-element arrays for the splice plan; structural pre-processing, not a hot inner loop on the read path |
+| `HapsTracks` out-buffer allocation (`np.empty`) | `_reconstruct.py:179` | 1 | Allocates a single `(b*p*t)` f32 buffer; standard pre-allocation pattern before an in-place kernel |
+| `einops.repeat out_lengths` | `_reconstruct.py:180` | 1 | Batch-bounded broadcast; library call |
+| `lengths_to_offsets` ×2 | `_reconstruct.py:183-184` | 1 | Cumsum; batch-bounded |
+| `_lower_insertion_fills` | `_reconstruct.py:190` | 1 | Converts Python strategy objects → id/params arrays; O(n_tracks) not O(batch) |
+| `base_seed` computation | `_reconstruct.py:195` | 1 | One RNG or xor-reduce; Python-side randomness |
+| `_as_starts_stops` once per batch | `_reconstruct.py:206` | 1 | Converts offsets to (2, N) view; called once per batch (amortized over tracks). Wraps `ascontiguousarray` on the sample-scale offsets array — this IS a candidate for caching but is a read, not a write |
+| per-track `to_rc` `np.repeat` + `ascontiguousarray` | `_reconstruct.py:235` | 1 | Same batch-bounded expansion as haps; repeated once per track |
+| per-track `ascontiguousarray` coercions | `_reconstruct.py:239-268` | 1 | All batch-bounded; guard FFI typing |
+| `intervals_and_realign_track_fused` (per track) | `_reconstruct.py:237` | 3 | **One FFI crossing per track** |
+| `_getitem_unspliced` post-kernel shaping (`to_ragged`, `to_fixed`, squeeze) | `_query.py:95-126` | 1 | Output format massaging; indexing sugar |
+| `reverse_complement_ragged` (variant types only) | `_query.py:200` | 1 | Post-kernel Python RC; only for RaggedVariants / FlatVariants / FlatVariantWindows — byte/track RC is already folded in-kernel |
+| `get_reference` | `_reference.py` | 3 | One FFI crossing |
+
+### `ascontiguousarray` on per-sample-scale memmaps
+
+`_ffi_array` (`_utils.py:13`) is used for the four per-sample-scale memmap
+arguments (`geno_v_idxs`, `itv_starts`, `itv_ends`, `itv_values`,
+`itv_offsets`) — it asserts contiguity and raises a precise error instead of
+silently copying. The memory-map note in `_utils.py` confirms this is the
+correct behavior: "coercing would force a sample-scale copy." There are **zero
+`ascontiguousarray` calls on per-sample-scale memmaps** in the hot read path;
+all surviving `ascontiguousarray` calls are on batch-bounded arrays (`b` or
+`b×p` arrays that are typically already contiguous in practice but require an
+explicit dtype cast for the FFI boundary).
+
+### Phase 3 optimization targets cross-reference
+
+The Phase 3 audit (`docs/roadmaps/phase-3-getitem-glue-audit.md`) identified
+three bucket-2 items that have since been resolved:
+
+1. **Zero-copy `_ffi_array`** — implemented (`_utils.py:13`); per-sample-scale
+   memmaps now assert-no-copy rather than silently coercing.
+2. **`_HapsFfiStatic` caching** — implemented (`_haps.py:240`); v_starts,
+   ilens, alt_alleles, alt_offsets, ref, ref_offsets are coerced once at first
+   access and cached for the lifetime of the `Haps` reconstructor.
+3. **Uninit buffers** — the fused kernels all allocate their output internally
+   (Rust-side `Vec::with_capacity` / `uninit`), except for the `HapsTracks`
+   `np.empty` pre-alloc which is a single batch-bounded f32 buffer — correct
+   pattern.
+
+---
+
+## Step 5 — Verdict
+
+**The shim is already thin. Bucket-2 is empty.**
+
+Every Python step on the hot `__getitem__` path falls into Bucket 1
+(intentional shim: indexing sugar, output format conversion, Python-side RNG,
+FFI typing guards) or Bucket 3 (one FFI crossing). There is no per-batch
+coercion or allocation that is both (a) non-trivial in cost and (b) collapsible
+into a Rust kernel without restructuring the public Python API.
+
+The one observable pattern that comes closest to bucket-2 — repeated
+`ascontiguousarray` calls before each fused-kernel call — is already correct
+behavior: those arrays are batch-bounded (small), the coercions are no-ops when
+arrays are already contiguous (which they are after `_prepare_request`), and
+the dtype-cast form serves as a static type guarantee at the FFI boundary. The
+`_HapsFfiStatic` cache already handles the only array that would otherwise
+require a per-batch copy at scale (the sub-linear variant/reference arrays).
+
+The `_as_starts_stops` call in `HapsTracks.__call__` (computes a `(2, N)`
+view of the genotype offsets once per batch) is the one borderline item:
+it calls `ascontiguousarray` on the sample-scale offsets array each batch.
+However, the offsets `Ragged` is a memmap whose backing array is already
+C-contiguous in practice (written as a plain `np.memmap`), so the
+`ascontiguousarray` call is typically a no-op. Caching the `(2, N)` view on
+`Haps` (similar to `_HapsFfiStatic`) would be a clean micro-optimization but
+is not needed to call the shim thin.
+
+**The single-big-`__getitem__`-kernel collapse is not warranted as Phase 5
+work.** The five fused kernels already express one FFI crossing per
+reconstruction path. Further collapse would require moving index resolution
+(jitter, RC derivation, output shaping) into Rust, which would complicate the
+public API and add no meaningful throughput gain relative to the rayon batch
+parallelism already landed in W5.
+
+**Dispatch-layer status:** fully gone (confirmed Step 3). No `_dispatch.py`,
+no `GVL_BACKEND`, no numba imports in `python/genvarloader/`.
+
+**FFI surface count:** 33 registered entries; 5 are fused `__getitem__` kernels;
+the remainder are write-path utils, ragged utilities, and genotype/variant
+helpers that are already called directly (no Python wrappers remaining).
diff --git a/docs/roadmaps/round3-profile-baseline.md b/docs/roadmaps/round3-profile-baseline.md
new file mode 100644
index 00000000..a9813b33
--- /dev/null
+++ b/docs/roadmaps/round3-profile-baseline.md
@@ -0,0 +1,75 @@
+# Round-3 Profiling Baseline
+
+Captured 2026-06-25 on the Carter node.  
+Build: `maturin develop --release`, corpus `tests/benchmarks/data/chr22_geuv.gvl`,
+`with_len(16384)`, `BATCH=32`, `NUMBA_NUM_THREADS=1`.
+
+---
+
+## Starting Rust ÷ Numba Ratios
+
+| Path | Metric | Rust | Numba | Rust ÷ Numba |
+|------|--------|------|-------|--------------|
+| tracks-only | pedantic min (ms/batch) | 1.091 | 1.121 | **0.97** |
+| haplotypes | pedantic min (ms/batch) | 2.348 | 3.372 | **0.70** |
+| variants | wall avg (ms/batch) | 2.293 | 2.859 | **0.80** |
+| variant-windows | wall avg (ms/batch) | 2.117 | 3.773 | **0.56** |
+
+All four paths are already faster in Rust than Numba, so these are the baselines
+to beat, not ceilings. Ratios < 1.0 mean Rust is faster.
+
+---
+
+## Consolidated Flat Self-Time Table
+
+Measured with `perf record -F 999 --no-children` over 12 000 batches per path (Rust only).
+Rows = Rust kernel symbols appearing in any path's top self-time.
+Columns = self-time % in that path (blank = not observed).
+**Aggregate = sum of self-time % across all paths** — the descending sort of this
+column is the tuning target order for all later round-3 tasks.
+
+| Symbol | tracks | haplotypes | variants | variant-windows | **Aggregate** |
+|--------|:------:|:----------:|:--------:|:---------------:|:-------------:|
+| `genvarloader::intervals::intervals_to_tracks` | 26.08 | 16.64 | 17.60 | — | **60.32** |
+| `genvarloader::variants::windows::tokenize` | — | — | — | 28.14 | **28.14** |
+| `genvarloader::tracks::shift_and_realign_tracks_sparse` | — | 13.03 | 12.70 | — | **25.73** |
+| `genvarloader::variants::windows::slice_flanks` | — | — | — | 20.14 | **20.14** |
+| `genvarloader::variants::windows::assemble_alt_window` | — | — | — | 13.26 | **13.26** |
+| `genvarloader::reverse::rc_flat_rows_inplace` | — | 9.31 | — | — | **9.31** |
+| `genvarloader::ffi::intervals_and_realign_track_fused` | — | 4.54 | 4.43 | — | **8.97** |
+| `genvarloader::reconstruct::reconstruct_haplotypes_from_sparse` | — | 4.47 | — | — | **4.47** |
+| `ndarray::dimension::do_slice` | — | 1.92 | — | 0.64 | **2.56** |
+| `ndarray::impl_methods::<impl ndarray::ArrayRef<A,D>>::slice_mut` | — | 1.89 | — | 0.61 | **2.50** |
+| `genvarloader::reference::get_reference::{{closure}}` | — | — | — | 1.51 | **1.51** |
+| `genvarloader::genotypes::get_diffs_sparse` | — | 0.81 | 0.44 | — | **1.25** |
+| `genvarloader::variants::gather_alleles` | — | — | 0.54 | 0.55 | **1.09** |
+| `genvarloader::variants::windows::fetch_windows` | — | — | — | 0.22 | **0.22** |
+| `genvarloader::variants::windows::gather_starts_ilens` | — | — | — | 0.17 | **0.17** |
+| `genvarloader::reference::get_reference` | — | — | — | 0.13 | **0.13** |
+| `genvarloader::variants::gather_rows_i32` | — | — | — | 0.11 | **0.11** |
+
+### Notes
+
+- `__memset_avx2_unaligned_erms` (libc) appears at 12.89% in tracks and 3.89% in
+  haplotypes as the second-largest entry — it is called from within
+  `intervals_to_tracks` (zero-filling output buffers) and thus captured under the Rust
+  symbol in any inlined build; it is not an independent target.
+- `ndarray::dimension::do_slice` and `ndarray::impl_methods::slice_mut` are from the
+  `ndarray` crate (not genvarloader-specific). They accumulate 2.56% and 2.50%
+  aggregate respectively; addressable only by restructuring how outputs are sliced, not
+  by rewriting a kernel.
+- `genvarloader::ffi::intervals_and_realign_track_fused` (haplotypes 4.54%,
+  variants 4.43%) is the combined FFI trampoline for intervals + track realignment;
+  it likely contains overhead that belongs to either `intervals_to_tracks` or
+  `shift_and_realign_tracks_sparse` when fused.
+
+### Descending Target Order for Round-3 Tuning Tasks
+
+1. `genvarloader::intervals::intervals_to_tracks` — Aggregate **60.32%** (shared: tracks + haps + variants)
+2. `genvarloader::variants::windows::tokenize` — **28.14%** (variant-windows only)
+3. `genvarloader::tracks::shift_and_realign_tracks_sparse` — **25.73%** (haps + variants)
+4. `genvarloader::variants::windows::slice_flanks` — **20.14%** (variant-windows only)
+5. `genvarloader::variants::windows::assemble_alt_window` — **13.26%** (variant-windows only)
+6. `genvarloader::reverse::rc_flat_rows_inplace` — **9.31%** (haplotypes only)
+7. `genvarloader::ffi::intervals_and_realign_track_fused` — **8.97%** (haps + variants)
+8. `genvarloader::reconstruct::reconstruct_haplotypes_from_sparse` — **4.47%** (haplotypes only)
diff --git a/docs/roadmaps/rust-migration.md b/docs/roadmaps/rust-migration.md
index 27771002..8ed11a58 100644
--- a/docs/roadmaps/rust-migration.md
+++ b/docs/roadmaps/rust-migration.md
@@ -6,6 +6,19 @@ This is a living tracker. **Any work that touches the Rust migration must read t
 first and update it as part of the change** — tick completed tasks, record measurements
 under the relevant checkpoint, and update the phase status marker + PR link.
 
+## Branch & gate strategy (changed as of Phase 2, 2026-06-24)
+
+Phases 0–1 were merged to `main` incrementally. **From Phase 2 onward the work accumulates on
+a single persistent integration branch (`rust-migration`) with NO per-phase throughput gate**,
+and ships as ONE big merge at the end. Rationale: profiling Phase 2 showed the read-path
+overhead is per-kernel Python dispatch glue (redundant `np.ascontiguousarray` coercions +
+FFI boundary crossings), not rust compute — so the real win comes from collapsing
+`__getitem__` into a single large rust kernel, which can only be done once enough of the
+read path is in Rust. Gating each intermediate phase on throughput would block correct,
+parity-verified work behind an overhead that the architecture is designed to delete later.
+**Per-phase gate is now parity only**; a dedicated optimization pass (eliminate glue →
+single big `__getitem__` kernel) re-establishes the throughput gate before the final merge.
+
 ---
 
 ## Goal & end state
@@ -89,9 +102,9 @@ py310–313 × linux/macOS as the Rust surface grows.
 
 | Metric | Corpus | Baseline | Captured |
 |---|---|---|---|
-| `gvl.write()` wall-clock | 1kg chr21/chr22 (100 regions), macOS M-series | 1.143 s | ✅ |
-| `gvl.write()` peak RSS | 1kg chr21/chr22 (100 regions), macOS M-series | 3.593 GB | ✅ |
-| `gvl.update()` wall-clock | 1kg chr21/chr22 (vcfixture tier) | _TBD_ (smoke only: 0.022 s for a 60-row synthetic annot track — not a real workload) | ⬜ |
+| `gvl.write()` wall-clock | 1kg chr21/chr22 (100 regions), macOS M-series | 1.143 s (**superseded for comparison** — macOS/1kg-VCF; see Phase 4 Carter re-baseline) | ✅ |
+| `gvl.write()` peak RSS | 1kg chr21/chr22 (100 regions), macOS M-series | 3.593 GB (**superseded for comparison** — macOS/1kg-VCF; see Phase 4 Carter re-baseline) | ✅ |
+| `gvl.update()` wall-clock | 1kg chr21/chr22 (vcfixture tier) | ~~_TBD_ (smoke only: 0.022 s for a 60-row synthetic annot track — not a real workload)~~ **Phase 4 re-baseline (Carter, chr22_geuv): 0.081 s** (peak RSS 3.519 GB whole-process — dominated by base-dataset write; see Phase 4 gate footnote ¹) | ✅ |
 | `Dataset.__getitem__` throughput (tracks mode = `intervals_to_tracks` read path) | `chr22_geuv` realistic bench (165 regions × 5 samples, chr22, read-depth; `SEQLEN=16384`, `BATCH=32`, 2000 batches, `NUMBA_NUM_THREADS=1`), Carter HPC (AMD EPYC 7543, linux-64) | **169.9 batch/s** (5.886 ms/batch, ~5.4k item/s); peak RSS **3.531 GB** | ✅ |
 
 > getitem baseline captured on Carter (2026-06-23, gvl 0.35.0, `GVL_BACKEND` unset →
@@ -195,9 +208,11 @@ rather than a GVL-in-house reimplementation (see decision 2026-06-23). Bottom-up
       that owns the `Ragged` layout (offsets + data buffers) and its core ops.
 - [x] Port the last two numba ops to Rust inside `seqpro-core`: `to_padded` and
       `reverse_complement`. seqpro's ragged layer is now numba-free.
-- [x] GVL consumes `seqpro-core` via a Cargo path-dep (editable; flip to
-      git/crates.io before shipping). `src/ragged/` is a bridge adapter, not a
-      reimplementation.
+- [x] GVL consumes `seqpro-core` via a crates.io registry dep (`seqpro-core = "0.1"`,
+      resolves to `0.1.0` from `registry+https://github.com/rust-lang/crates.io-index`,
+      checksum verified in `Cargo.lock`). No path dep or `[patch]` override — the
+      shipping prerequisite is already satisfied. `src/ragged/` is a bridge adapter,
+      not a reimplementation.
 - [x] Proof-point op (`to_padded`) rerouted through the shared `seqpro-core` kernel
       in GVL with byte-identical parity confirmed.
 - [x] Remove `awkward` from the foundation layer. (GVL migrated onto seqpro's
@@ -207,49 +222,560 @@ rather than a GVL-in-house reimplementation (see decision 2026-06-23). Bottom-up
 **Checkpoint:** parity green (byte-identical `to_padded`). Foundational — no perf gate,
 but record incidental wins. Relevant prior work: [[project_ragged_assembly_bottleneck]].
 
-### Phase 2 — Genotype assembly + variant gather ⬜
-_PR: —_
+### Phase 2 — Genotype assembly + variant gather ✅ (parity-verified; perf deferred to consolidation)
+_Branch: `rust-migration` (persistent integration branch — see "Branch & gate strategy" below). Not separately merged to `main`._
 
-- [ ] Migrate `_dataset/_genotypes.py` kernels (6 numba) onto the Rust layout.
-- [ ] Migrate `_dataset/_flat_variants.py` kernels (7 numba).
-- [x] Migrate `_dataset/_rag_variants.py`; drop `awkward` from these hot paths. (Done at the Python level: `RaggedVariants` now wraps a single record `seqpro.rag.Ragged`; no numba kernels remain in this file — any remaining numba rewrites are tracked in the unchecked items below.)
+- [x] Migrate `_dataset/_genotypes.py` **assembly/selection** kernels: `get_diffs_sparse`,
+      `choose_exonic_variants`. (The `_genotypes.py` *reconstruction* kernels —
+      `reconstruct_haplotypes_from_sparse` et al. — are Phase 3, not Phase 2; the earlier
+      "6 numba" figure double-counted them.) Dead `filter_af` deleted (zero production
+      callers; AF filtering is inline numpy in `_haps.py`/`_flat_variants.py`) — same
+      precedent as the Phase 0 `splits_sum_le_value` dead-path removal. Its dedicated unit
+      test was removed with it.
+- [x] Migrate `_dataset/_flat_variants.py` kernels (7 numba): `_gather_v_idxs` + `_gather_v_idxs_ss`
+      → `gather_rows` (unified via `(2,n)` offset normalization), `_gather_alleles`,
+      `_compact_keep`, `_fill_empty_scalar`, `_fill_empty_fixed`, `_fill_empty_seq`.
+- [x] Migrate `_dataset/_rag_variants.py`; drop `awkward` from these hot paths. (Done at the Python level: `RaggedVariants` now wraps a single record `seqpro.rag.Ragged`; no numba kernels remain in this file.)
 
-**Gate:** parity + `Dataset.__getitem__` throughput vs baseline (target speedup, no
-regression).
+**Architecture:** pure-`ndarray` cores in `src/genotypes/` + `src/variants/`; PyO3 only in
+`src/ffi/`; per-kernel dispatch via `genvarloader._dispatch` (default `rust`, `GVL_BACKEND`
+override); numba impls retained as registered parity references (deleted wholesale in Phase 5).
 
-### Phase 3 — Reconstruction + track realignment ⬜
-_PR: —_
+**Dtype-correctness (beyond the plan):** the flat gather/fill kernels are NOT v_idxs-only — they
+also run on float32 dosage and **arbitrary-dtype** custom per-call FORMAT fields (issue #231, e.g.
+`int16`). The numba refs preserved input dtype; a naive int32/float32-only port silently corrupted
+them (caught here: float32 dosage `[0.25,0.75]`→`[0,0]`). Final design dispatches by dtype —
+`*_i32`/`*_f32` rust cores for the hot paths + a **dtype-preserving numba fallback** for all other
+dtypes, with direct regression tests (int16/int64/float32) locking it.
+
+**Gate (parity — MET):** byte-identical parity for every ported kernel via `@pytest.mark.parity`
+hypothesis suites (both returned arrays for tuple kernels), plus a spy-guarded variants-mode
+dataset backstop proving the rust kernels run on the live `__getitem__` path. Full tree green:
+904 passed (rust) / 617 passed (numba backend, dataset+unit); lint/format/typecheck clean;
+`cargo test` green; abi3 build OK. (One pre-existing unrelated failure, `test_e2e_variants`, is a
+`with_len`-on-variants benchmark bug that fails identically at the Phase-2 base — not introduced here.)
+
+**Gate (throughput — DEFERRED, not a blocker):** see "Branch & gate strategy". Measured medians
+(`chr22_geuv`, `NUMBA_NUM_THREADS=1`, Carter):
+
+| Mode | rust | numba (same session) | documented baseline |
+|---|---|---|---|
+| haplotypes | 128.8 batch/s | 137.9 | 123.9 |
+| variants | 139.5 batch/s | 149.3 | 145.3 |
+
+rust is a **stable ~7% slower than numba** (rust-haps still beats the 123.9 baseline; rust-variants
+is ~4% below its 145.3 baseline). cProfile of the rust variants `__getitem__` shows the cost is
+**pure Python glue, not rust compute**: `np.ascontiguousarray` is 28,800 calls / 3.98 s = **62%** of
+the loop (~36 redundant coercions per batch in the per-kernel dispatch wrappers), while the rust
+kernels themselves are negligible (`gather_alleles` 0.012 s, `get_diffs_sparse` 0.010 s). This
+validates collapsing the read path toward a **single big rust `__getitem__` kernel** (drop redundant
+coercions short-term; eliminate per-kernel boundary crossings + intermediate numpy allocs long-term),
+addressed in a dedicated optimization pass before the final merge.
+
+### Phase 3 — Reconstruction + track realignment ✅ (parity-verified; throughput recorded)
+_PR: [#245](https://github.com/mcvickerlab/GenVarLoader/pull/245) → rust-migration_
+
+The numba bulk and the big read-path win. Ported 8 kernel groups behind dispatch (reference,
+haplotype reconstruct singular+batch, PRNG, insertion-fill, track realignment, RLE) plus fused
+`__getitem__` entries for both haplotypes and tracks. Default backend is `rust`; numba retained
+as the registered parity reference for the consolidation pass (Phase 5).
+
+- [x] Task 12: Audit `__getitem__` glue (2 FFI crossings → inventory; `docs/roadmaps/phase-3-getitem-glue-audit.md`).
+- [x] Task 13: Fused haplotypes `__getitem__` kernel — `reconstruct_haplotypes_fused` collapses 2 FFI crossings to 1 on the non-splice plain haps path. Dataset parity gate: byte-identical to composed numba oracle (37/37 parity tests pass). Annotated path and splice path remain on unfused dispatched kernels (documented in task-13-report.md).
+- [x] Task 14: Fused tracks `__getitem__` kernel — `intervals_and_realign_track_fused` chains `intervals_to_tracks` → `shift_and_realign_tracks_sparse` in 1 FFI crossing per track; Rust scratch buffer replaces Python `np.empty` intermediate. Dataset parity gate: byte-identical across all 5 insertion-fill strategies (39/39 parity tests pass; fixture uses max_jitter=0 per #242 contract).
+- [x] Task 15: Full-tree verification + roadmap + skill check (final-review fixes applied). Full tree green: 909 passed, 15 xfailed (11 added here + 4 pre-existing), 0 failed. Lint/format clean; cargo 85/85; abi3 wheel builds. See final-review section in task-15-report.md.
+- [x] Migrate `_dataset/_reconstruct.py` + `_dataset/_haps.py` remaining paths. Annotated path now fused via `reconstruct_annotated_haplotypes_fused` (Phase 3 close-out, Task 4); splice path fused via `reconstruct_haplotypes_spliced_fused` (Phase 3 close-out, Task 5). Both byte-identical to the composed numba oracle. The annotated+spliced intersection is now fused via `reconstruct_annotated_haplotypes_spliced_fused` (Phase 5 W3): one FFI crossing, RC folded in-kernel (bytes reverse-complemented, both annotation arrays reversed), byte-identical to the composed numba oracle, covered by `tests/parity/test_annotated_spliced_haplotypes_parity.py`.
+- [x] Migrate `_dataset/_tracks.py` realign (6 numba) + `_dataset/_intervals.py` (4 numba). Rust-default + fused (`intervals_and_realign_track_fused`); the #242 `intervals_to_tracks` clip fix merged from main (both backends). Remaining numba kernels are retained Phase-5-deletion parity references, not unmigrated paths.
+- [x] Migrate `_dataset/_reference.py` (6 numba). `Reference.fetch` rerouted through the dispatched rust `get_reference` (Phase 3 close-out, Task 3); the three zero-caller `_fetch_*` numba functions deleted. The live `_get_reference_*` numba kernels remain as Phase-5-deletion parity references.
+- [x] Migrate `_dataset/_insertion_fill.py` + `_dataset/_splice.py`. No numba kernels remain to migrate in `_insertion_fill.py`; splice reconstruction fused via `reconstruct_haplotypes_spliced_fused` (Phase 3 close-out, Task 5).
+
+**Gate (parity — MET):** byte-identical parity confirmed, with two documented numba-bug sub-domains excluded from the oracle via assume(False) in parity tests (consistent with the #242-family precedent):
+  1. *start>=clen / #242-family*: get_dummy_dataset() (max_jitter=2) float-track tests trigger the intervals_to_tracks debug_assert panic; xfailed (strict=False) in 10 tests across test_output_bytes_per_instance.py, test_dummy_dataset_insertion_fill.py, test_flat_intervals.py, test_realign_tracks.py, test_seqs_tracks.py.
+  2. *reconstruct trailing-under-write*: a deletion that drives ref_idx past the contig end causes numba's trailing-fill to behave differently from Rust (numba uses Python-style negative-index slicing; Rust clamps out_end_idx to 0). Both behaviors are undefined for inputs outside the production contract (variants always within contig bounds). Excluded via (a) overshoot pre-check in the reconstruct parity tests and (b) double-init guard (sentinel 0x00 vs 0xFF, and int32 sentinel 0 vs -1 for annotation buffers) to catch any positions numba leaves unwritten. Rust is correct in both cases; numba is not a valid oracle in this sub-domain.
+
+**Gate (throughput — DEFERRED):** recorded only (see "Branch & gate strategy").
+
+#### Phase 3 throughput measurements (re-measured at close-out, 2026-06-25)
+
+> Harness: `tests/benchmarks/test_e2e.py` via **pytest-benchmark** — steady-state timing of eager
+> `ds[r, s]` (BATCH=32 region/sample pairs, `with_len(SEQLEN=16384)`), warmup excluded, 75–190 rounds
+> per test. Corpus `chr22_geuv.gvl` (max_jitter=0, 165 regions × 5 samples, chr22 read-depth).
+> `NUMBA_NUM_THREADS=1`, release build (`maturin develop --release`), HEAD `6af2dbb`, Carter HPC
+> (AMD EPYC 7543, linux-64). OPS = batch/s = 1 / mean.
+>
+> ⚠️ **Not comparable to the prior table.** The old ~37 haps / ~20 tracks figures came from a
+> *different* harness (the 500-batch `benchmark_haps.py` script, since retired here). Read the
+> **rust ÷ numba ratio** measured on this one harness at one HEAD as the real signal, not the
+> absolute jump. Single-thread; both backends' batch drivers are serial (rayon deferred to Phase 5).
+
+| Mode | rust (batch/s) | numba (batch/s) | rust ÷ numba |
+|---|---|---|---|
+| tracks-only (`intervals_and_realign_track_fused`) | 173.2 | 192.2 | 0.90× |
+| tracks (seqs + `read-depth`) | 124.2 | 143.2 | 0.87× |
+| haplotypes (`reconstruct_haplotypes_fused`) | 122.1 | 143.6 | 0.85× |
+| annotated (`reconstruct_annotated_haplotypes_fused`) | 74.3 | 115.0 | 0.65× |
+
+> Fusion closed most of the prior ~2× gap: rust is now within ~10–17% of numba on the haplotype/track
+> paths. The **annotated** path (new this close-out, never previously timed) is the laggard at 0.65×
+> — it materializes 3× the data (haps bytes + var_idxs i32 + ref_coords i32). Recorded, not gated.
+
+#### Phase 3 throughput re-measurement after the zero-copy read-path optimization (2026-06-25)
+
+> Re-measured on branch `zero-copy-scale-safe-readpath` (format 2.0 SoA storage + zero-copy FFI guard +
+> sub-linear cache + uninit output buffers; optimization targets 1–3 above), corpus `chr22_geuv.gvl`
+> (migrated in place to 2.0 via `gvl.migrate`), `with_len(16384)`, BATCH=32, `NUMBA_NUM_THREADS=1`,
+> release build, Carter HPC (AMD EPYC 7543, linux-64).
+>
+> **De-noised harness (this measurement onward):** `_bench_indexing` now uses `benchmark.pedantic` with
+> `iterations=10, rounds=50` — each timed sample folds 10 `ds[r, s]` calls so per-batch OS-scheduler
+> jitter averages out (pedantic divides by `iterations`, so the figure stays per-batch). This collapsed
+> the tracks-only stddev from ~0.22 ms to ~0.08 ms and made the **min** (cleanest CPU-bound estimate)
+> reproducible to <1% across runs. Ratios below are **min rust ÷ min numba** (ms/batch).
+>
+> ⚠️ **Absolute batch/s are NOT comparable to the close-out table above** (different machine load).
+> Read the **ratio**. The earlier "tracks-only is noise-dominated" note was **wrong** — once de-noised,
+> the tracks-only gap is a stable, real ~0.63× regression (see target 5 below).
+
+| Mode | rust min (ms) | numba min (ms) | rust ÷ numba | batch/s (rust / numba) |
+|---|---|---|---|---|
+| tracks-only (`intervals_and_realign_track_fused`) | 1.70 | 1.07 | **0.63×** (rust slower) | 566 / 897 |
+| tracks (seqs + `read-depth`) | 3.40 | 3.25 | 0.95× | 275 / 286 |
+| haplotypes (`reconstruct_haplotypes_fused`) | 3.45 | 3.27 | 0.94× | 270 / 288 |
+| annotated (`reconstruct_annotated_haplotypes_fused`) | 5.34 | 9.00 | **1.68×** (rust faster) | 174 / 103 |
+
+> The zero-copy interval marshalling + uninit buffers made the **annotated** path (3× output data:
+> haps + var_idxs i32 + ref_coords i32) genuinely **faster than numba** (1.68×) — the close-out laggard
+> is now the clearest rust win. **tracks** and **haplotypes** sit at near-parity (0.94–0.95×). The
+> **tracks-only** path is the real remaining single-threaded deficit at **0.63×**: it is the cheapest
+> path (~1.1–1.7 ms) so the rust-side per-batch fixed cost (FFI marshalling + Python glue, no sequence
+> work to amortize it) dominates. Profiled for the next round of targets (5–7 below). Recorded, not
+> gated; rayon batch parallelism is deferred to Phase 5 — single-thread parity first.
+
+##### Optimization targets (py-spy `--native` on the rust `ds[r,s]`, 43k samples; copy trace on one batch)
+
+The fusion removed the duplicate FFI crossings the Phase 2 cProfile flagged. A per-batch trace of
+every *copying* `np.ascontiguousarray` (monkeypatched over one `ds[r, s]`) then localized what remains.
+The hottest self-time leaf (`_aligned_strided_to_contig_size4`, ~20%) is **not** static-array churn —
+it is the track-interval marshalling below.
+
+1. **✅ ADDRESSED (format 2.0; branch `zero-copy-scale-safe-readpath`, PR TBD).** Resolved via the chosen "struct-of-arrays on disk"
+   alternative: track intervals are now stored as three contiguous files `starts/ends/values.npy`
+   sharing `offsets.npy` (format `2.0.0`, gated open + `gvl.migrate`). The contiguous memmaps cross
+   the Python→Rust boundary zero-copy; the per-batch `np.ascontiguousarray` that materialized the
+   whole record store is replaced by `_ffi_array` (cross zero-copy or raise loudly). The genotype
+   "loaded gun" is hardened the same way (`_ffi_array` on `genotypes.data`). The scale-guard test
+   (`tests/integration/test_scale_guard.py`) locks the defect closed — it fails if any per-batch
+   `np.ascontiguousarray` materializes a sample-scale memmap on the read path. Original analysis below.
+
+   **⚠️ SCALABILITY DEFECT (rust-only; not in numba): the fused track path copies the entire
+   per-sample-scale interval store into RAM every batch.** Track intervals are stored as an
+   **array-of-structs** memmap — record dtype `{start: i4, end: i4, value: f4}`, itemsize 12 — so
+   `intervals.{starts,ends,values}.data` are **strided field views** (stride 12, non-contiguous).
+   `_reconstruct.py:241-250`'s fused-rust branch wraps each in `np.ascontiguousarray(..., i4/f4)`,
+   which **materializes the whole track's record store** (all regions × samples) into a contiguous
+   copy on **every** `ds[r, s]` (3 × 3.6 MB on the toy corpus; **GB-scale and OOM at the >1M-sample
+   target**). The **numba** branch (`_reconstruct.py:271-274`) passes the same strided views
+   **directly with no copy** — numba reads strided arrays natively — so this is a rust-path
+   regression, not a pre-existing cost. **Fix (zero-copy, non-breaking):** have the Rust kernel read
+   the contiguous `(N,)` record buffer directly (reinterpret the 12-byte records / take a
+   `&[IntervalRecord]`) and stride to `.start/.end/.value` itself, instead of demanding three
+   contiguous SoA arrays. Alternative: store intervals struct-of-arrays on disk (format change).
+   This is simultaneously the #1 perf cost (the 20% leaf) **and** a correctness blocker for scale.
+
+   - **Same loaded-gun pattern, currently benign: the genotype memmap.** The fused kernels also wrap
+     the full `genotypes.data`/`offsets` memmap in `np.ascontiguousarray`. Today that is a **no-op**
+     (the genotype store is contiguous `int32`/`int64`, so it stays mmap, zero copy) — but it is the
+     identical footgun: any future code path that yields a non-contiguous or mistyped genotype view
+     would silently copy the entire sample-scale store. **Harden:** drop `ascontiguousarray` on the
+     memmapped per-sample-scale args; rely on contiguous-by-construction storage and let the FFI
+     **reject** non-contiguous input loudly rather than silently materializing GBs.
+
+2. **✅ ADDRESSED (branch `zero-copy-scale-safe-readpath`, PR TBD).** The sub-linear per-variant/reference arrays (`v_starts` int32,
+   `ilens`, `alt.{data,offsets}`, `ref`, `ref_offsets`) are now computed once and cached on the
+   `Haps` reconstructor (`_HapsFfiStatic`, `Haps.ffi_static`), dropping the per-batch
+   `int64→int32` recast of `v_starts` and the other coercions. The genotype-memmap hardening from
+   target 1 (drop `ascontiguousarray`, reject loudly via `_ffi_array`) also shipped here. Original below.
+
+   **Per-batch re-cast of dataset-static per-variant arrays (cacheable; sub-linear in samples).**
+   `variants.start` is stored `int64` and re-cast to `int32` every batch (~0.59 MB × a few/batch here).
+   The per-variant / reference arrays (`v_starts`, `ilens`, `alt.{data,offsets}`, `reference`,
+   `ref_offsets`) grow only with the variant count (≲ a few billion germline variants even at 1M
+   samples → fits in ≥64 GB RAM), so these **may** be cached/typed **once** on the reconstructor —
+   unlike the per-sample-scale memmaps in (1), which must never be materialized. `reference.reference`
+   (50 MB) is already contiguous `u8`, so its `ascontiguousarray` is a verified no-op.
+
+3. **✅ ADDRESSED (branch `zero-copy-scale-safe-readpath`, PR TBD).** The fused kernels now allocate `out_data`/`annot_v`/`annot_pos` (and
+   the tracks scratch) via `uninit_output<T>` instead of `Array1::zeros`, dropping the memset. The
+   full-write proof holds: the reconstruct core writes every in-contract position, out-of-contract
+   inputs are already excluded from the parity oracle (overshoot/double-init guards), and
+   `intervals_to_tracks` does `out.fill(0.0)` as its first step so the scratch is full-write too.
+   Isolated in its own commit for independent revert. Original below.
+
+   **Output-buffer zeroing (`__memset_avx2` ~7.6%, 3 buffers on the annotated path).** The fused
+   kernels `Array1::zeros(total)` for `out_data` (+ `annot_v`, `annot_pos`). The core fully writes
+   every position for in-contract inputs, so an uninitialized allocation (`Array1::uninit` + a
+   full-write proof) drops the memset. Requires the trailing-fill coverage argument.
+
+4. **Per-call allocation churn (`brk`/`_int_malloc`/`malloc` ~6%)** and **`reverse_complement`
+   (~9% inclusive on the strand path, a numpy post-pass).** A reusable thread-local scratch pool
+   amortizes the former; folding strand RC into the kernel removes the latter. Lower priority than 1–3.
+
+> Target 1 is a correctness/scalability fix that should land **before** any >1M-sample run, independent
+> of the Phase 5 "one big `__getitem__` kernel" rewrite. Targets 2–4 are pure throughput and fold into
+> that rewrite. Peak RSS not re-measured (dominated by numba/llvmlite JIT ~3.2 GB, unchanged by fusion).
+
+##### Optimization targets — round 2 (post-format-2.0; profiled 2026-06-25 with `perf`, no `--native`)
+
+> **Profiling method (use this, not py-spy `--native`).** py-spy `--native` slows the deep-stack
+> haplotype paths ~10× (it stops the process to unwind native frames every sample) — it timed out at
+> even 3.5k batches. **`perf` on the Python process is the tool:** no sudo needed on Carter
+> (`perf_event_paranoid=2` permits user-space sampling of your own process; software event so no kernel
+> access), near-zero overhead (tracks-only ran at 552 vs 565 batch/s under perf), and it resolves the
+> `genvarloader.abi3.so` Rust symbols from the `.so` symbol table for a flat self-time profile:
+>
+>     NUMBA_NUM_THREADS=1 perf record -F 999 -o p.data -- .pixi/envs/dev/bin/python \
+>         tests/benchmarks/profiling/profile.py --mode <mode> --n-batches 12000
+>     perf report --stdio --no-children -i p.data        # flat self-time, Rust symbols resolved
+>
+> `profile.py` now has `--mode {haplotypes,annotated,tracks,tracks-seqs,variants,variant-windows}`. Run
+> 8–25k batches so steady-state drowns the one-time import/JIT (which py-spy/perf both sample). Flat
+> self-time pinpoints hot symbols without call graphs; for caller attribution add `debug =
+> "line-tables-only"` + frame pointers to a profiling cargo profile (Rust release has neither by
+> default), or use py-spy **without** `--native` for the Python-side inclusive tree. A separate
+> Rust-only criterion harness is only worth building if we want to micro-optimize a kernel in isolation
+> from FFI/Python — the in-process flat profile was conclusive for every target below.
+
+The de-noised benchmark (above) exposed a real **tracks-only 0.63×** deficit and showed **annotated is
+already 1.68×** (rust wins). Profiling each path the user cares about (tracks-only, haplotypes,
+variants/variant-windows) localized the remaining single-thread work:
+
+5. **✅ tracks-only 0.63× — per-interval `ndarray` slicing in `intervals::intervals_to_tracks`
+   (rust-specific, highest value).** `perf` self-time on the tracks-only path:
+   `intervals_to_tracks` 31% + `ndarray::slice_mut` **11%** + `ndarray::do_slice` **9.5%** ≈ **20.5%
+   spent in ndarray slice machinery**, from `out.slice_mut(s![a..b]).fill(value)` in the inner loop
+   (`src/intervals.rs:66`) and the `out.fill(0.0)` prelude. numba compiles `out[a:b] = value` to a
+   direct memset and pays none of this. **Fix:** hoist `out.as_slice_mut()` (the buffer is contiguous)
+   once and write `out_slice[a..b].fill(value)` / `out_slice.fill(0.0)` on the raw `&mut [f32]`,
+   dropping the per-interval `SliceInfo` construction + bounds-check. Expected to reclaim most of the
+   20% and close the tracks-only gap; also speeds the combined tracks path (shared kernel). This is the
+   single clearest path to **rust > numba single-threaded** on the cheapest read.
+
+   **✅ ADDRESSED (branch `opt/target-5-intervals-slice`, PR [#248](https://github.com/mcvickerlab/GenVarLoader/pull/248)).** Raw-slice form
+   landed (no `unsafe` needed): `out.as_slice_mut()` hoisted once before the interval loop,
+   inner-loop body rewritten to `out_slice[a..b].fill(value)` / `out_slice.fill(0.0)` on
+   `&mut [f32]`, dropping per-interval `SliceInfo` construction + bounds-check. Rust min
+   1.7112 ms → 1.1953 ms (~30% rust-side drop), tracks-only ratio 0.63× → 1.004×
+   (numba_min/rust_min).
+
+6. **✅ Strand reverse-complement post-pass (`reverse_complement_ragged` / `_flat.reverse_masked`) —
+   backend-agnostic, biggest throughput sink on the seq paths.** Self-time (py-spy, no `--native`):
+   **haplotypes ~19% self / ~28% inclusive**, **variants ~15% / ~16%**, **tracks-only ~10%**. Every
+   negative-strand region triggers a Python/numpy RC pass *after* reconstruction. numba pays it too, so
+   it is not the rust↔numba gap — but it is the largest single-thread throughput lever left and it must
+   go before parallelization (else we parallelize a numpy pass). **Fix:** fold strand RC into the Rust
+   reconstruct/track kernels — emit negative-strand regions already reverse-complemented (write the
+   output buffer back-to-front with complemented bytes), deleting the `reverse_complement_ragged` step
+   in `_query.py`. This is roadmap target 4's RC half, now quantified and promoted.
+   _PR: [#249](https://github.com/mcvickerlab/GenVarLoader/pull/249) → rust-migration_
+
+   **Implementation:** `src/reverse.rs` adds `rc_flat_rows_inplace` / `reverse_flat_rows_inplace`
+   primitives (COMP LUT, in-place on `&mut [u8]` / `&mut [f32]`). All five flat read-path kernels
+   (`get_reference`, `reconstruct_haplotypes_fused`, `intervals_and_realign_track_fused`,
+   `reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused`) accept
+   `to_rc: Option<ArrayView1<bool>>` and call the primitive in-kernel immediately after reconstruction
+   (correct ordering: RC after forward write + insertion fill). The Python layer computes the
+   per-element `to_rc` mask once per batch and routes it to the appropriate kernel; the
+   `reverse_complement_ragged` Python post-pass is **retained for numba** (parity oracle) and for the
+   two deferred kinds (`RaggedVariants` + `_FlatVariants`, targeted in Target 7). 958 tests pass on
+   both backends (byte-identical parity). Branch: `opt/target-6-kernel-rc`, Carter HPC
+   (AMD EPYC 7543, linux-64), HEAD `02497cf`.
+
+   **✅ Variant-allele RC folded (follow-up, 2026-06-25).** The two deferred kinds
+   (`RaggedVariants` + `_FlatVariants`) no longer route variant-allele RC through the
+   seqpro post-pass with per-batch ragged object churn; a gvl rust kernel
+   (`variants::rc_alleles_inplace`, FFI `rc_alleles`, dispatch `rc_alleles` default
+   rust) RCs the raw `_FlatAlleles` buffers in place, applied AFTER dummy-fill so
+   ordering stays byte-identical (custom non-palindromic dummy alleles covered). The
+   seqpro implementation is retained as the registered reference backend (parity + perf
+   gating; deletion is Phase 5). `_FlatVariantWindows` remains never-RC'd. Plan:
+   `docs/superpowers/plans/2026-06-25-rust-variant-rc-fold.md`.
+
+   **✅ rc_alleles_inplace fused (follow-up, 2026-06-26).** The #251
+   `variants::rc_alleles_inplace` kernel was not in the round-3 (#252) target list; this
+   pass fused its row→allele mask expansion and `rc_flat_rows_inplace` delegation into a
+   single pass via the shared `reverse::rc_row` helper, eliminating a per-call `Vec<bool>`
+   alloc+memset, an `Array1::from_vec` wrap, and a redundant full-allele rescan (`cargo asm`
+   confirms zero heap allocations and no `call rc_flat` remain). The per-function `cargo asm`
+   count *rose* 186→308 — not a regression but an inlining artifact: `rc_row` is `#[inline]`,
+   so its SIMD reverse+complement body now counts inside `rc_alleles_inplace`'s own asm
+   instead of behind a `call`, while per-call call-graph work (caller + callee body + heap
+   alloc, ~515 before) collapses to one inlined allocation-free pass. Gated on parity +
+   alloc/rescan removal + no throughput regression (this path fires only on negative-strand
+   variants / `RaggedVariants` reads — wall-clock noise-dominated, NOT round-3's
+   throughput-improvement gate): variants-path rust÷numba held 0.723→0.728 (same session,
+   both backends, within shared-node noise); `rc_flat_rows_inplace` asm unchanged after the
+   extract (283→283, label churn only). Byte-identical parity on both backends. Spec/plan:
+   `docs/superpowers/{specs/2026-06-26-rc-alleles-instruction-tuning-design,plans/2026-06-26-rc-alleles-instruction-tuning}.md`.
+
+   **Re-measured ratios (post-Target-6, 2026-06-25):**
+
+   > Harness: `tests/benchmarks/test_e2e.py` via pytest-benchmark, same `pedantic` config as the
+   > post-format-2.0 table above (iterations=10, rounds=50, warmup=5). Corpus `chr22_geuv.gvl`
+   > (165 regions: **82 negative-strand / 83 positive-strand** — 50% neg-strand; with_len(16384),
+   > BATCH=32), `NUMBA_NUM_THREADS=1`, release build, Carter HPC. Ratios are min rust ÷ min numba
+   > (ms/batch) expressed as batch/s ratio = numba_min_ms / rust_min_ms. Numba absolute times
+   > differ from the prior session (different HPC load); use the **ratio**, not the absolute.
 
-The numba bulk and the big read-path win.
+   | Mode | rust min (ms) | numba min (ms) | rust ÷ numba | Before T6 | Δ |
+   |---|---|---|---|---|---|
+   | tracks-only (`intervals_and_realign_track_fused`) | 1.1012 | 0.5386 | **0.49×** | 0.63× | −0.14 (note ①) |
+   | tracks-seqs (haplotypes + `read-depth`) | 1.7048 | 1.7039 | **1.00×** | 0.95× | +0.05 |
+   | haplotypes (`reconstruct_haplotypes_fused`) | 1.7149 | 1.7218 | **1.00×** | 0.94× | +0.06 |
+   | annotated (`reconstruct_annotated_haplotypes_fused`) | 6.1247 | 5.5100 | **0.90×** | 1.68× | −0.78 (note ②) |
 
-- [ ] Migrate `_dataset/_reconstruct.py` + `_dataset/_haps.py`.
-- [ ] Migrate `_dataset/_tracks.py` realign (6 numba) + `_dataset/_intervals.py` (4 numba).
-- [ ] Migrate `_dataset/_reference.py` (6 numba).
-- [ ] Migrate `_dataset/_insertion_fill.py` + `_dataset/_splice.py`.
+   **Notes:**
+   - ① tracks-only ratio **declined** (0.63→0.49×) — this is NOT a T6 regression in tracks throughput.
+     The tracks-only numba time dropped from the prior session's 1.07 ms to 0.54 ms without any numba
+     code change (different HPC load). Within-session the rust tracks-only path is still bounded by the
+     same ndarray slice machinery as before T6 (Target 5 is not yet merged into this branch); Target 6
+     adds `reverse_flat_rows_inplace` for the track pass, which fires for the 50% neg-strand rows.
+     Comparison across sessions is unreliable for the cheapest path (~1 ms); use the within-session ratio.
+   - ② annotated regression (1.68×→0.90×) is session noise: the prior 9.00 ms numba annotated time was
+     inflated (likely first-run JIT compilation not fully flushed by warmup_rounds=5; the annotated path
+     is rarely pre-warmed). The current 5.51 ms is the stable numba time. No T6 regression: the annotated
+     kernel only added `Option<bool[]>` argument with `None` fast path; the stable numba reference is now
+     5.51 ms vs rust 6.12 ms.
 
-**Gate:** parity + `Dataset.__getitem__` throughput vs baseline.
+   **Perf profile (rust haplotypes, 12k batches, 2026-06-25):**
 
-### Phase 4 — Write / update pipeline 🚧
-_PR: bigwig-streaming-write (TBD)_
+   > `perf record -F 999 ... profile.py --mode haplotypes --n-batches 12000`, Carter HPC. Top symbols
+   > by self-time (`perf report --stdio --no-children`):
+   >
+   > | % self | Symbol |
+   > |---|---|
+   > | 20.64% | `genvarloader::intervals::intervals_to_tracks` |
+   > | 15.44% | `ndarray::impl_methods::slice_mut` (Target 5, pending) |
+   > | **9.42%** | **`genvarloader::reverse::rc_flat_rows_inplace`** (in-kernel; was ~19% Python post-pass) |
+   > | 8.39% | `ndarray::dimension::do_slice` (Target 5, pending) |
+   > | 6.33% | `genvarloader::tracks::shift_and_realign_tracks_sparse` |
+   > | 3.48% | `_PyEval_EvalFrameDefault` |
+   > | 2.91% | `genvarloader::reconstruct::reconstruct_haplotypes_from_sparse` |
+   >
+   > **RC self-time result: `reverse_complement_ragged` / seqpro RC Python frame is GONE from the rust
+   > profile.** The in-kernel `rc_flat_rows_inplace` (9.42%) replaces the ~19% Python/numpy post-pass —
+   > roughly a 2× reduction in RC wall-time, moving from a cold Python FFI pass to a hot in-cache Rust
+   > loop. The ndarray slice machinery (15.44% + 8.39% ≈ 24%) remains the next highest-value target
+   > (Target 5, `opt/target-5-intervals-slice`, not yet merged into this branch).
 
-- [ ] Migrate `_dataset/_write.py`: variant normalization (left-align, bi-allelic,
-      atomize), genotype storage, interval extraction + realign.
-  - [x] bigWig interval extraction for the write path — single-pass streaming Rust writer (this PR)
-  - [x] Table + annot overlap: COITrees Rust engine replaces polars-bio (this PR)
-- [ ] Migrate remaining `_dataset/_utils.py` / `_flat_flanks.py` / `_variants/_sitesonly.py`
-      kernels touched by the write path.
+7. **✅ ADDRESSED (branch `opt/target-7-windows-rust-assembly`, [PR #250](https://github.com/mcvickerlab/GenVarLoader/pull/250) → `rust-migration`).** variant-windows — collapsed
+   per-batch object churn into one Rust call. `assemble_variant_buffers_{u8,i32}` assembles alt/ref
+   byte windows + flank tokens in one FFI crossing (`src/ffi/mod.rs`, cores in `src/variants/windows.rs`), replacing the
+   `_FlatWindow`/`FlatRagged`/scalar-field dataclass construction loop in `_flat_variants.py` /
+   `_flat_flanks.py`. GC self-time (`gc_collect_main` + `deduce_unreachable` + `visit_reachable` +
+   `dict_traverse`) dropped from **~14% → ~2.5%** of flat self-time; the profile top is now dominated
+   by the Rust kernels (`tokenize` 28%, `slice_flanks` 19%, `assemble_alt_window` 13%) and
+   `_PyEval_EvalFrameDefault` ~3.7%. variant-windows throughput: **rust 1.83× faster than numba**
+   (2.38 ms/batch vs 4.37 ms/batch; profile.py wall-clock, 2000 batches, `NUMBA_NUM_THREADS=1`,
+   HEAD `bd957b7`, Carter HPC AMD EPYC 7543, linux-64). Bare variants mode: rust **0.84×** of numba
+   (3.75 ms/batch vs 3.15 ms/batch) — slightly slower, within run-to-run noise on this shared node
+   (the path is dominated by `intervals_to_tracks` / `shift_and_realign_tracks_sparse` track work,
+   not the variant assembly itself, so this is expected noise not a regression).
 
-**Gate:** parity + `gvl.write()`/`update()` wall-clock + peak RSS vs baseline.
+> **Sequencing for follow-up PRs (updated 2026-06-25; round-3 status 2026-06-25):**
+> **(5) ✅ DONE** — instruction count reduced 480→283 in the round-3 instruction-level tuning pass;
+> `opt/round3-instruction-tuning`. **(6) ✅ DONE** — RC folded into rust kernels on
+> `opt/target-6-kernel-rc`; see measurements above;
+> PR [#249](https://github.com/mcvickerlab/GenVarLoader/pull/249). **(7) ✅ DONE** —
+> variants/variant-windows assembly collapsed into one rust call on
+> `opt/target-7-windows-rust-assembly`; see the Target 7 re-measurement below;
+> PR [#250](https://github.com/mcvickerlab/GenVarLoader/pull/250).
+> **Round-3 instruction-level pass ✅ DONE** — 7/7 kernels tuned, 0 reverted (see "round 3" subsection
+> below). Single-thread headroom is now maximized; remaining rust-vs-numba variance on the cheapest path
+> (tracks-only, ~1 ms) is node-noise on the shared HPC, not a code defect.
+> **Rayon batch parallelism (Phase 5) is the next lever.**
 
-### Phase 5 — Crate consolidation + thin-binding cleanup ⬜
+##### Target 7 re-measurement (2026-06-25, branch `opt/target-7-windows-rust-assembly`)
+
+> **Harness:** `tests/benchmarks/profiling/profile.py` wall-clock average (2000 batches, burn-in 5),
+> not pytest-benchmark pedantic min — `test_e2e_variants` is xfailed (pre-existing `_FlatVariants.to_fixed`
+> gap) so no pedantic-min is available for the variants paths. `NUMBA_NUM_THREADS=1`, release build
+> (`maturin develop --release`), HEAD `bd957b7`, `chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples),
+> Carter HPC (AMD EPYC 7543, linux-64).
+
+| Mode | rust (ms/batch) | numba (ms/batch) | rust ÷ numba | note |
+|---|---|---|---|---|
+| variant-windows | 2.38 | 4.37 | **1.83×** (rust faster) | assembly collapsed to one Rust call |
+| variants (bare alleles) | 3.75 | 3.15 | 0.84× (within noise) | dominated by track work, not variant assembly |
+
+> variant-windows is now the **clearest rust win in isolation**: 1.83× over numba, GC share ~2.5% vs ~14% baseline.
+> The bare-variants path is noise-level (the reconstruction cost is track/haplotype work, not the variant
+> gather kernels). Full tree 967 passed / 21 skipped / 4 xfailed on both backends (HEAD `bd957b7`);
+> byte-identical parity confirmed via `assemble_variant_buffers` mode-matrix + live-path spy.
+
+> **perf flat self-time (variant-windows, rust, 12000 batches):**
+> top leaves: `tokenize` 28.3%, `slice_flanks` 19.2%, `assemble_alt_window` 13.1%, `_PyEval_EvalFrameDefault`
+> 3.7%, GC total 2.5% (`gc_collect_main` 1.0% + `deduce_unreachable` 0.6% + `visit_reachable` 0.5% +
+> `dict_traverse` 0.4%). Profile is now Rust-kernel-dominated with negligible GC overhead.
+
+##### ✅ Optimization targets — round 3 (instruction-level, profiled 2026-06-25)
+
+> Branch: `opt/round3-instruction-tuning` ([PR #252](https://github.com/mcvickerlab/GenVarLoader/pull/252) → `rust-migration`). Tooling: `cargo asm --lib` (cargo-show-asm).
+> Starting ratios from the Task-3 profiling baseline captured 2026-06-25 (full table in
+> `docs/roadmaps/round3-profile-baseline.md`): tracks-only **0.97×**, haplotypes **0.70×**,
+> variants **0.80×**, variant-windows **0.56×**. Rust was already at parity or faster on all 4 paths;
+> tracks-only (0.97×) was within session noise of 1.0×. These are floors to improve, not ceilings.
+>
+> Targets ranked by aggregate self-time (sum across all paths); full aggregate table in the baseline doc.
+> Top 8 aggregate targets: `intervals_to_tracks` (60.3%), `windows::tokenize` (28.1%),
+> `shift_and_realign_tracks_sparse` (25.7%), `windows::slice_flanks` (20.1%),
+> `windows::assemble_alt_window` (13.3%), `rc_flat_rows_inplace` (9.3%),
+> `ffi::intervals_and_realign_track_fused` (9.0%), `reconstruct_haplotypes_from_sparse` (4.5%).
+> `reverse_flat_rows_inplace` was **SKIPPED** (negligible self-time in the Task-3 profile).
+> `ffi::intervals_and_realign_track_fused` was **not a direct target** — its overhead belongs to the
+> kernels it wraps (`intervals_to_tracks` and `shift_and_realign_tracks_sparse`).
+
+**Per-kernel results (7/7 kept; 0 reverted):**
+
+> Instr before→after: total instruction count from `cargo asm --lib` for the hot function body.
+> rust÷numba before→after: wall-clock ratio measured in the *same session* as the before count
+> (cross-session comparisons are unreliable on this shared HPC node — see node-noise caveat below).
+> **Note on `rc_flat_rows_inplace`**: instruction count *rose* 212→283 because the scalar byte loop was
+> replaced by an SSE2-vectorized COMP LUT loop — the vector expansion adds instructions but halves
+> actual operations. That IS the win; the per-kernel ratio confirms it (0.664→0.635).
+> **Note on llvm-mca**: the planned llvm-mca cycles column is omitted because llvm-mca was not
+> available in the build environment this round; the deterministic instruction-count reductions and
+> the same-session wall-clock rust÷numba ratios are the recorded evidence in its place.
+
+| Kernel | instr before→after | rust÷numba before→after (same-session) | result |
+|---|---|---|---|
+| `intervals_to_tracks` | 480→283 | 0.628→0.624 | kept |
+| `windows::tokenize` | 16→4 /elem (hot) | 0.55→0.43 | kept |
+| `shift_and_realign_tracks_sparse` | 3 `do_slice`→0 | 1.178→1.179 (held) | kept |
+| `windows::slice_flanks` | push→memcpy | 0.446→0.239 | kept |
+| `windows::assemble_alt_window` | 3 push→memcpy | 0.306→0.223 | kept |
+| `reverse::rc_flat_rows_inplace` | 212→283 (vectorized SSE2) | 0.664→0.635 | kept |
+| `reconstruct_haplotypes_from_sparse` | 2839→1279 | 0.655→0.589 | kept |
+
+**Final four-path ratios (re-measured 2026-06-26 in one back-to-back session; HEAD `fe18c4f`):**
+
+> ⚠️ **Node-noise caveat**: the Carter HPC node is shared and load varies; absolute ms/batch drifts
+> ≥2× across sessions. The per-kernel before→after ratios above are each within-session; the four-path
+> summary below is a single consistent back-to-back session but is NOT directly comparable to the per-kernel
+> table (different session, different load). **The durable signal is the deterministic instruction-count
+> reductions (table above) + byte-identical parity on both backends. Use the four-path summary only for
+> order-of-magnitude guidance.**
+>
+> Harness: tracks-only and haplotypes via `pytest-benchmark` pedantic min (iterations=10, rounds=50,
+> warmup=5). Variants and variant-windows via `profile.py` wall-clock average (2000 batches, burn-in 5).
+> `NUMBA_NUM_THREADS=1`, `maturin develop --release`, corpus `chr22_geuv.gvl` (format 2.0,
+> 165 regions × 5 samples), Carter HPC (AMD EPYC 7543, linux-64).
+
+| Path | rust (ms/batch) | numba (ms/batch) | rust ÷ numba |
+|---|---|---|---|
+| tracks-only (pedantic min) | 1.232 | 1.040 | 1.18× (node-noise: cheapest path, cf. per-kernel 0.624×) |
+| haplotypes (pedantic min) | 2.029 | 3.439 | **0.59×** (rust 1.7× faster) |
+| variants (wall avg) | 3.292 | 4.290 | **0.77×** (rust 1.3× faster) |
+| variant-windows (wall avg) | 1.220 | 5.616 | **0.22×** (rust 4.6× faster) |
+
+> **Summary:** 7/7 targets kept, 0 reverted. All byte-identical parity on both backends (full tree
+> gate). No `unsafe` added this round — all wins via safe Rust idioms: `as_slice_mut` + `&mut [T]`
+> indexing (slice-hoist), `extend_from_slice` (memcpy expansion), iterator idioms, and one
+> branchless-arithmetic complement that autovectorizes to SSE2. `reverse_flat_rows_inplace` was SKIPPED
+> (negligible self-time). The ffi fused trampoline (8.97% aggregate) was not a direct target.
+> **Rayon batch parallelism (Phase 5) is the next lever.**
+
+### Phase 4 — Write / update pipeline ✅
+_PR: [#253](https://github.com/mcvickerlab/GenVarLoader/pull/253)_
+
+The default `gvl.write()` / `gvl.update()` path is fully Rust-backed; the write path is numba-free.
+
+- [x] bigWig interval extraction — single-pass streaming Rust writer (SoA `starts/ends/values.npy`).
+- [x] Table + annot overlap — COITrees Rust engine.
+- [x] Deleted the dead `_write_track_legacy` + `splits_sum_le_value` (the last write-path numba),
+      reachable only via custom `IntervalTrack` types (none exist; `IntervalTrack` is unexported).
+      Unsupported track types now raise `TypeError`.
+- **Variant normalization (left-align, bi-allelic, atomize) is NOT GVL work** — it is a user
+  precondition (`bcftools norm` / `plink2 --normalize`); the write path only validates/rejects
+  non-conforming records. Struck from Phase 4 scope.
+- **Genotype storage / variant IO (genoray `dense2sparse`) deferred to Phase 6 (absorb genoray).**
+
+**Gate (parity — MET):** write-path parity = the landed differential tests (bigWig byte-identical;
+Table COITrees numpy-oracle + property). Full tree green on both backends.
+
+**Gate (throughput/RSS — Carter re-baseline, chr22_geuv):**
+
+| Op | corpus | wall-clock | peak RSS |
+|---|---|---|---|
+| `gvl.write()` (PGEN variants + BigWigs track) | chr22_geuv (5 samples × 165 e-gene regions, chr22) | 1.934 s | 3.520 GB |
+| `gvl.update()` (add per-sample BigWigs track) | chr22_geuv | 0.081 s | 3.519 GB ¹ |
+
+> Carter HPC (AMD EPYC 7543, linux-64), `NUMBA_NUM_THREADS=1`, release build, HEAD `32132c9`. The
+> write path is already Rust-only (Python/numba orchestration deleted at landing), so there is no
+> live numba A/B; these are the canonical Phase 4 numbers. The old 1.143 s / 3.593 GB write figure
+> was macOS / 1kg-VCF and is **not comparable**.
+>
+> ¹ The `gvl.update()` peak RSS (3.519 GB) is a whole-process figure: the measurement driver builds
+> the base dataset (untimed `gvl.write`) then runs the timed `gvl.update` in the **same process**,
+> so the memray process-peak is dominated by the base-dataset write (≈ the write() peak above). Only
+> the update wall-clock (0.081 s) is isolated to `gvl.update`; its marginal RSS is not measured by
+> this driver.
+
+### Phase 5 — Crate consolidation + thin-binding cleanup ✅
 _PR: —_
 
-- [ ] Collapse the PyO3 surface so Python is a true shim (indexing sugar, torch,
+- [x] Collapse the PyO3 surface so Python is a true shim (indexing sugar, torch,
       validation/error messages only).
-- [ ] Delete all remaining core numba kernels (target: count = 0).
-- [ ] Confirm the crate is fully cargo-testable standalone.
+      > W6 audit verdict (2026-06-27): **shim is already thin — bucket-2 is empty**.
+      > All per-batch Python steps are indexing sugar, FFI typing guards, or Python-side
+      > RNG; the five fused kernels each cross the FFI boundary exactly once.
+      > The single-big-kernel collapse is not warranted as Phase 5 work.
+      > Full audit: `docs/roadmaps/phase-5-w6-thin-shim-audit.md`
+- [x] Delete all remaining core numba kernels (target: count = 0). ✅ W5
+- [x] Confirm the crate is fully cargo-testable standalone.
+      > **Verified 2026-06-27 (Task 2, branch `phase-5-w6-wrapup`):** plain `cargo test --release`
+      > from the repo root (no pixi, no `PYO3_PYTHON`, no env vars) passes on the first attempt —
+      > already-standalone case. Pass count: **114 passed (3 suites)**. Canonical invocation:
+      > `cargo test --release`
+      > No `Cargo.toml` / `.cargo/config.toml` edits were needed or made.
+
+**Checkpoint:** ✅ core numba kernel count = 0; cargo-testable standalone confirmed; seqpro-core 0.1.0 on crates.io confirmed; full perf re-baseline recorded here. Full gate (2026-06-27): whole-tree pytest 973 passed / 44 skipped / 5 xfailed (parity+dataset+unit subset: 692/35/2 — matches W5 baseline exactly); cargo 114 passed; ruff/format/pyrefly/clippy clean (warnings only, 0 errors); abi3 wheel builds. Phase 5 marker set ✅.
+
+**Optimization track (re-filed, not a Phase 5 blocker):** the Task-1 thin-shim audit noted two micro-opt opportunities that did not qualify as Phase 5 shim collapse (bucket-2 is empty): (a) `_as_starts_stops` helper in `_reconstruct.py` allocates a small tuple each call and could be cached; (b) `GVL_NUM_THREADS` env-var parsing is re-read each batch and could be cached on the reconstructor. Both are sub-millisecond amortized-cost items. They are tracked here as a future optimization pass (not gating the Phase 5 ✅ verdict).
 
-**Checkpoint:** core numba kernel count = 0; full perf re-baseline recorded here.
+#### W6 perf re-baseline: rayon serial-vs-multithread speedup + RSS (2026-06-27)
+
+> Full methodology, per-mode tables, and conclusions: [`docs/roadmaps/phase-5-w6-perf-rebaseline.md`](phase-5-w6-perf-rebaseline.md)
+>
+> HEAD `0968a0f`, corpus `chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples, BATCH=32,
+> SEQLEN=16384), Carter HPC (Intel Xeon E5-4650 v3, 96 CPUs, linux-64), `maturin develop --release`.
+>
+> **Key finding — threshold gate held serial on this corpus:** the `should_parallelize` gate
+> (`_MIN_BYTES_PER_THREAD = 1 MiB`, threshold = `GVL_NUM_THREADS × 1 MiB`) never fired for
+> any mode at N≥4. Batch output is ~1–3 MiB vs. N × 1 MiB threshold (borderline at N=2; well below at N≥4). All
+> modes ran serial; the thread sweep (1/2/4/8/all-96) shows ratios within 0.95–1.10× of the
+> serial baseline — pure node noise. This is correct behavior, not a failure.
+>
+> **Speedup curve (serial÷parallel; all within node noise ~±10%):**
+>
+> | Mode | T=2 | T=4 | T=8 | T=all (96) |
+> |------|----:|----:|----:|----------:|
+> | tracks-only (pedantic min) | 1.10× | 1.04× | 1.04× | 1.10× |
+> | tracks/haplotypes (pedantic min) | 1.06× | 1.03× | 1.06× | 1.06× |
+> | annotated (pedantic min) | 1.09× | 1.06× | 0.95× | 1.09× |
+> | variants (wall avg) | 0.98× | 1.03× | 1.02× | 1.01× |
+> | variant-windows (wall avg) | 1.01× | 0.98× | 0.99× | 1.00× |
+>
+> **Peak RSS (serial vs parallel/unset):** 3.525 GB in all cases — 0 gvl-attributable delta.
+> Floor is seqpro transitive JIT (~3.2 GB), unchanged by thread count (serial path throughout).
+>
+> **Rayon correctness:** `serial == parallel == frozen golden` for all kernels (W5 parity gate,
+> `test_rayon_equivalence.py`). The threshold gate is the only reason rayon was not exercised
+> here; production-scale batches (SEQLEN≥131072 or BATCH≥256) will cross it.
+>
+> **Numba A/B unavailable** (deleted in W5). Final single-thread rust-vs-numba figures in
+> [`docs/roadmaps/phase-5-w4-final-ab.md`](phase-5-w4-final-ab.md): rust parity-or-better
+> on every mode (tracks-only 1.07×, haplotypes/tracks-seqs 1.66×, annotated 1.43×, variants
+> 1.38×, variant-windows 4.58×).
 
 ### Phase 6 — Absorb genoray (future) ⬜
 _PR: —_
@@ -266,6 +792,311 @@ narrowed to genoray (variant IO) only.
 
 ## Notes & decisions log
 
+- 2026-06-27 (Phase 5 W6 — wrap-up: thin-shim audit + cargo-standalone + seqpro-core + perf re-baseline; branch `phase-5-w6-wrapup`):
+  Four parallel threads closed Phase 5:
+  **(A) Thin-shim audit (Task 1, commit `0932374`):** Classified every Python step over the
+  PyO3 FFI surface. **Verdict: shim is already thin — bucket-2 (collapsible glue) is empty.**
+  33 registered FFI entries, 5 fused `__getitem__` kernels; `_dispatch.py` absent; zero numba
+  imports in `python/genvarloader/`. The single-big-kernel collapse is not warranted as Phase 5
+  work. Full audit: `docs/roadmaps/phase-5-w6-thin-shim-audit.md`.
+  **(B) cargo-testable standalone (Task 2, commit `ac052f7`):** `cargo test --release` from the
+  repo root (no pixi, no `PYO3_PYTHON`, no env vars) passes on the first attempt — already
+  standalone. 114 passed (3 suites). No `Cargo.toml` / `.cargo/config.toml` edits needed.
+  **(C) seqpro-core 0.1.0 on crates.io (Task 3, commit `0968a0f`):** Confirmed
+  `seqpro-core = "0.1"` resolves from `registry+https://github.com/rust-lang/crates.io-index`
+  (checksum in `Cargo.lock`); no path-dep or `[patch]` override. Stale Phase 1 note corrected.
+  **(D) W6 perf re-baseline (Task 4, commits `6611540` + `e47d128`):** Rayon serial-vs-multithread
+  speedup curve recorded. Key finding: the `should_parallelize` threshold gate (`_MIN_BYTES_PER_THREAD = 1 MiB`)
+  held serial on the test corpus for all 6 modes — all runs serial, thread-sweep ratios within node
+  noise (~±10%). This is correct behavior (batch output ~1–3 MiB; threshold = N × 1 MiB; production
+  batches with SEQLEN≥131072 or BATCH≥256 will cross it). No engaged-parallelism speedup captured
+  here; real rust-vs-numba speedup evidence is in `docs/roadmaps/phase-5-w4-final-ab.md` (rust
+  parity-or-better on all modes). Peak RSS 3.525 GB in all cases (floor = seqpro JIT ~3.2 GB).
+  **(Gate):** Whole-tree pytest 973 passed / 44 skipped / 5 xfailed (parity+dataset+unit 692/35/2 —
+  matches W5 baseline exactly); cargo 114 passed; ruff/format/pyrefly/clippy clean (0 errors);
+  abi3 wheel builds. **Phase 5 marker set ✅.** The `rust-migration → master` merge is left to the
+  maintainer (no-squash per project policy).
+  Two micro-opt items from the Task-1 audit (`_as_starts_stops` tuple alloc, `GVL_NUM_THREADS`
+  re-read per batch) re-filed as a future optimization-track entry (not Phase 5 blockers; see
+  "Optimization track" note in the Phase 5 section).
+
+- 2026-06-26 (Phase 5 W2 — #242 stale landmine comments corrected + max_jitter>0 parity gate; branch `phase-5-w2`):
+  Investigation (`.superpowers/sdd/w2-investigation.md`) confirmed that #242 was already
+  root-caused and fully fixed end-to-end: both ``intervals_to_tracks`` kernels (Rust and
+  numba) apply the left-clip ``s = max(itv.start - query_start, 0); e = min(end, length)``
+  merged via PR #244 (ancestor of ``rust-migration``); #242 is CLOSED. The clip is
+  functionally correct — the stored jitter-expanded write window always fully covers any
+  jittered query of the original region length, so the clip never truncates real signal.
+  The upstream coordinate rewrite (storing intervals at ``chromStart`` rather than
+  ``chromStart - max_jitter``) was intentionally SKIPPED: the clip is the correct fix, not
+  a mask over a remaining defect. W2 added the end-to-end max_jitter>0 numba-vs-rust
+  dataset parity test with a hand-computed oracle
+  (``test_tracks_max_jitter_intervals_parity_and_oracle``, Task 1, commit ``5d3aa7d``).
+  W2 also corrected three stale "PanicException landmine" / "violates the contract" comment
+  blocks in ``tests/parity/_fixtures.py`` (``build_haps_tracks_dataset`` and
+  ``build_strand_mixed_dataset`` docstrings + inline comment) and
+  ``tests/parity/test_dataset_parity.py``
+  (``test_tracks_realign_getitem_identical_across_backends`` fixture-geometry note): the
+  accurate framing is that #242 is fixed and ``max_jitter=0`` in those fixtures is retained
+  only for the simplest deterministic geometry, not because of any live panic. Phase 5 🚧
+  (W3–W9 remain).
+
+- 2026-06-26 (Phase 5 W1 — trailing-fill overshoot fix + parity gate; branch `phase-5-w1`):
+  Fixed the trailing-fill overshoot divergence in **all four kernels** that advance `ref_idx`
+  past the contig end (deletion whose `v_ref_end > contig_len`):
+  (1) **Rust haplotype kernel** (`src/reconstruct/mod.rs`): when `writable_ref <= 0` the old
+  code set `out_end_idx = (out_idx + writable_ref).max(0)` which could be `< out_idx`, causing
+  the right-pad `out[out_end_idx..length]` to silently overwrite already-written positions.
+  Fixed by clamping to `out_end_idx = out_idx` — the whole unfilled tail `out[out_idx..length]`
+  is now padded, never less.
+  (2) **Numba haplotype kernel** (`python/genvarloader/_dataset/_genotypes.py`): replaced
+  `writable_ref = min(unfilled_length, len(ref) - ref_idx)` (could be negative) with
+  `writable_ref = max(0, min(unfilled_length, len(ref) - ref_idx))` so `out_end_idx` is
+  never below `out_idx`.
+  (3) **Rust track kernel** (`src/tracks/mod.rs`): same overshoot family — when
+  `writable_ref <= 0` the else-branch now clamps to `out_idx` (mirrors the haplotype fix).
+  (4) **Numba track kernel** (`python/genvarloader/_dataset/_tracks.py`): same `max(0, ...)`
+  guard on `writable_ref`.
+  Both kernels now write byte-identically across the full input domain including the
+  overshoot sub-domain. **Parity gates updated:** Guards 1–3 removed from
+  `tests/parity/test_reconstruct_haplotypes_parity.py` (overshoot pre-check,
+  `try/except SystemError`, double-init sentinel), and the `SystemError` guard removed from
+  `tests/parity/test_shift_and_realign_tracks_parity.py`. These sub-domains are now
+  first-class parity-covered inputs.
+  **Note:** the `pixi run -e dev pytest` command does NOT auto-rebuild the Rust extension;
+  `maturin develop --release` must be run explicitly before testing Rust changes (else the old
+  binary runs and tests fail on the pre-fix behavior — caught and fixed during this W1 run).
+  Full tree gate (rust backend): 993 passed, 12 skipped, 5 xfailed, 0 failed.
+  Subset gate on `tests/dataset tests/unit tests/parity` — rust: 709/6/2, numba: 709/6/2
+  (identical profiles, parity confirmed). Cargo: 114 passed. Lint/format/typecheck clean
+  (one branch-introduced test file reformatted by ruff). Phase 5 🚧 (W1 done; W2–W9 remain).
+  Issue tracking the overshoot: #255.
+
+
+- 2026-06-27 (Phase 5 W6 — thin-shim audit; branch `phase-5-w6-wrapup`):
+  Audited the Python layer over the PyO3 FFI surface to determine whether collapsible
+  glue remains. **Verdict: shim is already thin — bucket-2 is empty.** All per-batch
+  Python steps classify as Bucket 1 (indexing sugar, FFI typing guards, Python-side RNG,
+  output format massaging) or Bucket 3 (one FFI crossing via a fused kernel). The
+  dispatch layer (`_dispatch.py`) is confirmed absent; zero numba imports in
+  `python/genvarloader/`. FFI surface: 33 registered entries, 5 fused `__getitem__`
+  kernels. The Phase 3 optimization targets (`_ffi_array` zero-copy guard,
+  `_HapsFfiStatic` caching, uninit buffers) are all implemented. The single-big-kernel
+  collapse is not warranted as Phase 5 work — the five fused kernels already express
+  one FFI crossing per reconstruction path. Full audit:
+  `docs/roadmaps/phase-5-w6-thin-shim-audit.md`. Phase 5 🚧 (W1–W6 done; W7–W9 remain).
+
+- 2026-06-27 (Phase 5 W5 — consolidation PR: snapshot + delete numba + rayon; branch `phase-5-w5`, PR #260):
+  The consolidation PR, one branch with three staged commit boundaries.
+  **Stage A — golden snapshot (DONE):** froze the ~21 numba-oracle parity suites to committed
+  `.npz` goldens (deterministic seeded-sample draws; the generator cross-checks `numba == rust`
+  before saving). All parity tests were rewritten to assert `rust == frozen golden`, importing the
+  rust callables directly via `tests/parity/_golden.py::RUST_KERNELS` (never the dispatch layer), so
+  Stage B's deletion never touches the tests. Regen driver: `tests/parity/generate_goldens.py`.
+  **Stage B — delete numba (DONE):**
+  Deleted all `@nb.njit` / `@nb.vectorize` decorated functions from
+  `python/genvarloader/`. Twelve source modules touched:
+  `_threads.py`, `__init__.py`, `_ragged.py`, `_flat.py`,
+  `_dataset/_flat_variants.py`, `_dataset/_genotypes.py`,
+  `_dataset/_reference.py`, `_dataset/_utils.py`, `_dataset/_intervals.py`,
+  `_dataset/_tracks.py`, `_dataset/_flat_flanks.py`, `_variants/_sitesonly.py`.
+  Key changes:
+  - `cap_numba_threads()` → `cap_threads()` (seeds RAYON_NUM_THREADS; seeds numba
+    pool via optional import for backward test compat).
+  - `_flat_variants.py`: replaced 5 numba dispatch fallbacks
+    (`_gather_rows`, `_compact_keep`, `_fill_empty_scalar`, `_fill_empty_seq`,
+    `_fill_empty_fixed`) with dtype-preserving numpy equivalents for issue #231
+    (custom FORMAT fields with non-i32/f32 dtypes).
+  - `_genotypes.py`: deleted `_get_diffs_sparse_numba`,
+    `_reconstruct_haplotypes_from_sparse_numba`, `_choose_exonic_variants_numba`;
+    kept `reconstruct_haplotype_from_sparse` as plain Python (used by parity tests).
+  - `_tracks.py`: deleted `_xorshift64`, `_hash4`, `_apply_insertion_fill`,
+    `shift_and_realign_tracks_sparse`, `shift_and_realign_track_sparse` (numba);
+    restored all as plain Python for parity test compat.
+  - `_reference.py`: deleted `_get_reference_row/_par/_ser/_numba`; restored
+    `_get_reference_row/_ser/_par` as plain Python (tested directly).
+  - `_intervals.py`: deleted `_intervals_to_tracks_numba`, `_tracks_to_intervals_numba`,
+    `_scanned_mask`, `_compact_mask`; restored `intervals_to_tracks` dispatch wrapper.
+  `grep -r 'import numba|@nb.njit|nb.prange' python/genvarloader/` = 0 matches.
+  CAVEAT (seqpro transitive numba): `import genvarloader` still pulls numba+llvmlite
+  via seqpro 0.20.0 (eager numba import in seqpro/_numba.py + transforms/tmm.py).
+  genvarloader's OWN code is numba-free. **W5's numba-removal scope is gvl-only by
+  design** (user decision 2026-06-27): removing numba from seqpro (`ML4GLand/SeqPro`)
+  is explicitly OUT OF SCOPE, so the transitive numba dependency remains intentionally.
+  B4's import-guard asserts genvarloader's own modules are numba-free (own-code source
+  scan). The ~3.2 GB JIT-RSS that the seqpro JIT baseline contributes is therefore not
+  recovered by this migration; the W6 perf re-baseline measures the gvl-attributable
+  deltas (rayon multi-thread speedup, gvl-own kernel costs), not the seqpro JIT floor.
+  **Stage C — rayon batch parallelism (DONE):** added a `parallel: bool` gate to every read
+  kernel, threaded through the FFI entries and Python callers (each computes
+  `should_parallelize(total_out_bytes)` from `_threads.py`). The parallel branch carves disjoint
+  per-work-item `&mut [_]` slices via the `split_at_mut` cursor idiom (mirrors the pre-existing
+  `get_reference`), then dispatches with `into_par_iter()`; **never a raw `*mut` in a rayon
+  closure** (not `Send`). The serial branch is the byte-identity reference. Kernels parallelized:
+  C1 `reconstruct_haplotypes_from_sparse` (out + optional annot_v_idxs/annot_ref_pos);
+  C2 `shift_and_realign_tracks_sparse`, `tracks_to_intervals` (two-pass — each pass parallel,
+  cumsum kept sequential), `intervals_and_realign_track_fused`;
+  C3 `get_diffs_sparse`, `intervals_to_tracks` (`get_reference` was already parallel).
+  Gated `serial == parallel == frozen golden` for all cases via
+  `tests/parity/test_rayon_equivalence.py` (one case set per kernel, both branches).
+  Also (C4) skipped the 3 obsolete `tests/benchmarks/test_micro.py` micro-benchmarks whose
+  Python-level capture points were fused away in W3/W5 (`reconstruct_haplotypes_from_sparse`,
+  `intervals_to_tracks`, `shift_and_realign_tracks_sparse`) — micro-benchmark redesign onto the
+  fused rust entries is deferred to W6; `test_get_diffs_sparse` + the e2e benchmarks still run.
+  Full test tree gate (controller-verified, fresh `maturin develop --release`):
+  parity+dataset+unit = 692 passed, 35 skipped, 2 xfailed; whole `pytest tests` green
+  (benchmarks 7 passed / 3 skipped / 1 xfailed); cargo test --release 114; ruff + format +
+  pyrefly + clippy clean.
+  Phase 5 stays 🚧 (W1–W5 done; W6–W9 remain — W6/PR6 is measure-and-merge: re-baseline perf,
+  capture the multi-thread rayon speedup + the gvl-attributable RSS deltas, then merge.
+  The seqpro JIT-RSS floor is out of scope — see the seqpro caveat above).
+
+- 2026-06-26 (Phase 5 W4 — final single-thread numba-vs-rust `__getitem__` A/B; branch `phase-5-w4`, PR #259):
+  Benchmark-only gate (no code) before the W5 consolidation. Measured rust AND numba **single-thread, same
+  back-to-back session, two passes** (the shared Carter node makes cross-session wall-clock unreliable; the
+  durable signal is byte-identical parity + same-session improve-or-hold — see [[gvl-rust-perf-gate-shared-node-noise]]).
+  Two tools agreed: `test_e2e.py` pedantic-min and `profile.py` steady-state throughput. **Result — rust is
+  parity-or-better on every mode** (speedup = numba÷rust, higher ⇒ rust faster): haplotypes ~1.65×, tracks-seqs
+  ~1.65×, annotated ~1.4×, variants ~1.4×, variant-windows ~4.6×; the pure tracks-only path ~1.05× (effectively
+  parity — fixed per-batch IO cost, not kernel-bound; rust never behind). Combined with byte-identical parity
+  (W1–W3 + full parity suite, both backends), there is no single-thread regression risk in removing numba.
+  **GATE PASSED → proceed to W5 consolidation** (golden-snapshot the numba-oracle parity suites, delete numba,
+  add rayon batch parallelism gated byte-identical to the serial golden result). Full tables + methodology:
+  `docs/roadmaps/phase-5-w4-final-ab.md`. Phase 5 🚧 (W1–W5 done; W6–W9 remain).
+
+- 2026-06-26 (Phase 5 W3 — annotated+spliced fusion; branch `phase-5-w3`, PR #258):
+  Fused the fourth and final reconstruction combination — annotated+spliced haplotypes — via
+  `reconstruct_annotated_haplotypes_spliced_fused` (new kernel in `src/reconstruct/mod.rs`).
+  One FFI crossing total: RC is folded in-kernel (bytes reverse-complemented via the existing
+  COMP LUT; both annotation arrays reversed in-place), eliminating the prior three-kernel
+  dispatch sequence (`reconstruct_haplotypes_spliced_fused` → `rc_flat_rows_inplace` →
+  `reverse_flat_rows_inplace × 2`). All four reconstruction combinations now cross the FFI
+  boundary exactly once on the rust backend: (1) plain haps via `reconstruct_haplotypes_fused`,
+  (2) annotated haps via `reconstruct_annotated_haplotypes_fused`, (3) spliced haps via
+  `reconstruct_haplotypes_spliced_fused`, (4) annotated+spliced haps via
+  `reconstruct_annotated_haplotypes_spliced_fused`. Byte-identical to the composed numba oracle;
+  parity gate: `tests/parity/test_annotated_spliced_haplotypes_parity.py`. Numba remains the
+  oracle (deletion deferred to W5/W6). Phase 5 🚧 (W1, W3 done; W2, W4–W9 remain).
+
+- 2026-06-26 (Phase 4 close-out; branch `phase-4-close-out`, PR [#253](https://github.com/mcvickerlab/GenVarLoader/pull/253)): Investigation found the
+  default write/update path already fully Rust-backed (bigWig streaming writer + COITrees table;
+  variant IO via genoray). The roadmap's "variant normalization" bullet was a mischaracterization —
+  GVL never normalizes (it is a bcftools/plink2 user precondition); genotype storage is genoray
+  (→ Phase 6). Deleted the only remaining write-path numba (`splits_sum_le_value` + the dead
+  `_write_track_legacy`; unsupported `IntervalTrack` types now `TypeError`). Captured canonical
+  Carter chr22_geuv write/update wall-clock + peak RSS (no live numba A/B — orchestration was
+  deleted at landing). Full tree green both backends; cargo + lint/format/typecheck clean; abi3
+  builds. Phase 4 ✅.
+
+- 2026-06-25 (round-3 instruction-level kernel tuning; branch `opt/round3-instruction-tuning`, [PR #252](https://github.com/mcvickerlab/GenVarLoader/pull/252)):
+  Instruction-count pass over 7 hot kernels identified by the Task-3 `perf` flat-profile (full
+  aggregate table in `docs/roadmaps/round3-profile-baseline.md`). Tooling: `cargo asm --lib`
+  (cargo-show-asm). Gate: wall-clock throughput — instruction-count and llvm-mca cycle deltas used
+  as evidence to support / reject each change; reverted if throughput did not confirm. Unsafe: **NONE
+  added this round** — all wins via safe Rust idioms: `as_slice_mut` + `&mut [T]` slice-hoist
+  (`intervals_to_tracks`, `shift_and_realign_tracks_sparse`), `extend_from_slice` memcpy expansion
+  (`slice_flanks`, `assemble_alt_window`), iterator idioms (`tokenize`, `reconstruct_haplotypes_from_sparse`),
+  and one branchless-arithmetic complement that autovectorizes to SSE2 (`rc_flat_rows_inplace`; scalar
+  loop → COMP LUT; instr count rose 212→283 but operations halved — that IS the win). The `rc` kernel
+  added an exhaustive 256-byte arith-vs-COMP parity-lock test in the cargo suite. Wall-clock ratios
+  are node-noise-limited on this shared HPC node (same metric drifted ≥2× across sessions); the durable
+  signal is deterministic instruction-count reductions + byte-identical parity on both backends.
+  `reverse_flat_rows_inplace` skipped (negligible self-time). `ffi::intervals_and_realign_track_fused`
+  not a direct target (overhead belongs to the kernels it wraps). 7/7 targets kept, 0 reverted.
+  Full tree gate (rust): 985 passed, 12 skipped, 5 xfailed (all pre-existing), 2 transient HPC-load
+  failures (cross-process multiprocessing tests, pass in isolation — same pattern as Phase 3 close-out).
+  Full tree gate (numba): 986 passed, 12 skipped, 5 xfailed (all pre-existing), 1 transient HPC-load
+  failure (same multiprocessing sensitivity). Same pass/xfail profile on both backends confirms
+  byte-identical parity. Cargo: 109 passed. Lint/format/typecheck clean. abi3 wheel builds.
+  Rayon batch parallelism (Phase 5) is the next lever.
+
+- 2026-06-25 (zero-copy scale-safe read path; branch `zero-copy-scale-safe-readpath`, PR TBD): Addressed
+  Phase 3 optimization targets 1–3. **Breaking on-disk change** — track-interval storage converted from
+  array-of-structs (`intervals.npy`, `INTERVAL_DTYPE` itemsize 12, strided field views) to struct-of-arrays
+  (`starts/ends/values.npy` sharing `offsets.npy`), across all four writers (Python single-chunk + chunked,
+  Rust bigwig + table) and the reader; `DATASET_FORMAT_VERSION` bumped `1.0.0`→`2.0.0`. Added an open-time
+  version gate and `gvl.migrate(path)` (streaming, idempotent, crash-safe in-place AoS→SoA; new public
+  symbol in `__all__`). Replaced the per-batch `np.ascontiguousarray` on per-sample-scale interval/genotype
+  memmaps with `_ffi_array` (cross zero-copy or raise loudly); locked closed by `tests/integration/test_scale_guard.py`.
+  Cached the sub-linear per-variant/reference arrays once on `Haps` (`_HapsFfiStatic`). Dropped the zero-init
+  of fully-overwritten fused output buffers (`uninit_output<T>`), isolated for independent revert. Byte-identical
+  parity held on both backends; throughput re-measured (rust at/near numba parity on the heavy tracks/annotated/haps
+  paths — see re-measurement block). The pre-built `chr22_geuv.gvl` bench corpus was migrated in place to 2.0.
+
+- 2026-06-25 (Phase 3 close-out): Merged origin/main (#242 `intervals_to_tracks` clip fix via PR #244;
+  SpliceIndexer subset double-apply fix via PR #243) into the branch — the fused tracks kernel inherits
+  the clip fix (shared `intervals::intervals_to_tracks` core). Lifted ~10 obsolete #242 xfails +
+  #242-domain `assume(False)` guards → real passing max_jitter>0 coverage. Rerouted `Reference.fetch`
+  through the dispatched rust `get_reference`; deleted the three zero-caller `_fetch_*` numba functions.
+  Fused the annotated-haps (`reconstruct_annotated_haplotypes_fused`) and spliced-haps
+  (`reconstruct_haplotypes_spliced_fused`) read paths — both byte-identical to the composed numba oracle.
+  The annotated+spliced intersection is now fused via `reconstruct_annotated_haplotypes_spliced_fused` (Phase 5 W3): one FFI crossing, RC folded in-kernel (bytes reverse-complemented, both annotation arrays reversed), byte-identical to the composed numba oracle, covered by `tests/parity/test_annotated_spliced_haplotypes_parity.py`.
+  Bumped seqpro 0.18→0.20.0 with `to_numpy(validate=False)` at guaranteed-uniform read-path sites.
+  Full tree green on both backends: rust 932 passed, 12 skipped, 5 xfailed, 0 failed; numba 932 passed,
+  12 skipped, 5 xfailed, 0 failed; cargo 88 passed. Remaining xfails (5): `test_e2e_variants`
+  (pre-existing, `_FlatVariants.to_fixed` missing); `test_haps_property` (2 tests, #199/#200
+  pre-existing); `test_indexing::test_parse_idx[missing]` (pre-existing); `test_ref_ds::test_getitem[no_regions]`
+  (pre-existing). Lint/format/typecheck clean; abi3 wheel builds (2 parity test files reformatted by ruff).
+
+- 2026-06-24 (Phase 3 — reconstruction + track realignment, parity-verified): Ported 8 kernel
+  groups to Rust: `padded_slice` (pure cargo, Task 1), `get_reference` (Task 2), spliced-reference
+  backstop (Task 3), `reconstruct_haplotype_from_sparse` singular (Task 4),
+  `reconstruct_haplotypes_from_sparse` batch (Task 5), haplotypes-mode backstop (Task 6),
+  `xorshift64`/`hash4` PRNG (Task 7), `apply_insertion_fill` (4 strategies: Repeat5p,
+  Repeat5pNormalized, Constant, FlankSample — Task 8), `shift_and_realign_tracks_sparse` (Task 9),
+  `tracks_to_intervals` RLE (Task 10), tracks-mode backstop (Task 11). Fusion seams (Tasks 12–14):
+  `reconstruct_haplotypes_fused` collapses 2 FFI crossings to 1 on the plain non-splice haps path
+  (annotated + splice remain unfused); `intervals_and_realign_track_fused` chains
+  `intervals_to_tracks` → `shift_and_realign_tracks_sparse` in 1 crossing per track. Decisions:
+  (1) **Serial-only / rayon-deferred** — batch drivers serial (disjoint per-(query,hap) slices;
+  rayon deferred to Phase 5 optimization pass per no-per-phase-perf-gate policy). (2) **Interpolate
+  strict byte-identity held** — Lagrange arithmetic in f64 matching numba's `np.float64` xs/ys
+  arrays; no numba fallback needed for Interpolate (contrary to an early design note). (3) **#242
+  intervals_to_tracks contract bug** — `debug_assert!(itv.start >= query_start)` panics in debug
+  builds when stored intervals start before the query (max_jitter>0 datasets); root cause: gvl
+  stores intervals at `chromStart - max_jitter` but queries use `chromStart + jitter`. Filed as
+  mcvickerlab/GenVarLoader#242; fix deferred (correct oracle needed for both backends). Parity
+  fixtures use max_jitter=0 datasets; tests using `get_dummy_dataset()` (max_jitter=2) with float
+  tracks on the rust backend fail identically with the pre-existing Phase 0 `intervals_to_tracks`
+  kernel (pre-Phase-3). (4) **`tests/benchmarks/conftest.py` updated** — `captured_haplotypes`
+  fixture now forces `GVL_BACKEND=numba` to capture `reconstruct_haplotypes_from_sparse` args
+  (the rust path now calls `reconstruct_haplotypes_fused`; the micro-benchmark measures the
+  individual dispatch entry, not the fused one). (5) **Env note** — dataset tests require
+  `--basetemp=$(pwd)/.pytest_tmp` (os.link cross-device Errno 18 on HPC; same as Phase 2).
+  **Gate (parity — MET, final-review fixes applied):** 85 cargo tests + 909 pytest passed + 15 xfailed
+  + 0 failed (rust; plus 12 skipped, 1 transient error); lint/format/typecheck clean; abi3 wheel builds.
+  All 11 pre-existing failures converted to xfail(strict=False): 10 x #242 debug_assert panic
+  (itv.start<query_start; tests using get_dummy_dataset() max_jitter=2 with float tracks — xfailed in
+  test_output_bytes_per_instance.py, test_dummy_dataset_insertion_fill.py, test_flat_intervals.py,
+  test_realign_tracks.py, test_seqs_tracks.py) + 1 test_e2e_variants (_FlatVariants.to_fixed missing,
+  pre-Phase-2). Reconstruct parity tests hardened with overshoot pre-check + double-init guard to exclude
+  the numba-bug sub-domain where a deletion drives ref_idx past the contig end (numba and Rust diverge
+  on negative out_end_idx handling; both behaviors are undefined per the production contract). The
+  tracks parity test is sufficient with just the existing SystemError guard (the tracks trailing-fill
+  case does not manifest divergence — see task-15-report.md final-review section). 1 transient error
+  (test_micro.py::test_shift_and_realign_tracks_sparse, resource contention; passes in isolation).
+  **Gate (throughput — recorded, not gated):** see Phase 3 measurement block above.
+
+- 2026-06-24 (Phase 2 — genotype assembly + variant gather, parity-verified): Ported the
+  live assembly/selection kernels `get_diffs_sparse` + `choose_exonic_variants`
+  (`src/genotypes/`) and the 7 flat variant-gather/fill kernels (`src/variants/`):
+  `gather_rows` (unifies `_gather_v_idxs` + `_gather_v_idxs_ss` via `(2,n)` offset
+  normalization), `gather_alleles`, `compact_keep`, `fill_empty_scalar`,
+  `fill_empty_fixed`, `fill_empty_seq`. Deleted dead `filter_af` (+ its dead unit test).
+  Decisions: (1) **dtype-correctness over the plan** — the flat kernels also carry float32
+  dosage and arbitrary-dtype custom FORMAT fields (#231, e.g. int16), so they dispatch by
+  dtype to `*_i32`/`*_f32` rust cores with a dtype-preserving **numba fallback** for all
+  other dtypes; a naive int32-only port (caught + fixed mid-Phase-2) silently truncated
+  float dosage. Generic rust cores use `Vec<T>`/`from_vec` (no `num_traits` dep).
+  (2) **Gate reframed to parity-only** on a persistent `rust-migration` branch (see
+  "Branch & gate strategy") — measured rust is a stable ~7% slower than numba, but cProfile
+  pins the cost on per-kernel Python dispatch glue (`np.ascontiguousarray` = 62% of the
+  variants loop), not rust compute; throughput is restored by a later "single big
+  `__getitem__` kernel" optimization pass, not by gating Phase 2. (3) `OFFSET_TYPE`/genoray
+  `V_IDX_TYPE`=int32, `DOSAGE_TYPE`=float32 confirmed at runtime. Env note: dataset tests
+  need pytest's tmp on the same filesystem as `tests/data` (`--basetemp=<repo>/.pytest_tmp`)
+  or the GVL write path's `os.link` hardlink fails cross-device (Errno 18) — environmental,
+  not a code defect.
 - 2026-06-18: Roadmap created. Decisions: standalone crate + thin PyO3 binding;
   bottom-up starting from ragged primitives; strangler-fig with byte-identical parity
   gate; perf gates = write wall-clock+RSS and getitem throughput; seqpro/genoray in scope
@@ -341,7 +1172,8 @@ narrowed to genoray (variant IO) only.
   Rust (seqpro rag layer now numba-free). Bumped seqpro's pymodule to pyo3 0.28 /
   numpy 0.28 / ndarray 0.17 (hygiene; NOT required for the link — two pymodules
   with different pyo3 versions coexist; the single-version rule is per-cdylib, and
-  the shared core is pyo3-free). GVL links seqpro-core via a path dep (editable;
-  flip to git/release before shipping) and routes its `to_padded` chokepoint
+  the shared core is pyo3-free). GVL links seqpro-core via the crates.io registry
+  dep (`seqpro-core 0.1.0`, verified in `Cargo.lock`; no path dep or `[patch]`
+  override — shipping prerequisite already satisfied) and routes its `to_padded` chokepoint
   through the shared kernel (proof-point, byte-identical parity). Inverts Phase 6
   (seqpro stays the substrate). PRs: seqpro ML4GLand/SeqPro#60, GVL mcvickerlab/GenVarLoader#240.
diff --git a/docs/superpowers/plans/2026-06-24-phase-3-closeout.md b/docs/superpowers/plans/2026-06-24-phase-3-closeout.md
new file mode 100644
index 00000000..4b52920a
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-24-phase-3-closeout.md
@@ -0,0 +1,678 @@
+# Phase 3 Close-out Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Bring `phase-3-reconstruction` to an honest, fully-rust-default state — merge the bug fixes that landed on `main` during Phase 3, lift the now-obsolete #242 test exclusions, port the one genuinely-missing kernel (`Reference.fetch`), fuse the annotated/splice haps read paths, bump seqpro to 0.20.0, and reconcile the roadmap.
+
+**Architecture:** GVL is a Python/Rust hybrid. Hot kernels live in `src/` (pure `ndarray` cores in domain modules, PyO3 wrappers in `src/ffi/mod.rs`), exposed to Python and routed through a backend-dispatch registry (`python/genvarloader/_dispatch.py`) where each kernel registers a `numba` parity reference and a `rust` impl with `default="rust"`. The migration contract is **byte-identical parity** between backends, gated by `@pytest.mark.parity` suites that flip `GVL_BACKEND`. This plan adds two fused kernels (reuse existing cores), reroutes one path through an existing kernel, and merges upstream fixes.
+
+**Tech Stack:** Rust (`ndarray`, `rayon`, PyO3 0.28, `numpy` 0.28, `seqpro-core` 0.1.0), Python 3.10–3.13, numba (parity refs only), pytest + hypothesis, maturin, pixi.
+
+## Global Constraints
+
+- **No public API change.** Nothing in `python/genvarloader/__init__.py` `__all__`, `gvl.write`, `Dataset.open`, or `Dataset.with_*` signatures changes. (Per CLAUDE.md, a public-API change would also require a `skills/genvarloader/SKILL.md` update — not expected here.)
+- **Byte-identical parity** is the landing gate for every new/rerouted kernel — verified across `GVL_BACKEND=rust` and `GVL_BACKEND=numba`.
+- **Do NOT delete numba parity references** (Phase 5 owns that). Exception: code with *zero callers* may be deleted (precedent: `filter_af`, `splits_sum_le_value`).
+- **No new perf gate.** Phase 3 is parity-gated; throughput is recorded only.
+- **seqpro version floor:** `pixi.toml` pin `==0.20.0`; `pyproject.toml` floor `>=0.20`.
+- **Merge style:** merge commit, never squash (preserve history).
+- **HPC test env:** dataset tests require `--basetemp=$(pwd)/.pytest_tmp` on Carter (os.link cross-device Errno 18).
+- **Commands run under pixi:** `pixi run -e dev <task>`. Build the Rust ext with `pixi run -e dev maturin develop --release` (or the project's `develop` task) after Rust changes.
+- **Lint/format/typecheck scope:** `ruff check python/ tests/`, `ruff format python/ tests/`, `pixi run -e dev typecheck`.
+- **RTK:** prefix shell commands with `rtk` (e.g. `rtk git commit`).
+
+---
+
+## File-touch map
+
+| File | Responsibility | Tasks |
+|---|---|---|
+| (git merge) `python/genvarloader/_dataset/_intervals.py` | resolve #242 clip-fix vs Phase 3 conflict | 1 |
+| `tests/dataset/test_flat_intervals.py`, `test_seqs_tracks.py`, `test_realign_tracks.py`; `tests/unit/dataset/test_output_bytes_per_instance.py`; `tests/integration/dataset/test_dummy_dataset_insertion_fill.py` | drop `_REASON_242` xfails | 2 |
+| `tests/parity/test_reconstruct_haplotypes_parity.py`, `test_shift_and_realign_tracks_parity.py` | drop #242-domain `assume(False)` guards (keep trailing-under-write guard) | 2 |
+| `python/genvarloader/_dataset/_reference.py` | reroute `Reference.fetch` through dispatched `get_reference`; retire dead `_fetch_*` | 3 |
+| `tests/parity/test_reference_fetch_parity.py` (new) | fetch parity backstop | 3 |
+| `src/ffi/mod.rs` | add `reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused` | 4, 5 |
+| `src/lib.rs` | register the two new pyfunctions | 4, 5 |
+| `python/genvarloader/_dataset/_haps.py` | route annotated/splice branches to the fused entries | 4, 5 |
+| `python/genvarloader/genvarloader.pyi` | stub the new pyfunctions | 4, 5 |
+| `tests/parity/test_haplotypes_dataset_parity.py` | move annotated spy to fused entry; add splice fixture coverage | 4, 5 |
+| `pixi.toml`, `pyproject.toml` | seqpro 0.20 bump | 6 |
+| (read-path materialization sites, TBD by inventory) | `to_numpy(validate=False)` adoption | 6 |
+| `docs/roadmaps/rust-migration.md` | honesty pass | 7 |
+
+---
+
+## Task 1: Merge `origin/main` into the branch
+
+**Files:**
+- Modify (conflict): `python/genvarloader/_dataset/_intervals.py`
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: branch containing #242 clip fix (`src/intervals.rs` `intervals_to_tracks` left-clamp) + #243 SpliceIndexer fix. The fused tracks kernel `intervals_and_realign_track_fused` inherits the clip fix automatically (it calls `intervals::intervals_to_tracks`).
+
+- [ ] **Step 1: Confirm fetch is current and review the incoming fixes**
+
+```bash
+rtk git fetch origin
+rtk proxy git log --oneline HEAD..origin/main
+```
+Expected: the 9 commits incl. `fe83436 fix(intervals): clip sub-query interval starts` and `d814965 fix(indexing): SpliceIndexer.parse_idx double-applies sample-subset map`.
+
+- [ ] **Step 2: Start the merge**
+
+```bash
+rtk git merge origin/main --no-edit
+```
+Expected: conflict in `python/genvarloader/_dataset/_intervals.py` (others auto-merge). If it reports more conflicts, resolve each by keeping BOTH main's fix and Phase 3's additions.
+
+- [ ] **Step 3: Resolve `_intervals.py`**
+
+Open the file. The conflict is between main's clip logic (clamp `itv.start` up to `query_start` in `_intervals_to_tracks_numba`) and Phase 3's additions (the registered `intervals_to_tracks` dispatcher block, +45 lines). Keep main's clamp inside the numba kernel AND Phase 3's dispatch registration. Verify no `<<<<<<<`/`=======`/`>>>>>>>` markers remain:
+
+```bash
+rtk proxy grep -n "<<<<<<<\|=======\|>>>>>>>" python/genvarloader/_dataset/_intervals.py
+```
+Expected: no output.
+
+- [ ] **Step 4: Build and smoke-check**
+
+```bash
+rtk git add python/genvarloader/_dataset/_intervals.py
+pixi run -e dev maturin develop --release 2>&1 | tail -5
+```
+Expected: build succeeds (`src/intervals.rs` carries the clip fix; clean Rust merge).
+
+- [ ] **Step 5: Run the #242 kernel test from main + the intervals parity test (still xfailed at this point)**
+
+```bash
+pixi run -e dev pytest tests/unit/dataset/test_intervals_kernel.py tests/parity -k intervals -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (this is the test PR #244 added to lock the clip fix).
+
+- [ ] **Step 6: Complete the merge commit**
+
+```bash
+rtk git commit --no-edit
+```
+Expected: merge commit recorded (no squash).
+
+---
+
+## Task 2: Lift the now-obsolete #242 test exclusions
+
+**Files:**
+- Modify: `tests/dataset/test_flat_intervals.py`, `tests/dataset/test_seqs_tracks.py`, `tests/dataset/test_realign_tracks.py`
+- Modify: `tests/unit/dataset/test_output_bytes_per_instance.py`
+- Modify: `tests/integration/dataset/test_dummy_dataset_insertion_fill.py`
+- Modify: `tests/parity/test_reconstruct_haplotypes_parity.py`, `tests/parity/test_shift_and_realign_tracks_parity.py`
+
+**Interfaces:**
+- Consumes: Task 1's merged #242 fix.
+- Produces: the `max_jitter>0` interval domain is now real, passing coverage (no xfail).
+
+- [ ] **Step 1: Confirm these tests now PASS as xpass (fix is in)**
+
+```bash
+pixi run -e dev pytest tests/dataset/test_realign_tracks.py tests/dataset/test_seqs_tracks.py tests/dataset/test_flat_intervals.py tests/unit/dataset/test_output_bytes_per_instance.py tests/integration/dataset/test_dummy_dataset_insertion_fill.py -q --basetemp=$(pwd)/.pytest_tmp -rX
+```
+Expected: the `_REASON_242`-marked tests report **XPASS** (they pass despite the xfail marker) — proof the fix resolves them. If any still genuinely FAIL, STOP and investigate (the clip fix did not cover that case — that is a real signal, do not re-xfail).
+
+- [ ] **Step 2: Remove the `xfail` markers + `_REASON_242` constants**
+
+In each of the 5 test files, delete the `_REASON_242 = (...)` constant and every `@pytest.mark.xfail(strict=False, reason=_REASON_242)` decorator that references it. Leave the test bodies unchanged. Example diff shape (apply per occurrence):
+
+```python
+# DELETE these lines:
+_REASON_242 = (
+    "mcvickerlab/GenVarLoader#242 — intervals_to_tracks itv.start<query_start "
+    "..."
+)
+...
+@pytest.mark.xfail(strict=False, reason=_REASON_242)   # DELETE this decorator
+def test_something(...):
+    ...
+```
+
+Verify none remain:
+```bash
+rtk proxy grep -rn "_REASON_242" tests/
+```
+Expected: no output.
+
+- [ ] **Step 3: Remove ONLY the #242-domain `assume(False)` guards in parity tests**
+
+In `tests/parity/test_shift_and_realign_tracks_parity.py` and `tests/parity/test_reconstruct_haplotypes_parity.py`, remove the `assume(False)` branches whose comments tie them to the `itv.start < query_start` / `start>=clen` / #242 family. **KEEP** the *reconstruct trailing-under-write* overshoot pre-check + double-init guard (that excludes a genuine numba-undefined domain, not #242). Read each `assume(False)` site's comment before deleting — when in doubt, keep it.
+
+- [ ] **Step 4: Run the full affected set on BOTH backends**
+
+```bash
+GVL_BACKEND=rust pixi run -e dev pytest tests/dataset tests/unit/dataset tests/integration/dataset tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests/dataset tests/unit/dataset tests/integration/dataset tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: all PASS, 0 xfail from `_REASON_242`. (Numba may still legitimately skip the trailing-under-write domain via the retained guard.)
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add tests/
+rtk git commit -m "test(parity): lift obsolete #242 xfails after main clip-fix merge
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 3: Reroute `Reference.fetch` through the dispatched rust `get_reference`
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_reference.py:119-183`
+- Create: `tests/parity/test_reference_fetch_parity.py`
+
+**Interfaces:**
+- Consumes: existing `get_reference(regions, out_offsets, reference, ref_offsets, pad_char)` dispatcher (`_reference.py:743`, `default="rust"`), which packs `regions[i] = (contig_idx, start, end)` and calls the rust `reference::get_reference` core (same `padded_slice` row op as `_fetch_row`).
+- Produces: `Reference.fetch` runs rust by default; numba `_fetch_impl_*` become zero-caller dead code.
+
+- [ ] **Step 1: Write the failing parity test**
+
+Create `tests/parity/test_reference_fetch_parity.py`:
+
+```python
+"""Parity backstop for Reference.fetch (rerouted through dispatched get_reference).
+
+fetch builds regions=(contig_idx, start, end) and out_offsets, then calls the
+same get_reference core used by the main reference read path. This test flips
+GVL_BACKEND and asserts byte-identical fetched sequence across backends, with a
+spy proving the rust get_reference kernel is actually invoked.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader._dataset._reference as _ref_mod
+import genvarloader._dispatch as _dispatch
+
+pytestmark = pytest.mark.parity
+
+
+def test_reference_fetch_parity(reference, monkeypatch):
+    ref = _ref_mod.Reference.from_path_and_contigs(reference, None) \
+        if hasattr(_ref_mod.Reference, "from_path_and_contigs") \
+        else _ref_mod.Reference.from_path(reference)
+    contigs = ref.contigs[:1]
+    starts = np.array([0], dtype=np.int64)
+    ends = np.array([50], dtype=np.int64)
+
+    numba_fn, rust_fn = _dispatch.backends("get_reference")
+    calls = {"n": 0}
+
+    def _spy(*a, **k):
+        calls["n"] += 1
+        return rust_fn(*a, **k)
+
+    orig = dict(_dispatch._REGISTRY["get_reference"])
+    _dispatch.register("get_reference", numba=numba_fn, rust=_spy, default="numba")
+    try:
+        monkeypatch.setenv("GVL_BACKEND", "rust")
+        out_rust = ref.fetch(contigs, starts, ends)
+        rust_calls = calls["n"]
+        monkeypatch.setenv("GVL_BACKEND", "numba")
+        out_numba = ref.fetch(contigs, starts, ends)
+        assert calls["n"] == rust_calls, "rust spy fired during numba read"
+    finally:
+        _dispatch._REGISTRY["get_reference"] = orig
+
+    assert rust_calls > 0, "rust get_reference never invoked via fetch — vacuous"
+    np.testing.assert_array_equal(
+        np.asarray(out_numba.data), np.asarray(out_rust.data)
+    )
+    np.testing.assert_array_equal(
+        np.asarray(out_numba.offsets, np.int64),
+        np.asarray(out_rust.offsets, np.int64),
+    )
+```
+
+> Note: adapt the `Reference` construction line to the actual constructor in `_reference.py` (check `Reference.from_path*`/`__init__` and the `reference` fixture in `tests/conftest.py` before running — replace the `hasattr` shim with the real call).
+
+- [ ] **Step 2: Run it to confirm it fails (fetch still bypasses get_reference)**
+
+```bash
+pixi run -e dev pytest tests/parity/test_reference_fetch_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: FAIL — `rust get_reference never invoked via fetch` (fetch currently calls `_fetch_impl_*` directly).
+
+- [ ] **Step 3: Reroute `Reference.fetch`**
+
+In `_reference.py`, replace the kernel-selection block inside `fetch` (currently lines 135-148) with a call to the dispatched `get_reference`, assembling a `(n,3)` regions array:
+
+```python
+        lengths = ends - starts
+        offsets = lengths_to_offsets(lengths)
+        regions = np.stack(
+            [
+                np.asarray(c_idxs, np.int32),
+                np.asarray(starts, np.int32),
+                np.asarray(ends, np.int32),
+            ],
+            axis=1,
+        )
+        seqs = get_reference(
+            regions, offsets, self.reference, self.offsets, int(self.pad_char)
+        )
+        seqs = Ragged.from_offsets(seqs.view("S1"), (len(contigs), None), offsets)
+        return seqs
+```
+
+(`get_reference` is defined later in the same module; it is module-level, so the forward reference resolves at call time.)
+
+- [ ] **Step 4: Delete the now-dead `_fetch_row`/`_fetch_impl_par`/`_fetch_impl_ser`**
+
+Confirm zero callers, then remove all three numba functions (`_reference.py:155-183`):
+```bash
+rtk proxy grep -rn "_fetch_impl_par\|_fetch_impl_ser\|_fetch_row" python/ tests/
+```
+Expected after edit: no production/test references (only the definitions, which you then delete). This is zero-caller dead-code removal (allowed by the Global Constraints exception).
+
+- [ ] **Step 5: Build + run the parity test**
+
+```bash
+pixi run -e dev maturin develop --release 2>&1 | tail -3
+pixi run -e dev pytest tests/parity/test_reference_fetch_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS.
+
+- [ ] **Step 6: Run the spliced-ref + flat-flanks paths that use fetch**
+
+```bash
+pixi run -e dev pytest tests/ -k "splice or flank or ref" -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (RefDataset spliced path + `_flat_flanks.py` now use rust via get_reference).
+
+- [ ] **Step 7: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_reference.py tests/parity/test_reference_fetch_parity.py
+rtk git commit -m "perf(reference): route Reference.fetch through rust get_reference; drop dead _fetch_* numba
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 4: Fuse the annotated-haps path
+
+**Files:**
+- Modify: `src/ffi/mod.rs` (add `reconstruct_annotated_haplotypes_fused`)
+- Modify: `src/lib.rs` (register pyfunction)
+- Modify: `python/genvarloader/_dataset/_haps.py:884-...` (route annotated non-splice branch)
+- Modify: `python/genvarloader/genvarloader.pyi` (stub)
+- Modify: `tests/parity/test_haplotypes_dataset_parity.py` (move annotated spy to fused entry)
+
+**Interfaces:**
+- Consumes: `reconstruct::reconstruct_haplotypes_from_sparse` core, which **already accepts `annot_v_idxs`/`annot_ref_pos`** (`src/ffi/mod.rs:474-475` currently passes `None`). Also `genotypes::get_diffs_sparse` (for output-length computation).
+- Produces (exact signature, mirrors `reconstruct_haplotypes_fused` but returns 3 arrays):
+  ```rust
+  pub fn reconstruct_annotated_haplotypes_fused<'py>(
+      py: Python<'py>,
+      regions: PyReadonlyArray2<i32>, shifts: PyReadonlyArray2<i32>,
+      geno_offset_idx: PyReadonlyArray2<i64>, geno_offsets: PyReadonlyArray2<i64>,
+      geno_v_idxs: PyReadonlyArray1<i32>, v_starts: PyReadonlyArray1<i32>,
+      ilens: PyReadonlyArray1<i32>, alt_alleles: PyReadonlyArray1<u8>,
+      alt_offsets: PyReadonlyArray1<i64>, ref_: PyReadonlyArray1<u8>,
+      ref_offsets: PyReadonlyArray1<i64>, pad_char: u8, output_length: i64,
+      keep: Option<PyReadonlyArray1<bool>>, keep_offsets: Option<PyReadonlyArray1<i64>>,
+  ) -> (Bound<'py, PyArray1<u8>>, Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>)
+  ```
+  Returns `(out_data, annot_v_idxs_data, annot_ref_pos_data, out_offsets)` — actually return 4 arrays: bytes, var_idxs (i32), ref_coords (i32), offsets (i64). The Python wrapper builds three Ragged from the shared offsets.
+
+- [ ] **Step 1: Add the failing parity assertion (update existing annotated test to spy the fused entry)**
+
+In `tests/parity/test_haplotypes_dataset_parity.py::test_annotated_haplotypes_mode_dataset_parity`, change the spy from the dispatched `reconstruct_haplotypes_from_sparse` to the new module-level fused entry, mirroring `test_haplotypes_mode_dataset_parity` (which spies `_haps_mod.reconstruct_haplotypes_fused`):
+
+```python
+    import genvarloader._dataset._haps as _haps_mod
+    orig_fused = _haps_mod.reconstruct_annotated_haplotypes_fused
+    calls = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(
+        _haps_mod, "reconstruct_annotated_haplotypes_fused", _spy_fused
+    )
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = ds[:, :]
+    rust_call_count = calls["n"]
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = ds[:, :]
+    assert calls["n"] == rust_call_count, "fused spy fired during numba read"
+    assert calls["n"] > 0, "rust annotated fused entry never invoked — vacuous"
+```
+Keep the existing three-array byte-identical comparison (`_compare_ragged_bytes` + two `_compare_ragged_int`).
+
+- [ ] **Step 2: Run it to confirm it fails**
+
+```bash
+pixi run -e dev pytest tests/parity/test_haplotypes_dataset_parity.py::test_annotated_haplotypes_mode_dataset_parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: FAIL — `AttributeError: ... has no attribute 'reconstruct_annotated_haplotypes_fused'`.
+
+- [ ] **Step 3: Implement the rust fused kernel**
+
+In `src/ffi/mod.rs`, add `reconstruct_annotated_haplotypes_fused` by copying `reconstruct_haplotypes_fused` (lines 373-480) and making exactly these changes:
+1. Add the 4-array return type (bytes, i32 var_idxs, i32 ref_coords, i64 offsets).
+2. After allocating `out_data`, also allocate `let mut annot_v: Array1<i32> = Array1::zeros(total);` and `let mut annot_pos: Array1<i32> = Array1::zeros(total);`.
+3. In the `reconstruct::reconstruct_haplotypes_from_sparse(...)` call, replace the two trailing `None,  // annot_*` args with `Some(annot_v.view_mut()), Some(annot_pos.view_mut())` (match the core's expected `Option<ArrayViewMut1<i32>>` param types — check `src/reconstruct/mod.rs:282` signature and adapt).
+4. Return `(out_data.into_pyarray(py), annot_v.into_pyarray(py), annot_pos.into_pyarray(py), out_offsets_vec.into_pyarray(py))`.
+
+- [ ] **Step 4: Register the pyfunction**
+
+In `src/lib.rs` after line 38 (`reconstruct_haplotypes_fused`):
+```rust
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_annotated_haplotypes_fused, m)?)?;
+```
+
+- [ ] **Step 5: Add the `.pyi` stub**
+
+In `python/genvarloader/genvarloader.pyi`, add a stub mirroring the existing `reconstruct_haplotypes_fused` stub but with the 4-tuple return (`tuple[NDArray[np.uint8], NDArray[np.int32], NDArray[np.int32], NDArray[np.int64]]`).
+
+- [ ] **Step 6: Route the Python annotated branch to the fused entry**
+
+In `_haps.py::_reconstruct_annotated_haplotypes` (non-splice branch, currently lines 895-919), add a `_backend = os.environ.get("GVL_BACKEND", "rust")` check mirroring `_reconstruct_haplotypes` (lines 773-817). When rust: call `reconstruct_annotated_haplotypes_fused(...)` (import it at module top alongside `reconstruct_haplotypes_fused`), wrap the 3 returned data arrays into Ragged via the shared `out_offsets`, and return the `RaggedAnnotatedHaps`-equivalent tuple. When numba: keep the existing composed `reconstruct_haplotypes_from_sparse(...)` call unchanged.
+
+- [ ] **Step 7: Build + run the parity test**
+
+```bash
+pixi run -e dev maturin develop --release 2>&1 | tail -3
+pixi run -e dev pytest tests/parity/test_haplotypes_dataset_parity.py::test_annotated_haplotypes_mode_dataset_parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (byte-identical haps + var_idxs + ref_coords; fused spy fired).
+
+- [ ] **Step 8: Run cargo + annotated integration tests**
+
+```bash
+rtk cargo test 2>&1 | tail -5
+pixi run -e dev pytest tests/ -k "annot" -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS.
+
+- [ ] **Step 9: Commit**
+
+```bash
+rtk git add src/ffi/mod.rs src/lib.rs python/genvarloader/genvarloader.pyi python/genvarloader/_dataset/_haps.py tests/parity/test_haplotypes_dataset_parity.py
+rtk git commit -m "perf(reconstruct): fused annotated-haps __getitem__ kernel (dataset parity)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 5: Fuse the splice haps path
+
+**Files:**
+- Modify: `src/ffi/mod.rs` (add `reconstruct_haplotypes_spliced_fused`)
+- Modify: `src/lib.rs` (register)
+- Modify: `python/genvarloader/_dataset/_haps.py:846-882` (route splice branch)
+- Modify: `python/genvarloader/genvarloader.pyi` (stub)
+- Create: `tests/parity/test_spliced_haplotypes_parity.py`
+
+**Interfaces:**
+- Consumes: `reconstruct::reconstruct_haplotypes_from_sparse` core. The Python side already computes the splice permutation (`_permute_request_for_splice` → `flat_geno_idx`, `flat_shifts`, `permuted_regions`, `keep_perm`, `keep_offsets_perm`) and `splice_plan.permuted_out_offsets`. **The permutation stays in Python**; only the reconstruction FFI crossing fuses.
+- Produces (the splice variant takes precomputed `out_offsets` instead of computing diffs):
+  ```rust
+  pub fn reconstruct_haplotypes_spliced_fused<'py>(
+      py: Python<'py>,
+      permuted_regions: PyReadonlyArray2<i32>,   // (n_perm, 3)
+      flat_shifts: PyReadonlyArray2<i32>,        // (n_perm, 1)
+      flat_geno_offset_idx: PyReadonlyArray2<i64>, // (n_perm, 1)
+      out_offsets: PyReadonlyArray1<i64>,        // permuted_out_offsets (n_perm+1)
+      geno_offsets: PyReadonlyArray2<i64>, geno_v_idxs: PyReadonlyArray1<i32>,
+      v_starts: PyReadonlyArray1<i32>, ilens: PyReadonlyArray1<i32>,
+      alt_alleles: PyReadonlyArray1<u8>, alt_offsets: PyReadonlyArray1<i64>,
+      ref_: PyReadonlyArray1<u8>, ref_offsets: PyReadonlyArray1<i64>, pad_char: u8,
+      keep: Option<PyReadonlyArray1<bool>>, keep_offsets: Option<PyReadonlyArray1<i64>>,
+  ) -> Bound<'py, PyArray1<u8>>   // out_data only; caller already has out_offsets
+  ```
+
+- [ ] **Step 1: Write the failing splice parity test**
+
+Create `tests/parity/test_spliced_haplotypes_parity.py`. It needs a spliced dataset fixture. Check `tests/conftest.py` / `tests/parity/conftest.py` for an existing `splice_info`-bearing fixture; if none exists, build one from the existing `phased_svar_gvl` by opening with a minimal synthetic `splice_info` (transcript-ID grouping over the BED regions). Mirror `test_haplotypes_dataset_parity.py` structure, spying `_haps_mod.reconstruct_haplotypes_spliced_fused`:
+
+```python
+"""Spliced-haplotypes dataset parity backstop (fused rust splice entry)."""
+from __future__ import annotations
+import numpy as np
+import pytest
+import genvarloader as gvl
+import genvarloader._dataset._haps as _haps_mod
+
+pytestmark = pytest.mark.parity
+
+
+def test_spliced_haplotypes_parity(spliced_gvl, reference, monkeypatch):
+    ds = gvl.Dataset.open(spliced_gvl, reference=reference).with_seqs("haplotypes")
+    orig = _haps_mod.reconstruct_haplotypes_spliced_fused
+    calls = {"n": 0}
+
+    def _spy(*a, **k):
+        calls["n"] += 1
+        return orig(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_spliced_fused", _spy)
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = ds[:, :]
+    rc = calls["n"]
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = ds[:, :]
+    assert calls["n"] == rc, "fused splice spy fired during numba read"
+    assert calls["n"] > 0, "rust spliced fused entry never invoked — vacuous"
+    np.testing.assert_array_equal(
+        np.asarray(out_numba.data), np.asarray(out_rust.data)
+    )
+    np.testing.assert_array_equal(
+        np.asarray(out_numba.offsets, np.int64),
+        np.asarray(out_rust.offsets, np.int64),
+    )
+```
+
+> If building a synthetic spliced fixture proves disproportionate, STOP and report — per the spec, splice fusion may fall back to the documented unfused-rust path with an honest roadmap note rather than blocking the plan.
+
+- [ ] **Step 2: Run it to confirm it fails**
+
+```bash
+pixi run -e dev pytest tests/parity/test_spliced_haplotypes_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: FAIL — `AttributeError: ... reconstruct_haplotypes_spliced_fused`.
+
+- [ ] **Step 3: Implement the rust splice fused kernel**
+
+In `src/ffi/mod.rs`, add `reconstruct_haplotypes_spliced_fused`. It is `reconstruct_haplotypes_fused` **without** the diff/out-offset computation (Steps 1-2 of that fn): the caller passes `out_offsets` directly. Body:
+1. `let out_offsets_a = out_offsets.as_array();` `let total = out_offsets_a[out_offsets_a.len()-1] as usize;`
+2. `let mut out_data: Array1<u8> = Array1::zeros(total);`
+3. Call `reconstruct::reconstruct_haplotypes_from_sparse(out_data.view_mut(), out_offsets_a, permuted_regions.as_array(), flat_shifts.as_array(), flat_geno_offset_idx.as_array(), go_starts, go_stops, geno_v_idxs.as_array(), v_starts.as_array(), ilens.as_array(), alt_alleles.as_array(), alt_offsets.as_array(), ref_.as_array(), ref_offsets.as_array(), pad_char, keep.as_ref().map(|k| k.as_array()), keep_offsets.as_ref().map(|ko| ko.as_array()), None, None);`
+4. `out_data.into_pyarray(py)`
+
+- [ ] **Step 4: Register + stub**
+
+`src/lib.rs`: `m.add_function(wrap_pyfunction!(ffi::reconstruct_haplotypes_spliced_fused, m)?)?;`
+`genvarloader.pyi`: stub returning `NDArray[np.uint8]`.
+
+- [ ] **Step 5: Route the Python splice branch**
+
+In `_haps.py::_reconstruct_haplotypes` splice-plan branch (lines 846-882), add a `_backend` check. When rust: after `_permute_request_for_splice`, call `reconstruct_haplotypes_spliced_fused(...)` (import at top) with the permuted arrays + `splice_plan.permuted_out_offsets`, then wrap into the `_Flat.from_offsets(out_buf, per_elem_shape, splice_plan.permuted_out_offsets).view("S1")` as today. When numba: keep the existing composed `reconstruct_haplotypes_from_sparse(...)` call unchanged.
+
+- [ ] **Step 6: Build + run the splice parity test**
+
+```bash
+pixi run -e dev maturin develop --release 2>&1 | tail -3
+pixi run -e dev pytest tests/parity/test_spliced_haplotypes_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS.
+
+- [ ] **Step 7: Cargo + splice integration tests**
+
+```bash
+rtk cargo test 2>&1 | tail -5
+pixi run -e dev pytest tests/ -k splice -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS.
+
+- [ ] **Step 8: Commit**
+
+```bash
+rtk git add src/ffi/mod.rs src/lib.rs python/genvarloader/genvarloader.pyi python/genvarloader/_dataset/_haps.py tests/parity/test_spliced_haplotypes_parity.py tests/conftest.py
+rtk git commit -m "perf(reconstruct): fused spliced-haps __getitem__ kernel (dataset parity)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 6: Bump seqpro to 0.20.0 + adopt `to_numpy(validate=False)`
+
+**Files:**
+- Modify: `pixi.toml:91`, `pyproject.toml:13`
+- Modify: read-path materialization sites (determined by inventory in Step 3)
+
+**Interfaces:**
+- Consumes: seqpro 0.20.0's `to_numpy(validate=False)` (skips the uniformity scan).
+- Produces: faster fixed-length materialization where row uniformity is guaranteed.
+
+- [ ] **Step 1: Bump the pins**
+
+`pixi.toml:91`: `seqpro = "==0.18.0"` → `seqpro = "==0.20.0"`.
+`pyproject.toml:13`: `"seqpro>=0.18",` → `"seqpro>=0.20",`.
+
+```bash
+pixi install -e dev 2>&1 | tail -5
+pixi run -e dev python -c "import seqpro; print(seqpro.__version__)"
+```
+Expected: `0.20.0`.
+
+- [ ] **Step 2: Verify seqpro-core Rust layout still matches**
+
+```bash
+pixi run -e dev maturin develop --release 2>&1 | tail -3
+rtk cargo test 2>&1 | tail -5
+GVL_BACKEND=rust pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: build + cargo + parity all PASS (proves the `seqpro-core` 0.1.0 `Ragged` layout still matches 0.20.0). If parity breaks, STOP — the layout drifted and needs a `seqpro-core` bump (out of this plan's scope; report).
+
+- [ ] **Step 3: Inventory guaranteed-uniform `.to_numpy()` / materialization sites**
+
+```bash
+rtk proxy grep -rn "to_numpy\|to_padded\|to_fixed\|\.to_fixed(" python/genvarloader/
+```
+Identify sites on the read path where row lengths are uniform *by construction* (fixed-length / `with_len(L)` output, padded materialization). Produce a short list with file:line and a one-line justification each. **Do not edit yet** — these are the propose-then-approve candidates per the spec.
+
+- [ ] **Step 4: STOP and present the candidate list to the maintainer for approval**
+
+Present the inventory. Apply `validate=False` only to approved sites. (If the maintainer defers, skip to Step 6 with just the version bump.)
+
+- [ ] **Step 5: Apply `validate=False` at approved sites + re-verify parity**
+
+For each approved site, add `validate=False` to the `to_numpy(...)` call. Then:
+```bash
+GVL_BACKEND=rust pixi run -e dev pytest tests/dataset tests/unit/dataset tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (output unchanged — `validate=False` only skips the scan, never changes data).
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add pixi.toml pyproject.toml pixi.lock python/genvarloader/
+rtk git commit -m "build(seqpro): bump to 0.20.0; adopt to_numpy(validate=False) on uniform read-path sites
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 7: Roadmap honesty pass + full-tree verification
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md`
+
+**Interfaces:**
+- Consumes: all prior tasks.
+- Produces: roadmap consistent with reality; full green tree on both backends.
+
+- [ ] **Step 1: Full-tree verification on BOTH backends**
+
+```bash
+GVL_BACKEND=rust pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | tail -15
+GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | tail -15
+rtk cargo test 2>&1 | tail -5
+```
+Expected: all PASS; the only remaining xfails are the genuine non-#242 ones (trailing-under-write numba domain, `test_e2e_variants` if still pre-existing). Record counts.
+
+- [ ] **Step 2: Lint / format / typecheck**
+
+```bash
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev ruff format python/ tests/
+pixi run -e dev typecheck 2>&1 | tail -10
+```
+Expected: clean.
+
+- [ ] **Step 3: Confirm abi3 wheel builds**
+
+```bash
+pixi run -e dev maturin build --release 2>&1 | tail -5
+```
+Expected: wheel builds.
+
+- [ ] **Step 4: Reconcile the Phase 3 section of the roadmap**
+
+In `docs/roadmaps/rust-migration.md` Phase 3 section (lines ~270-312):
+- Check off item "Migrate `_dataset/_reconstruct.py` + `_dataset/_haps.py` remaining paths" — note annotated + splice now fused (Tasks 4-5).
+- Reword the `_tracks.py`/`_intervals.py` item: rust-default + fused; remaining numba are Phase-5-deletion parity refs.
+- Check off the `_reference.py` item — note `Reference.fetch` rerouted through rust `get_reference`; `_fetch_*` numba deleted (zero callers).
+- Check off the `_insertion_fill.py` + `_splice.py` item (no numba kernels; splice fused via Task 5) — OR, if splice fusion fell back per Task 5 Step 1, mark it "rust-default, fusion deferred to Phase 5" with the honest note.
+- Resolve the `✅`-header / unchecked-box contradiction so the marker matches the boxes.
+
+- [ ] **Step 5: Add a dated decisions-log entry**
+
+Append to the "Notes & decisions log" (top entry, dated 2026-06-24):
+```
+- 2026-06-24 (Phase 3 close-out): Merged origin/main (#242 intervals_to_tracks
+  clip fix via PR #244; SpliceIndexer subset double-apply fix via PR #243) into
+  the branch — the fused tracks kernel inherits the clip fix (shared
+  intervals::intervals_to_tracks core). Lifted ~10 obsolete #242 xfails +
+  #242-domain assume(False) guards → real passing max_jitter>0 coverage.
+  Rerouted Reference.fetch through the dispatched rust get_reference (deleted
+  zero-caller _fetch_* numba). Fused the annotated-haps
+  (reconstruct_annotated_haplotypes_fused) and spliced-haps
+  (reconstruct_haplotypes_spliced_fused) read paths — both byte-identical to the
+  composed numba oracle. Bumped seqpro 0.18->0.20.0 with to_numpy(validate=False)
+  on guaranteed-uniform read-path sites. Full tree green on both backends.
+```
+
+- [ ] **Step 6: Confirm no public-API change (skill check)**
+
+```bash
+rtk proxy git diff origin/main..HEAD -- python/genvarloader/__init__.py
+```
+Expected: no change to `__all__` / exports → `skills/genvarloader/SKILL.md` needs no update (per CLAUDE.md). If anything changed, update the skill.
+
+- [ ] **Step 7: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): Phase 3 close-out — honest item status, decisions log
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Self-review notes
+
+- **Spec coverage:** Step1→Task1 (merge), Step2→Task2 (xfails), Step3→Task3 (Reference.fetch), Step4→Tasks4-5 (fusion), Step5→Task6 (seqpro), Step6→Task7 (roadmap/skill). All spec steps mapped.
+- **Simplifications found during planning (vs spec):** (a) the #242 fix needs **no** manual Rust propagation — the fused tracks kernel reuses the shared core; (b) `Reference.fetch` needs **no new rust kernel** — it reroutes through the existing dispatched `get_reference`; (c) the reconstruct core **already** accepts annot buffers, so annotated fusion is a thin wrapper. These reduce risk; the spec's more cautious framing still holds.
+- **Fallback honored:** Task 5 Step 1 explicitly allows splice fusion to fall back to documented unfused-rust if a synthetic spliced fixture is disproportionate (matches spec risk mitigation).
+- **Type consistency:** new entries named consistently — `reconstruct_annotated_haplotypes_fused` (Task 4) and `reconstruct_haplotypes_spliced_fused` (Task 5) used identically in ffi/lib.rs/_haps.py/pyi/tests.
diff --git a/docs/superpowers/plans/2026-06-24-rust-migration-phase-2-genotypes-variants.md b/docs/superpowers/plans/2026-06-24-rust-migration-phase-2-genotypes-variants.md
new file mode 100644
index 00000000..e736d6cd
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-24-rust-migration-phase-2-genotypes-variants.md
@@ -0,0 +1,1770 @@
+# Rust Migration Phase 2 — Genotype Assembly + Variant Gather Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Port the live genotype assembly/selection kernels (`get_diffs_sparse`, `choose_exonic_variants`) and the 7 flat variant-gather kernels from numba to the Rust crate, delete the dead `filter_af` kernel, with byte-identical parity and no `__getitem__` throughput regression.
+
+**Architecture:** Pure-`ndarray` cores in new `src/genotypes/` and `src/variants/` domain modules; PyO3 wrappers live only in `src/ffi/`; Python dispatches per-kernel through `genvarloader._dispatch` (default `rust`, `GVL_BACKEND` override). The numba impls are retained as registered parity references (the registry + numba refs are deleted wholesale in Phase 5, per `_dispatch.py`); only the dead `filter_af` is removed now.
+
+**Tech Stack:** Rust (`ndarray`, PyO3/`numpy`, `maturin`), Python 3.10–3.13, numba (reference impls), pytest + `hypothesis` (parity gates), `cargo test` (unit gates), `pixi` (env/tasks).
+
+## Global Constraints
+
+- Byte-identical parity is the landing gate for every ported kernel — `np.testing.assert_array_equal`, matching dtype AND shape, across the py310–313 × linux/macOS matrix.
+- abi3 wheels must keep building (standing CI invariant) — `pixi run -e dev` build must succeed after each Rust change.
+- `src/ffi/` is the ONLY place new kernels touch PyO3; cores are pure `ndarray`.
+- Both `geno_offsets` forms must be supported: 1-D `(n+1,)` contiguous and 2-D `(2, n)` starts/stops. Normalize to `(2, n)` int64 in the Python dispatch wrapper so both backends receive identical bytes (the numba kernels already branch on `.ndim`; feeding them the 2-D form takes their existing 2-D path).
+- Sequential Rust (no rayon) — per-`(query, hap)` writes are disjoint, so sequential output equals numba's `prange` output; only add rayon if the no-regression gate forces it.
+- Gate = parity + no regression (NOT a required speedup). Baselines on `chr22_geuv`: haplotypes **123.9 batch/s**, variants **145.3 batch/s**.
+- Conventional-commit messages; end every commit message with the `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>` trailer.
+- Run Rust tests via `pixi run -e dev cargo-test`; Python parity via `pixi run -e dev pytest tests/parity -q` (parity tests are marked `@pytest.mark.parity`).
+- Use `rtk`-prefixed git commands per repo convention.
+
+## File Structure
+
+**Create:**
+- `src/genotypes/mod.rs` — pure-`ndarray` cores: `get_diffs_sparse`, `choose_exonic_variants`.
+- `src/variants/mod.rs` — pure-`ndarray` cores: `gather_v_idxs`, `gather_v_idxs_ss`, `gather_alleles`, `compact_keep`, `fill_empty_scalar`, `fill_empty_seq`, `fill_empty_fixed`.
+- `tests/parity/test_get_diffs_sparse_parity.py`
+- `tests/parity/test_choose_exonic_variants_parity.py`
+- `tests/parity/test_flat_variants_parity.py`
+- `tests/parity/test_variants_dataset_parity.py` — variants-mode dataset-level backstop.
+
+**Modify:**
+- `src/lib.rs` — `pub mod genotypes; pub mod variants;` + register new `ffi::*` pyfunctions.
+- `src/ffi/mod.rs` — PyO3 wrappers for all 9 ported kernels.
+- `python/genvarloader/_dataset/_genotypes.py` — rename numba impls to `_*_numba`, add Rust imports, `register(...)`, and dispatching public wrappers; delete `filter_af`.
+- `python/genvarloader/_dataset/_flat_variants.py` — rename 7 numba kernels to `_*_numba`, add Rust imports, `register(...)`, route internal call sites through `_dispatch.get(...)`.
+- `tests/parity/strategies.py` — new contract-valid generators per kernel.
+- `docs/roadmaps/rust-migration.md` — Phase 2 status, double-count fix, decisions log, measurements.
+
+**Reference only (do not edit logic):**
+- `python/genvarloader/_dataset/_intervals.py` — the canonical dispatch/register/route pattern (Phase 0).
+- `src/intervals.rs` — the canonical core + cargo-test pattern.
+- `tests/parity/_harness.py`, `tests/parity/test_intervals_to_tracks_parity.py` — harness usage.
+
+---
+
+### Task 1: Tuple-aware parity harness helper
+
+The existing `assert_kernel_parity` compares a single returned array. The Phase 2 kernels return tuples (e.g. `(keep, keep_offsets)`, `(data, offsets)`). Add a tuple-aware assertion.
+
+**Files:**
+- Modify: `tests/parity/_harness.py`
+- Test: `tests/parity/test_flat_variants_parity.py` (added in later tasks consumes this; a tiny smoke test here)
+
+**Interfaces:**
+- Produces: `assert_kernel_parity_tuple(name: str, *inputs) -> None` — runs both backends, asserts each returned array element is byte-identical (dtype + shape + values). Works for single-array returns too (wraps non-tuple in a 1-tuple).
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/parity/test_harness_tuple.py`:
+
+```python
+import numpy as np
+import pytest
+
+from genvarloader import _dispatch
+from tests.parity._harness import assert_kernel_parity_tuple
+
+pytestmark = pytest.mark.parity
+
+
+def test_tuple_helper_detects_match(monkeypatch):
+    def impl(x):
+        return x * 2, x + 1
+
+    _dispatch.register("_tuple_smoke", numba=impl, rust=impl, default="rust")
+    assert_kernel_parity_tuple("_tuple_smoke", np.arange(4, dtype=np.int32))
+
+
+def test_tuple_helper_detects_mismatch():
+    def a(x):
+        return x, x
+
+    def b(x):
+        return x, x + 1
+
+    _dispatch.register("_tuple_smoke_bad", numba=a, rust=b, default="rust")
+    with pytest.raises(AssertionError):
+        assert_kernel_parity_tuple("_tuple_smoke_bad", np.arange(4, dtype=np.int32))
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/parity/test_harness_tuple.py -q`
+Expected: FAIL with `ImportError: cannot import name 'assert_kernel_parity_tuple'`.
+
+- [ ] **Step 3: Implement the helper**
+
+Append to `tests/parity/_harness.py`:
+
+```python
+def assert_kernel_parity_tuple(name: str, *inputs) -> None:
+    """Parity for kernels that RETURN one array or a tuple of arrays.
+
+    Normalizes a non-tuple return into a 1-tuple, then asserts each element is
+    byte-identical (dtype, shape, values) between the numba and rust backends.
+    """
+    numba_fn, rust_fn = _dispatch.backends(name)
+    got_numba = numba_fn(*inputs)
+    got_rust = rust_fn(*inputs)
+    if not isinstance(got_numba, tuple):
+        got_numba = (got_numba,)
+    if not isinstance(got_rust, tuple):
+        got_rust = (got_rust,)
+    assert len(got_numba) == len(got_rust), (
+        f"{name}: tuple len {len(got_numba)} != {len(got_rust)}"
+    )
+    for i, (a, b) in enumerate(zip(got_numba, got_rust)):
+        a = np.asarray(a)
+        b = np.asarray(b)
+        assert a.dtype == b.dtype, f"{name}[{i}]: dtype {a.dtype} != {b.dtype}"
+        assert a.shape == b.shape, f"{name}[{i}]: shape {a.shape} != {b.shape}"
+        np.testing.assert_array_equal(a, b)
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+Run: `pixi run -e dev pytest tests/parity/test_harness_tuple.py -q`
+Expected: PASS (2 passed).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add tests/parity/_harness.py tests/parity/test_harness_tuple.py
+rtk git commit -m "$(cat <<'EOF'
+test(parity): tuple-aware kernel parity helper for Phase 2 kernels
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 2: Port `get_diffs_sparse` to Rust
+
+Per-`(query, hap)` reference-length diffs. Numba reference: `python/genvarloader/_dataset/_genotypes.py:7-109`. Three branches: empty group (→0); query-clipped path (`q_starts`/`q_ends`/`v_starts` present); keep-masked sum; plain sum.
+
+**Files:**
+- Create: `src/genotypes/mod.rs`
+- Modify: `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_genotypes.py`, `tests/parity/strategies.py`
+- Test: `tests/parity/test_get_diffs_sparse_parity.py`
+
+**Interfaces:**
+- Produces (Rust core): `genotypes::get_diffs_sparse(geno_offset_idx: ArrayView2<i64>, geno_v_idxs: ArrayView1<i32>, o_starts: ArrayView1<i64>, o_stops: ArrayView1<i64>, ilens: ArrayView1<i32>, keep: Option<ArrayView1<bool>>, keep_offsets: Option<ArrayView1<i64>>, q_starts: Option<ArrayView1<i32>>, q_ends: Option<ArrayView1<i32>>, v_starts: Option<ArrayView1<i32>>) -> Array2<i32>`
+- Produces (Python): `get_diffs_sparse(...)` dispatching wrapper with the SAME keyword signature callers already use (`_haps.py:474`); normalizes `geno_offsets` to `(2, n)` int64 before dispatch.
+
+- [ ] **Step 1: Write the Rust core + cargo unit tests**
+
+Create `src/genotypes/mod.rs`:
+
+```rust
+//! Genotype assembly/selection cores (pure ndarray). PyO3 lives in `crate::ffi`.
+use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
+
+/// Per-(query, hap) reference-length diffs. Mirrors the numba
+/// `get_diffs_sparse` exactly. `o_starts`/`o_stops` are the two rows of the
+/// normalized (2, n) offset array: `o_s = o_starts[o_idx]`, `o_e = o_stops[o_idx]`.
+/// Length sums stay far within i32 for real variants; accumulate in i64 and
+/// truncate on store to mirror numpy's `int32`-slot assignment.
+#[allow(clippy::too_many_arguments)]
+pub fn get_diffs_sparse(
+    geno_offset_idx: ArrayView2<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    ilens: ArrayView1<i32>,
+    keep: Option<ArrayView1<bool>>,
+    keep_offsets: Option<ArrayView1<i64>>,
+    q_starts: Option<ArrayView1<i32>>,
+    q_ends: Option<ArrayView1<i32>>,
+    v_starts: Option<ArrayView1<i32>>,
+) -> Array2<i32> {
+    let (n_queries, ploidy) = geno_offset_idx.dim();
+    let mut diffs = Array2::<i32>::zeros((n_queries, ploidy));
+    let has_query = q_starts.is_some() && q_ends.is_some() && v_starts.is_some();
+    let has_keep = keep.is_some() && keep_offsets.is_some();
+
+    for query in 0..n_queries {
+        for hap in 0..ploidy {
+            let o_idx = geno_offset_idx[[query, hap]] as usize;
+            let o_s = o_starts[o_idx] as usize;
+            let o_e = o_stops[o_idx] as usize;
+            let n_variants = o_e - o_s;
+
+            if n_variants == 0 {
+                diffs[[query, hap]] = 0;
+            } else if has_query {
+                let qs = q_starts.unwrap();
+                let qe = q_ends.unwrap();
+                let vs = v_starts.unwrap();
+                let q_start = qs[query] as i64;
+                let q_end = qe[query] as i64;
+                let mut ref_idx = q_start;
+                let mut acc: i64 = 0;
+                for v in o_s..o_e {
+                    if has_keep {
+                        let kp = keep.unwrap();
+                        let ko = keep_offsets.unwrap();
+                        let k_s = ko[query * ploidy + hap] as usize;
+                        if !kp[k_s + (v - o_s)] {
+                            continue;
+                        }
+                    }
+                    let v_idx = geno_v_idxs[v] as usize;
+                    let v_start = vs[v_idx] as i64;
+                    let mut v_ilen = ilens[v_idx] as i64;
+                    let v_end = v_start - v_ilen.min(0) + 1;
+                    if v_end <= q_start {
+                        continue;
+                    }
+                    if v_start >= q_end {
+                        break;
+                    }
+                    if v_start >= q_start && v_start < ref_idx {
+                        continue;
+                    }
+                    ref_idx = ref_idx.max(v_end);
+                    if v_ilen < 0 {
+                        v_ilen += (q_start - v_start - 1).max(0);
+                    }
+                    v_ilen += (v_end - q_end).max(0);
+                    acc += v_ilen;
+                }
+                diffs[[query, hap]] = acc as i32;
+            } else if has_keep {
+                let kp = keep.unwrap();
+                let ko = keep_offsets.unwrap();
+                let k_s = ko[query * ploidy + hap] as usize;
+                let mut sum: i64 = 0;
+                for (j, v) in (o_s..o_e).enumerate() {
+                    if kp[k_s + j] {
+                        sum += ilens[geno_v_idxs[v] as usize] as i64;
+                    }
+                }
+                diffs[[query, hap]] = sum as i32;
+            } else {
+                let mut sum: i64 = 0;
+                for v in o_s..o_e {
+                    sum += ilens[geno_v_idxs[v] as usize] as i64;
+                }
+                diffs[[query, hap]] = sum as i32;
+            }
+        }
+    }
+    diffs
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::{arr1, arr2};
+
+    #[test]
+    fn test_plain_sum() {
+        // 1 query, ploidy 1, two variants with ilens [-2, 3] → sum 1.
+        let goi = arr2(&[[0i64]]);
+        let v_idxs = arr1(&[0i32, 1]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[2i64]);
+        let ilens = arr1(&[-2i32, 3]);
+        let d = get_diffs_sparse(
+            goi.view(), v_idxs.view(), o_starts.view(), o_stops.view(),
+            ilens.view(), None, None, None, None, None,
+        );
+        assert_eq!(d[[0, 0]], 1);
+    }
+
+    #[test]
+    fn test_empty_group_is_zero() {
+        let goi = arr2(&[[0i64]]);
+        let v_idxs = arr1::<i32, _>(&[]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[0i64]); // empty slice
+        let ilens = arr1::<i32, _>(&[]);
+        let d = get_diffs_sparse(
+            goi.view(), v_idxs.view(), o_starts.view(), o_stops.view(),
+            ilens.view(), None, None, None, None, None,
+        );
+        assert_eq!(d[[0, 0]], 0);
+    }
+}
+```
+
+- [ ] **Step 2: Wire the module + run cargo tests (expect them to pass)**
+
+In `src/lib.rs` add after `pub mod ffi;` (keep alphabetical-ish with existing `pub mod` lines):
+
+```rust
+pub mod genotypes;
+```
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS, including `genotypes::tests::test_plain_sum` and `test_empty_group_is_zero`.
+
+- [ ] **Step 3: Add the PyO3 wrapper**
+
+Append to `src/ffi/mod.rs` (add `PyReadonlyArray2`, `PyArray2`, `IntoPyArray` to the `numpy` use line as needed):
+
+```rust
+use numpy::{IntoPyArray, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
+
+use crate::genotypes;
+
+/// Per-(query, hap) reference-length diffs (see `genotypes::get_diffs_sparse`).
+/// `geno_offsets` is the normalized (2, n) int64 starts/stops array.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn get_diffs_sparse<'py>(
+    py: Python<'py>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    ilens: PyReadonlyArray1<i32>,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    q_starts: Option<PyReadonlyArray1<i32>>,
+    q_ends: Option<PyReadonlyArray1<i32>>,
+    v_starts: Option<PyReadonlyArray1<i32>>,
+) -> Bound<'py, PyArray2<i32>> {
+    let go = geno_offsets.as_array();
+    let diffs = genotypes::get_diffs_sparse(
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go.row(0),
+        go.row(1),
+        ilens.as_array(),
+        keep.as_ref().map(|a| a.as_array()),
+        keep_offsets.as_ref().map(|a| a.as_array()),
+        q_starts.as_ref().map(|a| a.as_array()),
+        q_ends.as_ref().map(|a| a.as_array()),
+        v_starts.as_ref().map(|a| a.as_array()),
+    );
+    diffs.into_pyarray(py)
+}
+```
+
+Register it in `src/lib.rs` inside `fn genvarloader(...)`:
+
+```rust
+    m.add_function(wrap_pyfunction!(ffi::get_diffs_sparse, m)?)?;
+```
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS (compiles + builds the extension).
+
+- [ ] **Step 4: Add the Python dispatch wrapper**
+
+In `python/genvarloader/_dataset/_genotypes.py`:
+
+1. At top, add imports:
+
+```python
+from .._dispatch import get, register
+from ..genvarloader import get_diffs_sparse as _get_diffs_sparse_rust
+```
+
+2. Rename the existing `@nb.njit ... def get_diffs_sparse(` to `def _get_diffs_sparse_numba(` (leave the body untouched — it already handles the 2-D `geno_offsets` branch).
+
+3. Add a normalization helper + register + public wrapper after the numba def:
+
+```python
+def _as_starts_stops(offsets: NDArray[np.integer]) -> NDArray[np.int64]:
+    """Normalize 1-D (n+1,) or 2-D (2, n) offsets to a contiguous (2, n) int64
+    starts/stops array. Both backends consume this single form."""
+    o = np.asarray(offsets)
+    if o.ndim == 1:
+        return np.ascontiguousarray(np.stack([o[:-1], o[1:]]), dtype=np.int64)
+    return np.ascontiguousarray(o, dtype=np.int64)
+
+
+register(
+    "get_diffs_sparse",
+    numba=_get_diffs_sparse_numba,
+    rust=_get_diffs_sparse_rust,
+    default="rust",
+)
+
+
+def get_diffs_sparse(
+    geno_offset_idx: NDArray[np.integer],
+    geno_v_idxs: NDArray[np.integer],
+    geno_offsets: NDArray[np.integer],
+    ilens: NDArray[np.integer],
+    keep: NDArray[np.bool_] | None = None,
+    keep_offsets: NDArray[np.integer] | None = None,
+    q_starts: NDArray[np.integer] | None = None,
+    q_ends: NDArray[np.integer] | None = None,
+    v_starts: NDArray[np.integer] | None = None,
+) -> NDArray[np.int32]:
+    """Per-(query, hap) reference-length diffs; dispatches numba/rust."""
+    return get("get_diffs_sparse")(
+        np.ascontiguousarray(geno_offset_idx, np.int64),
+        np.ascontiguousarray(geno_v_idxs, np.int32),
+        _as_starts_stops(geno_offsets),
+        np.ascontiguousarray(ilens, np.int32),
+        None if keep is None else np.ascontiguousarray(keep, np.bool_),
+        None if keep_offsets is None else np.ascontiguousarray(keep_offsets, np.int64),
+        None if q_starts is None else np.ascontiguousarray(q_starts, np.int32),
+        None if q_ends is None else np.ascontiguousarray(q_ends, np.int32),
+        None if v_starts is None else np.ascontiguousarray(v_starts, np.int32),
+    )
+```
+
+Note: callers in `_haps.py` use keyword args; the wrapper keeps the same keyword names so no call-site edits are required. The numba reference is invoked positionally by the dispatch wrapper, so `_get_diffs_sparse_numba` must accept these args positionally in this exact order (it already does).
+
+- [ ] **Step 5: Add the parity strategy**
+
+Append to `tests/parity/strategies.py`:
+
+```python
+@st.composite
+def _sparse_geno(draw, max_queries=4, max_ploidy=2, max_vars_per_group=5,
+                 max_total_unique=12):
+    """Shared sparse-genotype layout: returns
+    (geno_offset_idx (q,p) int64, geno_v_idxs int32, geno_offsets (n+1,) int64,
+     v_starts int32, ilens int32, q_starts int32, q_ends int32).
+    geno_offset_idx is arange so each (q,p) row maps to its own offset slice."""
+    n_unique = draw(st.integers(min_value=1, max_value=max_total_unique))
+    v_starts = np.sort(
+        draw(st.lists(st.integers(0, 1000), min_size=n_unique, max_size=n_unique)
+             .map(np.array))
+    ).astype(np.int32)
+    ilens = np.array(
+        draw(st.lists(st.integers(-5, 5), min_size=n_unique, max_size=n_unique)),
+        dtype=np.int32,
+    )
+    n_q = draw(st.integers(1, max_queries))
+    p = draw(st.integers(1, max_ploidy))
+    n_groups = n_q * p
+    counts = [draw(st.integers(0, max_vars_per_group)) for _ in range(n_groups)]
+    v_idx_list = []
+    for c in counts:
+        # sorted variant indices within a group (reconstruction assumes sorted pos)
+        idxs = sorted(draw(st.lists(st.integers(0, n_unique - 1),
+                                    min_size=c, max_size=c)))
+        v_idx_list.extend(idxs)
+    geno_v_idxs = np.array(v_idx_list, dtype=np.int32)
+    geno_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    geno_offset_idx = np.arange(n_groups, dtype=np.int64).reshape(n_q, p)
+    q_starts = np.array(
+        draw(st.lists(st.integers(0, 800), min_size=n_q, max_size=n_q)), np.int32
+    )
+    q_ends = (q_starts + draw(st.integers(1, 200))).astype(np.int32)
+    return (geno_offset_idx, geno_v_idxs, geno_offsets, v_starts, ilens,
+            q_starts, q_ends)
+
+
+@st.composite
+def get_diffs_sparse_inputs(draw):
+    (goi, gvi, goff, vstarts, ilens, qstarts, qends) = draw(_sparse_geno(draw))
+    mode = draw(st.sampled_from(["plain", "keep", "query"]))
+    twod = draw(st.booleans())
+    offsets = goff if not twod else np.stack([goff[:-1], goff[1:]]).astype(np.int64)
+    n_groups = goi.size
+    total = int(goff[-1])
+    if mode == "plain":
+        return (goi, gvi, offsets, ilens, None, None, None, None, None)
+    if mode == "keep":
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_
+        )
+        return (goi, gvi, offsets, ilens, keep, goff.copy(), None, None, None)
+    # query mode (optionally also keep)
+    keep = None
+    keep_off = None
+    if draw(st.booleans()):
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_
+        )
+        keep_off = goff.copy()
+    return (goi, gvi, offsets, ilens, keep, keep_off, qstarts, qends, vstarts)
+```
+
+- [ ] **Step 6: Write the parity test**
+
+Create `tests/parity/test_get_diffs_sparse_parity.py`:
+
+```python
+import pytest
+from hypothesis import given
+
+from genvarloader._dataset import _genotypes  # noqa: F401  (import triggers register())
+from tests.parity._harness import assert_kernel_parity_tuple
+from tests.parity.strategies import get_diffs_sparse_inputs
+
+pytestmark = pytest.mark.parity
+
+
+@given(get_diffs_sparse_inputs())
+def test_get_diffs_sparse_parity(inputs):
+    # The public wrapper normalizes offsets; here we call the registered
+    # backends directly through the wrapper's dispatch name with the wrapper's
+    # already-normalized (2, n) form, so feed normalized inputs.
+    from genvarloader._dataset._genotypes import _as_starts_stops
+    import numpy as np
+
+    goi, gvi, offsets, ilens, keep, keep_off, qs, qe, vs = inputs
+    norm = (
+        np.ascontiguousarray(goi, np.int64),
+        np.ascontiguousarray(gvi, np.int32),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(ilens, np.int32),
+        None if keep is None else np.ascontiguousarray(keep, np.bool_),
+        None if keep_off is None else np.ascontiguousarray(keep_off, np.int64),
+        None if qs is None else np.ascontiguousarray(qs, np.int32),
+        None if qe is None else np.ascontiguousarray(qe, np.int32),
+        None if vs is None else np.ascontiguousarray(vs, np.int32),
+    )
+    assert_kernel_parity_tuple("get_diffs_sparse", *norm)
+```
+
+- [ ] **Step 7: Run parity + cargo, verify green**
+
+Run: `pixi run -e dev pytest tests/parity/test_get_diffs_sparse_parity.py -q`
+Expected: PASS (100 hypothesis examples).
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 8: Smoke the live read path**
+
+Run: `pixi run -e dev pytest tests/dataset tests/unit -q -k "hap or splice or exon"`
+Expected: PASS (haplotype/exonic paths still produce correct output through the new wrapper).
+
+- [ ] **Step 9: Commit**
+
+```bash
+rtk git add src/genotypes/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_genotypes.py tests/parity/strategies.py tests/parity/test_get_diffs_sparse_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(genotypes): port get_diffs_sparse numba->rust (parity-gated)
+
+Pure-ndarray core in src/genotypes/, PyO3 in src/ffi/, dispatched via
+_dispatch (default rust). Offsets normalized to (2,n) int64. numba retained
+as parity reference.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 3: Port `choose_exonic_variants` to Rust
+
+Keep-mask for variants fully contained in a query interval. Numba reference: `_genotypes.py:421-522` (driver `choose_exonic_variants` + inner `_choose_exonic_variants`). Returns `(keep: bool, keep_offsets: OFFSET_TYPE)`.
+
+**Files:**
+- Modify: `src/genotypes/mod.rs`, `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_genotypes.py`, `tests/parity/strategies.py`
+- Test: `tests/parity/test_choose_exonic_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust core): `genotypes::choose_exonic_variants(starts: ArrayView1<i32>, ends: ArrayView1<i32>, geno_offset_idx: ArrayView2<i64>, geno_v_idxs: ArrayView1<i32>, o_starts: ArrayView1<i64>, o_stops: ArrayView1<i64>, v_starts: ArrayView1<i32>, ilens: ArrayView1<i32>) -> (Array1<bool>, Array1<i64>)`
+- Produces (Python): `choose_exonic_variants(...)` wrapper, same keyword signature as the `_haps.py` call sites; returns `(keep, keep_offsets)` with `keep_offsets.dtype == np.dtype(OFFSET_TYPE)`.
+
+- [ ] **Step 1: Confirm `OFFSET_TYPE`**
+
+Run: `pixi run -e dev python -c "from seqpro.rag import OFFSET_TYPE; import numpy as np; print(np.dtype(OFFSET_TYPE))"`
+Expected: prints `int64`. If it is NOT int64, adjust the Rust return element + ffi `PyArray1<...>` accordingly and the dtype coercion in the wrapper. The rest of this task assumes int64.
+
+- [ ] **Step 2: Write the Rust core + cargo test**
+
+Append to `src/genotypes/mod.rs`:
+
+```rust
+/// Keep-mask for variants fully contained in each query interval. Mirrors the
+/// numba `choose_exonic_variants` + inner `_choose_exonic_variants`. Returns
+/// `(keep, keep_offsets)` where keep_offsets is the per-group prefix sum of
+/// group sizes (len n_groups + 1).
+#[allow(clippy::too_many_arguments)]
+pub fn choose_exonic_variants(
+    starts: ArrayView1<i32>,
+    ends: ArrayView1<i32>,
+    geno_offset_idx: ArrayView2<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+) -> (Array1<bool>, Array1<i64>) {
+    let (n_regions, ploidy) = geno_offset_idx.dim();
+
+    // keep_offsets = prefix sum of per-group lengths (numba uses lengths.cumsum()).
+    let mut keep_offsets = Array1::<i64>::zeros(n_regions * ploidy + 1);
+    let mut acc: i64 = 0;
+    for query in 0..n_regions {
+        for hap in 0..ploidy {
+            let o_idx = geno_offset_idx[[query, hap]] as usize;
+            let len = (o_stops[o_idx] - o_starts[o_idx]).max(0);
+            acc += len;
+            keep_offsets[query * ploidy + hap + 1] = acc;
+        }
+    }
+
+    let n_variants = keep_offsets[n_regions * ploidy] as usize;
+    let mut keep = Array1::<bool>::default(n_variants);
+
+    for query in 0..n_regions {
+        let ref_start = starts[query] as i64;
+        let ref_end = ends[query] as i64;
+        for hap in 0..ploidy {
+            let o_idx = geno_offset_idx[[query, hap]] as usize;
+            let o_s = o_starts[o_idx] as usize;
+            let o_e = o_stops[o_idx] as usize;
+            let k_s = keep_offsets[query * ploidy + hap] as usize;
+            for (j, v) in (o_s..o_e).enumerate() {
+                let v_idx = geno_v_idxs[v] as usize;
+                let v_pos = v_starts[v_idx] as i64;
+                let v_ref_end = v_pos - (ilens[v_idx] as i64).min(0) + 1;
+                keep[k_s + j] = v_pos >= ref_start && v_ref_end <= ref_end;
+            }
+        }
+    }
+    (keep, keep_offsets)
+}
+```
+
+Add a cargo test inside the existing `mod tests`:
+
+```rust
+    #[test]
+    fn test_exonic_contained_only() {
+        // region [10, 20). variants at pos 12 (ilen 0 -> end 13, kept) and
+        // pos 19 (ilen 0 -> end 20, kept), pos 19 with ilen -2 -> end 22 (dropped).
+        let goi = arr2(&[[0i64]]);
+        let v_idxs = arr1(&[0i32, 1, 2]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[3i64]);
+        let v_starts = arr1(&[12i32, 19, 19]);
+        let ilens = arr1(&[0i32, 0, -2]);
+        let (keep, koff) = choose_exonic_variants(
+            arr1(&[10i32]).view(), arr1(&[20i32]).view(), goi.view(),
+            v_idxs.view(), o_starts.view(), o_stops.view(),
+            v_starts.view(), ilens.view(),
+        );
+        assert_eq!(keep.to_vec(), vec![true, true, false]);
+        assert_eq!(koff.to_vec(), vec![0, 3]);
+    }
+```
+
+- [ ] **Step 3: Run cargo tests**
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS including `test_exonic_contained_only`.
+
+- [ ] **Step 4: Add the PyO3 wrapper + register in lib.rs**
+
+Append to `src/ffi/mod.rs` (add `PyArray1` to the `numpy` use if not already imported):
+
+```rust
+use numpy::PyArray1;
+
+/// Exonic keep-mask (see `genotypes::choose_exonic_variants`). Returns
+/// `(keep: bool[n], keep_offsets: i64[n_groups+1])`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn choose_exonic_variants<'py>(
+    py: Python<'py>,
+    starts: PyReadonlyArray1<i32>,
+    ends: PyReadonlyArray1<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+) -> (Bound<'py, PyArray1<bool>>, Bound<'py, PyArray1<i64>>) {
+    let go = geno_offsets.as_array();
+    let (keep, koff) = genotypes::choose_exonic_variants(
+        starts.as_array(),
+        ends.as_array(),
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go.row(0),
+        go.row(1),
+        v_starts.as_array(),
+        ilens.as_array(),
+    );
+    (keep.into_pyarray(py), koff.into_pyarray(py))
+}
+```
+
+Register in `src/lib.rs`:
+
+```rust
+    m.add_function(wrap_pyfunction!(ffi::choose_exonic_variants, m)?)?;
+```
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS (extension builds).
+
+- [ ] **Step 5: Add the Python dispatch wrapper**
+
+In `_genotypes.py`:
+
+1. Add import: `from ..genvarloader import choose_exonic_variants as _choose_exonic_variants_rust`.
+2. Rename `@nb.njit ... def choose_exonic_variants(` → `def _choose_exonic_variants_numba(` (keep the inner `_choose_exonic_variants` njit as-is — it's only called by the numba driver).
+3. Add register + wrapper:
+
+```python
+register(
+    "choose_exonic_variants",
+    numba=_choose_exonic_variants_numba,
+    rust=_choose_exonic_variants_rust,
+    default="rust",
+)
+
+
+def choose_exonic_variants(
+    starts: NDArray[np.integer],
+    ends: NDArray[np.integer],
+    geno_offset_idx: NDArray[np.integer],
+    geno_v_idxs: NDArray[np.integer],
+    geno_offsets: NDArray[np.integer],
+    v_starts: NDArray[np.integer],
+    ilens: NDArray[np.integer],
+) -> tuple[NDArray[np.bool_], NDArray[OFFSET_TYPE]]:
+    """Exonic keep-mask; dispatches numba/rust. keep_offsets dtype == OFFSET_TYPE."""
+    keep, keep_offsets = get("choose_exonic_variants")(
+        np.ascontiguousarray(starts, np.int32),
+        np.ascontiguousarray(ends, np.int32),
+        np.ascontiguousarray(geno_offset_idx, np.int64),
+        np.ascontiguousarray(geno_v_idxs, np.int32),
+        _as_starts_stops(geno_offsets),
+        np.ascontiguousarray(v_starts, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+    )
+    return keep, keep_offsets.astype(OFFSET_TYPE, copy=False)
+```
+
+Note: `_choose_exonic_variants_numba` already returns `keep_offsets` as `OFFSET_TYPE`; the Rust path returns int64 and the `.astype(..., copy=False)` is a no-op when OFFSET_TYPE is int64. The parity test compares the raw backend returns (both int64) BEFORE this astype.
+
+- [ ] **Step 6: Add parity strategy**
+
+Append to `tests/parity/strategies.py`:
+
+```python
+@st.composite
+def choose_exonic_variants_inputs(draw):
+    (goi, gvi, goff, vstarts, ilens, qstarts, qends) = draw(_sparse_geno(draw))
+    twod = draw(st.booleans())
+    offsets = goff if not twod else np.stack([goff[:-1], goff[1:]]).astype(np.int64)
+    return (qstarts, qends, goi, gvi, offsets, vstarts, ilens)
+```
+
+- [ ] **Step 7: Write parity test**
+
+Create `tests/parity/test_choose_exonic_variants_parity.py`:
+
+```python
+import numpy as np
+import pytest
+from hypothesis import given
+
+from genvarloader._dataset import _genotypes  # noqa: F401
+from genvarloader._dataset._genotypes import _as_starts_stops
+from tests.parity._harness import assert_kernel_parity_tuple
+from tests.parity.strategies import choose_exonic_variants_inputs
+
+pytestmark = pytest.mark.parity
+
+
+@given(choose_exonic_variants_inputs())
+def test_choose_exonic_variants_parity(inputs):
+    qs, qe, goi, gvi, offsets, vs, ilens = inputs
+    norm = (
+        np.ascontiguousarray(qs, np.int32),
+        np.ascontiguousarray(qe, np.int32),
+        np.ascontiguousarray(goi, np.int64),
+        np.ascontiguousarray(gvi, np.int32),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(vs, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+    )
+    assert_kernel_parity_tuple("choose_exonic_variants", *norm)
+```
+
+- [ ] **Step 8: Run parity + cargo + exonic read path**
+
+Run: `pixi run -e dev pytest tests/parity/test_choose_exonic_variants_parity.py -q`
+Expected: PASS.
+Run: `pixi run -e dev pytest tests/dataset tests/unit -q -k "exon or splice"`
+Expected: PASS.
+
+- [ ] **Step 9: Commit**
+
+```bash
+rtk git add src/genotypes/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_genotypes.py tests/parity/strategies.py tests/parity/test_choose_exonic_variants_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(genotypes): port choose_exonic_variants numba->rust (parity-gated)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 4: Delete dead `filter_af`
+
+`filter_af` (`_genotypes.py:525-580`) has zero callers — AF filtering is done inline in numpy (`_haps.py:734-737`, `_flat_variants.py:698-701`). Remove it.
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_genotypes.py`
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: nothing (removal only).
+
+- [ ] **Step 1: Confirm zero callers (guard against a hidden reference)**
+
+Run: `rtk grep -rn "filter_af" . --include="*.py"`
+Expected: only the definition line(s) in `_genotypes.py` and the comment at `_genotypes.py:475`. If any other reference exists, STOP and re-scope — do not delete.
+
+- [ ] **Step 2: Delete the kernel + stale comment reference**
+
+Remove the entire `@nb.njit ... def filter_af(...)` block (`_genotypes.py:525-580`). Update the comment at line ~475 (`# Mirror filter_af's (2, n_slices) indexing (sibling kernel below).`) to not reference the now-deleted kernel — replace with `# Handle both 1-D (n+1,) and 2-D (2, n_slices) geno_offsets forms.`
+
+- [ ] **Step 3: Verify nothing imports it**
+
+Run: `pixi run -e dev ruff check python/genvarloader/_dataset/_genotypes.py`
+Expected: PASS (no unused/undefined-name errors).
+Run: `pixi run -e dev pytest tests/dataset tests/unit -q -k "af or freq"`
+Expected: PASS (AF filtering still works via the inline numpy path).
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_genotypes.py
+rtk git commit -m "$(cat <<'EOF'
+refactor(genotypes): delete dead filter_af kernel (superseded by inline numpy)
+
+AF filtering happens in numpy in _haps.py/_flat_variants.py; the numba
+filter_af had zero callers (same as the Phase 0 splits_sum_le_value dead path).
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 5: Port `_gather_v_idxs` + `_gather_v_idxs_ss` to Rust
+
+Per-row variant-index gather. Numba reference: `_flat_variants.py:432-488`. Both are unified by the `(2, n)` normalization, so a single Rust core `gather_rows` suffices; the Python `_gather_rows` dispatcher (line 538) routes to it.
+
+**Files:**
+- Create: `src/variants/mod.rs`
+- Modify: `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_flat_variants.py`, `tests/parity/strategies.py`
+- Test: `tests/parity/test_flat_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust core): `variants::gather_rows(geno_offset_idx: ArrayView1<i64>, o_starts: ArrayView1<i64>, o_stops: ArrayView1<i64>, geno_v_idxs: ArrayView1<i32>) -> (Array1<i32>, Array1<i64>)` → `(v_idxs, out_offsets)`.
+- Produces (Python): `_gather_rows(geno_offset_idx, offsets, data)` keeps its existing signature (line 538) but dispatches to the Rust/numba `gather_rows` after normalizing offsets to `(2, n)`.
+
+Note: `geno_v_idxs` dtype — the numba kernel preserves `geno_v_idxs.dtype`. Confirm it is int32 in production (`self.genotypes.data`). The wrapper coerces to int32; if production uses a wider dtype, widen the Rust element type + ffi to match and re-confirm parity dtype.
+
+- [ ] **Step 1: Write the Rust core + cargo test**
+
+Create `src/variants/mod.rs`:
+
+```rust
+//! Flat variant gather/fill cores (pure ndarray). PyO3 lives in `crate::ffi`.
+use ndarray::{Array1, ArrayView1};
+
+/// Per-row variant-index gather. Mirrors numba `_gather_v_idxs` (and `_ss` via
+/// the (2, n) normalized offsets). `o_s = o_starts[goi]`, `o_e = o_stops[goi]`.
+pub fn gather_rows(
+    geno_offset_idx: ArrayView1<i64>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+) -> (Array1<i32>, Array1<i64>) {
+    let n_rows = geno_offset_idx.len();
+    let mut out_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let goi = geno_offset_idx[i] as usize;
+        out_offsets[i + 1] = out_offsets[i] + (o_stops[goi] - o_starts[goi]);
+    }
+    let total = out_offsets[n_rows] as usize;
+    let mut v_idxs = Array1::<i32>::zeros(total);
+    let mut dst = 0usize;
+    for i in 0..n_rows {
+        let goi = geno_offset_idx[i] as usize;
+        let s = o_starts[goi] as usize;
+        let e = o_stops[goi] as usize;
+        for k in s..e {
+            v_idxs[dst] = geno_v_idxs[k];
+            dst += 1;
+        }
+    }
+    (v_idxs, out_offsets)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::arr1;
+
+    #[test]
+    fn test_gather_rows_basic() {
+        // 2 rows selecting offset groups 1 then 0.
+        let goi = arr1(&[1i64, 0]);
+        let o_starts = arr1(&[0i64, 2]);
+        let o_stops = arr1(&[2i64, 5]);
+        let data = arr1(&[10i32, 11, 12, 13, 14]);
+        let (v, off) = gather_rows(goi.view(), o_starts.view(), o_stops.view(), data.view());
+        assert_eq!(v.to_vec(), vec![12, 13, 14, 10, 11]);
+        assert_eq!(off.to_vec(), vec![0, 3, 5]);
+    }
+}
+```
+
+- [ ] **Step 2: Wire module + cargo test**
+
+In `src/lib.rs` add `pub mod variants;`.
+Run: `pixi run -e dev cargo-test`
+Expected: PASS including `variants::tests::test_gather_rows_basic`.
+
+- [ ] **Step 3: PyO3 wrapper + register**
+
+Append to `src/ffi/mod.rs`:
+
+```rust
+use crate::variants;
+
+/// Per-row variant-index gather (see `variants::gather_rows`).
+#[pyfunction]
+pub fn gather_rows<'py>(
+    py: Python<'py>,
+    geno_offset_idx: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+) -> (Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>) {
+    let go = geno_offsets.as_array();
+    let (v, off) = variants::gather_rows(
+        geno_offset_idx.as_array(),
+        go.row(0),
+        go.row(1),
+        geno_v_idxs.as_array(),
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+```
+
+Register in `src/lib.rs`: `m.add_function(wrap_pyfunction!(ffi::gather_rows, m)?)?;`
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 4: Route the Python `_gather_rows`**
+
+In `_flat_variants.py`:
+
+1. Add imports near the top:
+
+```python
+from .._dispatch import get, register
+from ..genvarloader import gather_rows as _gather_rows_rust
+from ._genotypes import _as_starts_stops
+```
+
+2. Rename the two njit defs to `_gather_v_idxs_numba` / `_gather_v_idxs_ss_numba` (keep bodies). Add a numba adapter matching the Rust ffi signature `(geno_offset_idx, geno_offsets_2d, geno_v_idxs)`:
+
+```python
+def _gather_rows_numba(geno_offset_idx, geno_offsets, geno_v_idxs):
+    # geno_offsets is the normalized (2, n) form.
+    return _gather_v_idxs_ss_numba(
+        geno_offset_idx, geno_offsets[0], geno_offsets[1], geno_v_idxs
+    )
+
+
+register("gather_rows", numba=_gather_rows_numba, rust=_gather_rows_rust, default="rust")
+```
+
+3. Replace the body of the existing `_gather_rows(...)` (line 538) with:
+
+```python
+def _gather_rows(
+    geno_offset_idx: NDArray[np.intp],
+    offsets: NDArray[np.int64],
+    data: NDArray,
+) -> tuple[NDArray, NDArray[np.int64]]:
+    """Dispatch per-row variant-index gather (numba/rust), normalizing offsets."""
+    return get("gather_rows")(
+        np.ascontiguousarray(geno_offset_idx, np.int64),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(data, np.int32),
+    )
+```
+
+Note: keeping `_gather_v_idxs_numba`/`_gather_v_idxs_ss_numba` lets the parity test exercise the numba path; `_gather_rows_numba` is the dispatch adapter. The 2-D normalized form makes `_ss` the single numba path.
+
+- [ ] **Step 5: Parity strategy + test (gather_rows)**
+
+Append to `tests/parity/strategies.py`:
+
+```python
+@st.composite
+def gather_rows_inputs(draw):
+    n_groups = draw(st.integers(1, 6))
+    counts = [draw(st.integers(0, 5)) for _ in range(n_groups)]
+    offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total = int(offsets[-1])
+    data = np.array(
+        draw(st.lists(st.integers(0, 1000), min_size=total, max_size=total)), np.int32
+    )
+    n_rows = draw(st.integers(1, 8))
+    goi = np.array(
+        draw(st.lists(st.integers(0, n_groups - 1), min_size=n_rows, max_size=n_rows)),
+        np.int64,
+    )
+    twod = draw(st.booleans())
+    off = offsets if not twod else np.stack([offsets[:-1], offsets[1:]]).astype(np.int64)
+    return (goi, off, data)
+```
+
+Create `tests/parity/test_flat_variants_parity.py`:
+
+```python
+import numpy as np
+import pytest
+from hypothesis import given
+
+from genvarloader._dataset import _flat_variants  # noqa: F401  (triggers register())
+from genvarloader._dataset._genotypes import _as_starts_stops
+from tests.parity._harness import assert_kernel_parity_tuple
+from tests.parity.strategies import gather_rows_inputs
+
+pytestmark = pytest.mark.parity
+
+
+@given(gather_rows_inputs())
+def test_gather_rows_parity(inputs):
+    goi, offsets, data = inputs
+    assert_kernel_parity_tuple(
+        "gather_rows",
+        np.ascontiguousarray(goi, np.int64),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(data, np.int32),
+    )
+```
+
+- [ ] **Step 6: Run parity + cargo**
+
+Run: `pixi run -e dev pytest tests/parity/test_flat_variants_parity.py -q`
+Expected: PASS.
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 7: Commit**
+
+```bash
+rtk git add src/variants/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_flat_variants.py tests/parity/strategies.py tests/parity/test_flat_variants_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(variants): port _gather_v_idxs(+_ss) numba->rust as gather_rows (parity)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 6: Port `_gather_alleles` to Rust
+
+Variable-length allele-byte gather. Numba reference: `_flat_variants.py:491-512`.
+
+**Files:**
+- Modify: `src/variants/mod.rs`, `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_flat_variants.py`, `tests/parity/strategies.py`, `tests/parity/test_flat_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust core): `variants::gather_alleles(v_idxs: ArrayView1<i32>, allele_bytes: ArrayView1<u8>, allele_offsets: ArrayView1<i64>) -> (Array1<u8>, Array1<i64>)` → `(data, seq_offsets)`.
+- Produces (Python): registered as `"gather_alleles"`; call sites at `_flat_variants.py:738,749` go through `get("gather_alleles")(...)`.
+
+- [ ] **Step 1: Rust core + cargo test**
+
+Append to `src/variants/mod.rs`:
+
+```rust
+/// Gather variable-length allele bytestrings. Mirrors numba `_gather_alleles`.
+pub fn gather_alleles(
+    v_idxs: ArrayView1<i32>,
+    allele_bytes: ArrayView1<u8>,
+    allele_offsets: ArrayView1<i64>,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = v_idxs.len();
+    let mut seq_offsets = Array1::<i64>::zeros(n + 1);
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        seq_offsets[i + 1] = seq_offsets[i] + (allele_offsets[v + 1] - allele_offsets[v]);
+    }
+    let total = seq_offsets[n] as usize;
+    let mut data = Array1::<u8>::zeros(total);
+    let mut dst = 0usize;
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        let s = allele_offsets[v] as usize;
+        let e = allele_offsets[v + 1] as usize;
+        for k in s..e {
+            data[dst] = allele_bytes[k];
+            dst += 1;
+        }
+    }
+    (data, seq_offsets)
+}
+```
+
+Add to `mod tests`:
+
+```rust
+    #[test]
+    fn test_gather_alleles_basic() {
+        // alleles: v0="AC"(65,67), v1="G"(71). gather [1,0,1].
+        let v_idxs = arr1(&[1i32, 0, 1]);
+        let bytes = arr1(&[65u8, 67, 71]);
+        let offs = arr1(&[0i64, 2, 3]);
+        let (data, seq) = gather_alleles(v_idxs.view(), bytes.view(), offs.view());
+        assert_eq!(data.to_vec(), vec![71, 65, 67, 71]);
+        assert_eq!(seq.to_vec(), vec![0, 1, 3, 4]);
+    }
+```
+
+- [ ] **Step 2: PyO3 wrapper + register**
+
+Append to `src/ffi/mod.rs`:
+
+```rust
+/// Gather allele bytestrings (see `variants::gather_alleles`).
+#[pyfunction]
+pub fn gather_alleles<'py>(
+    py: Python<'py>,
+    v_idxs: PyReadonlyArray1<i32>,
+    allele_bytes: PyReadonlyArray1<u8>,
+    allele_offsets: PyReadonlyArray1<i64>,
+) -> (Bound<'py, PyArray1<u8>>, Bound<'py, PyArray1<i64>>) {
+    let (data, seq) = variants::gather_alleles(
+        v_idxs.as_array(),
+        allele_bytes.as_array(),
+        allele_offsets.as_array(),
+    );
+    (data.into_pyarray(py), seq.into_pyarray(py))
+}
+```
+
+Register: `m.add_function(wrap_pyfunction!(ffi::gather_alleles, m)?)?;`
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 3: Route Python + register**
+
+In `_flat_variants.py`: add `from ..genvarloader import gather_alleles as _gather_alleles_rust`; rename njit to `_gather_alleles_numba`; add a thin dispatch wrapper named `_gather_alleles` (preserving the existing internal call name) + register:
+
+```python
+register("gather_alleles", numba=_gather_alleles_numba, rust=_gather_alleles_rust, default="rust")
+
+
+def _gather_alleles(v_idxs, allele_bytes, allele_offsets):
+    return get("gather_alleles")(
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(allele_bytes, np.uint8),
+        np.ascontiguousarray(allele_offsets, np.int64),
+    )
+```
+
+The existing call sites (`_gather_alleles(v_idxs, alt_bytes, alt_off)` at lines 738, 749) now resolve to this wrapper unchanged.
+
+- [ ] **Step 4: Parity strategy + test**
+
+Append to `tests/parity/strategies.py`:
+
+```python
+@st.composite
+def gather_alleles_inputs(draw):
+    n_unique = draw(st.integers(1, 8))
+    lens = [draw(st.integers(0, 5)) for _ in range(n_unique)]
+    allele_offsets = np.concatenate([[0], np.cumsum(lens)]).astype(np.int64)
+    total = int(allele_offsets[-1])
+    allele_bytes = np.array(
+        draw(st.lists(st.integers(0, 255), min_size=total, max_size=total)), np.uint8
+    )
+    m = draw(st.integers(0, 10))
+    v_idxs = np.array(
+        draw(st.lists(st.integers(0, n_unique - 1), min_size=m, max_size=m)), np.int32
+    )
+    return (v_idxs, allele_bytes, allele_offsets)
+```
+
+Add to `tests/parity/test_flat_variants_parity.py`:
+
+```python
+from tests.parity.strategies import gather_alleles_inputs
+
+
+@given(gather_alleles_inputs())
+def test_gather_alleles_parity(inputs):
+    v_idxs, allele_bytes, allele_offsets = inputs
+    assert_kernel_parity_tuple(
+        "gather_alleles",
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(allele_bytes, np.uint8),
+        np.ascontiguousarray(allele_offsets, np.int64),
+    )
+```
+
+- [ ] **Step 5: Run parity + cargo, commit**
+
+Run: `pixi run -e dev pytest tests/parity/test_flat_variants_parity.py -q && pixi run -e dev cargo-test`
+Expected: PASS.
+
+```bash
+rtk git add src/variants/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_flat_variants.py tests/parity/strategies.py tests/parity/test_flat_variants_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(variants): port _gather_alleles numba->rust (parity-gated)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 7: Port `_compact_keep` to Rust
+
+Drop variants where `keep` is False, rebuilding row offsets. Numba reference: `_flat_variants.py:515-535`. Note: the first param can be `v_idxs` OR a parallel array (e.g. dosage) sharing the row layout — the dtype varies (int32 for v_idxs, float for dosage). Handle both with a generic element type via two registered entry points, OR coerce in the wrapper per call site.
+
+**Decision:** register a single `"compact_keep"` that operates on the value array as `f64`-agnostic is unsafe for int parity. Instead expose two typed cores and pick by the value array's dtype in the Python wrapper (v_idxs → int32, dosage/ccf → float32). Confirm the production dtypes first.
+
+**Files:**
+- Modify: `src/variants/mod.rs`, `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_flat_variants.py`, `tests/parity/strategies.py`, `tests/parity/test_flat_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust cores): `variants::compact_keep_i32(values: ArrayView1<i32>, row_offsets: ArrayView1<i64>, keep: ArrayView1<bool>) -> (Array1<i32>, Array1<i64>)` and `compact_keep_f32(values: ArrayView1<f32>, ...) -> (Array1<f32>, Array1<i64>)`.
+- Produces (Python): `_compact_keep(v_idxs, row_offsets, keep)` wrapper dispatching by `v_idxs.dtype`.
+
+- [ ] **Step 1: Confirm production value dtypes**
+
+Run: `rtk grep -n "_compact_keep(" python/genvarloader/_dataset/_flat_variants.py`
+Inspect each call (lines ~715, 717, 769, +1): the first arg is `v_idxs` (int32), `dosage_data` (check dtype), `cf_data` (check dtype). Run:
+`rtk grep -n "dosage_data\|cf_data\|unfiltered_row_offsets" python/genvarloader/_dataset/_flat_variants.py`
+Record the dtypes. If only int32 + float32 occur, the two typed cores below suffice. If another float width appears (float64), add a matching core.
+
+- [ ] **Step 2: Rust cores + cargo test**
+
+Append to `src/variants/mod.rs`:
+
+```rust
+/// Compact a per-variant value array + rebuild row offsets under `keep`.
+/// Mirrors numba `_compact_keep`. Generic over the value element type.
+fn compact_keep_impl<T: Copy + num_traits::Zero>(
+    values: ArrayView1<T>,
+    row_offsets: ArrayView1<i64>,
+    keep: ArrayView1<bool>,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = row_offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    let mut n_keep: i64 = 0;
+    for i in 0..n_rows {
+        for j in row_offsets[i] as usize..row_offsets[i + 1] as usize {
+            if keep[j] {
+                n_keep += 1;
+            }
+        }
+        new_offsets[i + 1] = n_keep;
+    }
+    let mut new_v = Array1::<T>::zeros(n_keep as usize);
+    let mut dst = 0usize;
+    for j in 0..values.len() {
+        if keep[j] {
+            new_v[dst] = values[j];
+            dst += 1;
+        }
+    }
+    (new_v, new_offsets)
+}
+
+pub fn compact_keep_i32(
+    values: ArrayView1<i32>, row_offsets: ArrayView1<i64>, keep: ArrayView1<bool>,
+) -> (Array1<i32>, Array1<i64>) {
+    compact_keep_impl(values, row_offsets, keep)
+}
+
+pub fn compact_keep_f32(
+    values: ArrayView1<f32>, row_offsets: ArrayView1<i64>, keep: ArrayView1<bool>,
+) -> (Array1<f32>, Array1<i64>) {
+    compact_keep_impl(values, row_offsets, keep)
+}
+```
+
+If `num_traits` is not already a dependency, replace the bound with an explicit zero by parameterizing the fill: change `Array1::<T>::zeros(...)` to build from a provided zero value, or simplest — drop the generic and write two near-identical functions. Check `Cargo.toml`; if `num-traits` is absent and you prefer no new dep, duplicate the body for i32/f32.
+
+Add a cargo test:
+
+```rust
+    #[test]
+    fn test_compact_keep_i32() {
+        // 2 rows: [10,11 | 12]; keep [T,F,T] → [10 | 12], offsets [0,1,2].
+        let vals = arr1(&[10i32, 11, 12]);
+        let off = arr1(&[0i64, 2, 3]);
+        let keep = arr1(&[true, false, true]);
+        let (v, o) = compact_keep_i32(vals.view(), off.view(), keep.view());
+        assert_eq!(v.to_vec(), vec![10, 12]);
+        assert_eq!(o.to_vec(), vec![0, 1, 2]);
+    }
+```
+
+- [ ] **Step 3: PyO3 wrappers + register**
+
+Append to `src/ffi/mod.rs` (two pyfunctions `compact_keep_i32`, `compact_keep_f32`, each `(values, row_offsets, keep) -> (PyArray1<T>, PyArray1<i64>)`), mirroring the gather wrappers. Register both in `src/lib.rs`.
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 4: Route Python + register (dtype dispatch)**
+
+In `_flat_variants.py`: import both rust fns; rename njit → `_compact_keep_numba`; add:
+
+```python
+register("compact_keep_i32", numba=_compact_keep_numba, rust=_compact_keep_i32_rust, default="rust")
+register("compact_keep_f32", numba=_compact_keep_numba, rust=_compact_keep_f32_rust, default="rust")
+
+
+def _compact_keep(v_idxs, row_offsets, keep):
+    values = np.ascontiguousarray(v_idxs)
+    row_offsets = np.ascontiguousarray(row_offsets, np.int64)
+    keep = np.ascontiguousarray(keep, np.bool_)
+    if np.issubdtype(values.dtype, np.floating):
+        return get("compact_keep_f32")(values.astype(np.float32, copy=False), row_offsets, keep)
+    return get("compact_keep_i32")(values.astype(np.int32, copy=False), row_offsets, keep)
+```
+
+If Step 1 found a float64 dosage/ccf dtype, the `.astype(np.float32)` would lose precision and break parity — in that case add a `compact_keep_f64` core/wrapper and route float64 to it instead of down-casting. The numba reference preserves the input dtype, so the parity test (which feeds the same dtype to both) will catch any mismatch.
+
+- [ ] **Step 5: Parity strategy + test (both dtypes)**
+
+Append to `tests/parity/strategies.py` a `compact_keep_inputs(dtype)` generator producing `(values[dtype], row_offsets int64, keep bool)`; add two parametrized tests in `test_flat_variants_parity.py` for int32 and float32 that call `assert_kernel_parity_tuple("compact_keep_i32"/"compact_keep_f32", ...)`.
+
+```python
+@st.composite
+def compact_keep_inputs(draw, dtype):
+    n_rows = draw(st.integers(1, 6))
+    counts = [draw(st.integers(0, 5)) for _ in range(n_rows)]
+    row_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total = int(row_offsets[-1])
+    if np.issubdtype(np.dtype(dtype), np.floating):
+        values = np.array(
+            draw(st.lists(st.floats(width=32, allow_nan=False, allow_infinity=False),
+                          min_size=total, max_size=total)), dtype)
+    else:
+        values = np.array(
+            draw(st.lists(st.integers(0, 1000), min_size=total, max_size=total)), dtype)
+    keep = np.array(
+        draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_)
+    return (values, row_offsets, keep)
+```
+
+```python
+from tests.parity.strategies import compact_keep_inputs
+
+
+@given(compact_keep_inputs(np.int32))
+def test_compact_keep_i32_parity(inputs):
+    assert_kernel_parity_tuple("compact_keep_i32", *inputs)
+
+
+@given(compact_keep_inputs(np.float32))
+def test_compact_keep_f32_parity(inputs):
+    assert_kernel_parity_tuple("compact_keep_f32", *inputs)
+```
+
+- [ ] **Step 6: Run parity + cargo, commit**
+
+Run: `pixi run -e dev pytest tests/parity/test_flat_variants_parity.py -q && pixi run -e dev cargo-test`
+Expected: PASS.
+
+```bash
+rtk git add src/variants/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_flat_variants.py tests/parity/strategies.py tests/parity/test_flat_variants_parity.py Cargo.toml
+rtk git commit -m "$(cat <<'EOF'
+perf(variants): port _compact_keep numba->rust (i32/f32, parity-gated)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 8: Port `_fill_empty_scalar` + `_fill_empty_fixed` to Rust
+
+Dummy-fill for empty groups. Numba reference: `_flat_variants.py:555-576` (scalar) and `628-656` (fixed). Both insert one dummy element/variant per empty row. `_fill_empty_scalar`'s `data`/`fill` dtype varies by field (int / float). Use the same dtype-dispatch approach as Task 7.
+
+**Files:**
+- Modify: `src/variants/mod.rs`, `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_flat_variants.py`, `tests/parity/strategies.py`, `tests/parity/test_flat_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust cores): `variants::fill_empty_scalar_{i32,f32}(data, offsets, fill) -> (Array1<T>, Array1<i64>)`; `variants::fill_empty_fixed_{i32,f32}(data, offsets, inner: i64, fill) -> (Array1<T>, Array1<i64>)`. Confirm production dtypes in Step 1 (start/ilen → int; dosage → float; flank_tokens → int).
+- Produces (Python): `_fill_empty_scalar(data, offsets, fill)` and `_fill_empty_fixed(data, offsets, inner, fill)` dispatch wrappers (existing names/signatures preserved — call sites at lines 314, 419, 427).
+
+- [ ] **Step 1: Confirm field dtypes**
+
+Run: `rtk grep -n "_fill_empty_scalar(\|_fill_empty_fixed(" python/genvarloader/_dataset/_flat_variants.py`
+For each call, determine `data.dtype` (the `f.data` / `ft.data` arrays). Record which dtypes occur (expected: int32/int64 for start/ilen/flank_tokens, float32 for dosage). Add a typed core per distinct dtype; do NOT down-cast (parity).
+
+- [ ] **Step 2: Rust cores + cargo tests**
+
+Append to `src/variants/mod.rs` generic impls + typed wrappers:
+
+```rust
+fn fill_empty_scalar_impl<T: Copy>(
+    data: ArrayView1<T>, offsets: ArrayView1<i64>, fill: T,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let ln = offsets[i + 1] - offsets[i];
+        new_offsets[i + 1] = new_offsets[i] + if ln > 0 { ln } else { 1 };
+    }
+    let total = new_offsets[n_rows] as usize;
+    // Fill buffer with `fill` so empty-row slots are already correct; then copy.
+    let mut new_data = Array1::<T>::from_elem(total, fill);
+    for i in 0..n_rows {
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        let mut d = new_offsets[i] as usize;
+        if e != s {
+            for k in s..e {
+                new_data[d] = data[k];
+                d += 1;
+            }
+        }
+    }
+    (new_data, new_offsets)
+}
+
+fn fill_empty_fixed_impl<T: Copy>(
+    data: ArrayView1<T>, offsets: ArrayView1<i64>, inner: i64, fill: T,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let nv = offsets[i + 1] - offsets[i];
+        new_offsets[i + 1] = new_offsets[i] + if nv > 0 { nv } else { 1 };
+    }
+    let total_vars = new_offsets[n_rows] as usize;
+    let inner_u = inner as usize;
+    let mut new_data = Array1::<T>::from_elem(total_vars * inner_u, fill);
+    let mut dptr = 0usize;
+    for i in 0..n_rows {
+        let vs = offsets[i] as usize;
+        let ve = offsets[i + 1] as usize;
+        if ve == vs {
+            dptr += inner_u; // already filled
+        } else {
+            for k in vs * inner_u..ve * inner_u {
+                new_data[dptr] = data[k];
+                dptr += 1;
+            }
+        }
+    }
+    (new_data, new_offsets)
+}
+```
+
+Add `_i32`/`_f32` (and any other confirmed dtype) public wrappers calling the impls, plus cargo tests asserting the empty-row insertion and pass-through for one int and one float case.
+
+- [ ] **Step 3: PyO3 wrappers + register; Step 4: Python dtype-dispatch wrappers**
+
+Mirror Task 7: register `"fill_empty_scalar_<dtype>"` and `"fill_empty_fixed_<dtype>"`; rename numba defs to `_*_numba`; the public `_fill_empty_scalar`/`_fill_empty_fixed` wrappers pick the entry by `data.dtype` and pass `fill` as a python scalar (PyO3 receives it as `T`). `inner` is passed as `i64`.
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 5: Parity strategies + tests**
+
+Add `fill_empty_scalar_inputs(dtype)` and `fill_empty_fixed_inputs(dtype)` generators (offsets with some empty rows guaranteed; random `fill`; `inner` 1..4 for fixed) and parametrized parity tests for each confirmed dtype in `test_flat_variants_parity.py`.
+
+- [ ] **Step 6: Run parity + cargo, commit**
+
+Run: `pixi run -e dev pytest tests/parity/test_flat_variants_parity.py -q && pixi run -e dev cargo-test`
+Expected: PASS.
+
+```bash
+rtk git add src/variants/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_flat_variants.py tests/parity/strategies.py tests/parity/test_flat_variants_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(variants): port _fill_empty_scalar + _fill_empty_fixed numba->rust (parity)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 9: Port `_fill_empty_seq` to Rust
+
+Two-level dummy-fill for allele bytestrings. Numba reference: `_flat_variants.py:579-625`. Returns `(new_data uint8, new_var_offsets int64, new_seq_offsets int64)`.
+
+**Files:**
+- Modify: `src/variants/mod.rs`, `src/lib.rs`, `src/ffi/mod.rs`, `python/genvarloader/_dataset/_flat_variants.py`, `tests/parity/strategies.py`, `tests/parity/test_flat_variants_parity.py`
+
+**Interfaces:**
+- Produces (Rust core): `variants::fill_empty_seq(data: ArrayView1<u8>, var_offsets: ArrayView1<i64>, seq_offsets: ArrayView1<i64>, dummy: ArrayView1<u8>) -> (Array1<u8>, Array1<i64>, Array1<i64>)`.
+- Produces (Python): `_fill_empty_seq(data, var_offsets, seq_offsets, dummy)` dispatch wrapper (existing name/signature; call sites at lines 323, 413).
+
+- [ ] **Step 1: Rust core + cargo test**
+
+Append to `src/variants/mod.rs` a faithful port (empty variant-rows receive one dummy allele of `dummy` bytes; non-empty pass through), then a cargo test covering one empty row + one non-empty row.
+
+```rust
+/// Two-level dummy-fill for allele bytestrings. Mirrors numba `_fill_empty_seq`.
+pub fn fill_empty_seq(
+    data: ArrayView1<u8>,
+    var_offsets: ArrayView1<i64>,
+    seq_offsets: ArrayView1<i64>,
+    dummy: ArrayView1<u8>,
+) -> (Array1<u8>, Array1<i64>, Array1<i64>) {
+    let n_rows = var_offsets.len() - 1;
+    let l = dummy.len() as i64;
+    let mut new_var = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let nv = var_offsets[i + 1] - var_offsets[i];
+        new_var[i + 1] = new_var[i] + if nv > 0 { nv } else { 1 };
+    }
+    let total_vars = new_var[n_rows] as usize;
+    let mut new_seq = Array1::<i64>::zeros(total_vars + 1);
+    let mut vptr = 0usize;
+    for i in 0..n_rows {
+        let vs = var_offsets[i] as usize;
+        let ve = var_offsets[i + 1] as usize;
+        if ve == vs {
+            new_seq[vptr + 1] = new_seq[vptr] + l;
+            vptr += 1;
+        } else {
+            for v in vs..ve {
+                let vlen = seq_offsets[v + 1] - seq_offsets[v];
+                new_seq[vptr + 1] = new_seq[vptr] + vlen;
+                vptr += 1;
+            }
+        }
+    }
+    let mut new_data = Array1::<u8>::zeros(new_seq[total_vars] as usize);
+    let mut dptr = 0usize;
+    for i in 0..n_rows {
+        let vs = var_offsets[i] as usize;
+        let ve = var_offsets[i + 1] as usize;
+        if ve == vs {
+            for k in 0..dummy.len() {
+                new_data[dptr] = dummy[k];
+                dptr += 1;
+            }
+        } else {
+            for v in vs..ve {
+                let bs = seq_offsets[v] as usize;
+                let be = seq_offsets[v + 1] as usize;
+                for k in bs..be {
+                    new_data[dptr] = data[k];
+                    dptr += 1;
+                }
+            }
+        }
+    }
+    (new_data, new_var, new_seq)
+}
+```
+
+- [ ] **Step 2: PyO3 wrapper + register; Step 3: Python wrapper**
+
+Append the `ffi::fill_empty_seq` pyfunction (`-> (PyArray1<u8>, PyArray1<i64>, PyArray1<i64>)`), register in lib.rs; in `_flat_variants.py` rename njit → `_fill_empty_seq_numba`, register `"fill_empty_seq"`, and define the `_fill_empty_seq` dispatch wrapper coercing `data`/`dummy` to uint8 and offsets to int64.
+Run: `pixi run -e dev cargo-test`
+Expected: PASS.
+
+- [ ] **Step 4: Parity strategy + test**
+
+Add `fill_empty_seq_inputs` (var_offsets with at least one empty row; nested seq_offsets; random dummy bytes) and a parity test using `assert_kernel_parity_tuple("fill_empty_seq", ...)`.
+
+- [ ] **Step 5: Run parity + cargo, commit**
+
+Run: `pixi run -e dev pytest tests/parity/test_flat_variants_parity.py -q && pixi run -e dev cargo-test`
+Expected: PASS.
+
+```bash
+rtk git add src/variants/mod.rs src/lib.rs src/ffi/mod.rs python/genvarloader/_dataset/_flat_variants.py tests/parity/strategies.py tests/parity/test_flat_variants_parity.py
+rtk git commit -m "$(cat <<'EOF'
+perf(variants): port _fill_empty_seq numba->rust (parity-gated)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 10: Variants-mode dataset-level parity backstop
+
+Variants output mode (`with_seqs("variants")`) has no differential coverage today. Add a dataset-level test mirroring `tests/parity/test_dataset_parity.py` (tracks mode), with a spy asserting the Rust flat kernels are actually invoked (no vacuous pass — the Phase 0 lesson).
+
+**Files:**
+- Create: `tests/parity/test_variants_dataset_parity.py`
+- Reference: `tests/parity/test_dataset_parity.py`, `tests/parity/_fixtures.py`
+
+**Interfaces:**
+- Consumes: the registered kernels `gather_rows`, `gather_alleles`, `compact_keep_*`, `fill_empty_*` and a variants-capable dataset fixture.
+
+- [ ] **Step 1: Read the existing backstop pattern**
+
+Read `tests/parity/test_dataset_parity.py` and `tests/parity/_fixtures.py` in full. Reuse the dataset fixture; if it has no variants-mode dataset, build one via the fixture helpers (a small written dataset with variants).
+
+- [ ] **Step 2: Write the backstop test**
+
+Create `tests/parity/test_variants_dataset_parity.py`:
+
+```python
+import numpy as np
+import pytest
+
+from genvarloader._dataset import _flat_variants
+from genvarloader import _dispatch
+
+pytestmark = pytest.mark.parity
+
+
+def _run_variants_getitem(ds):
+    """Materialize a variants-mode getitem over the whole dataset."""
+    vds = ds.with_seqs("variants")
+    return vds[:, :]
+
+
+def test_variants_getitem_parity_and_kernels_invoked(variants_dataset, monkeypatch):
+    # Spy: count rust gather_rows calls so a vacuous pass is impossible.
+    calls = {"n": 0}
+    real = _dispatch.get("gather_rows")
+
+    def spy(*args, **kwargs):
+        calls["n"] += 1
+        return real(*args, **kwargs)
+
+    # numba reference
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = _run_variants_getitem(variants_dataset)
+
+    # rust + spy
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    monkeypatch.setattr(
+        _flat_variants, "get",
+        lambda name: spy if name == "gather_rows" else _dispatch.get(name),
+    )
+    out_rust = _run_variants_getitem(variants_dataset)
+
+    assert calls["n"] > 0, "rust gather_rows was never invoked — vacuous parity"
+    # Compare each parallel field of the RaggedVariants output byte-identically.
+    # (Adapt field access to the RaggedVariants API: .alts, .refs, .v_idxs, etc.)
+    for field in ("v_idxs", "alts", "refs"):
+        a = np.asarray(getattr(out_numba, field).data)
+        b = np.asarray(getattr(out_rust, field).data)
+        np.testing.assert_array_equal(a, b)
+```
+
+Note: adjust `variants_dataset` fixture wiring and the `RaggedVariants` field names to the actual API (inspect `get_variants_flat`'s return and `_rag_variants.py`). The two essentials are (1) the spy proving the Rust kernel ran and (2) byte-identical field comparison.
+
+- [ ] **Step 3: Run the backstop**
+
+Run: `pixi run -e dev pytest tests/parity/test_variants_dataset_parity.py -q`
+Expected: PASS, with the spy assertion satisfied.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add tests/parity/test_variants_dataset_parity.py tests/parity/_fixtures.py
+rtk git commit -m "$(cat <<'EOF'
+test(parity): variants-mode dataset backstop (spy-guarded, byte-identical)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+### Task 11: Full-suite gate, no-regression measurement, roadmap update
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md`
+
+- [ ] **Step 1: Full test tree (both backends)**
+
+Run: `pixi run -e dev pytest tests -q`
+Expected: PASS (covers `tests/dataset` AND `tests/unit`, per CLAUDE.md).
+Run with the numba backend forced to confirm the reference path still works:
+`GVL_BACKEND=numba pixi run -e dev pytest tests/dataset tests/unit -q`
+Expected: PASS.
+
+- [ ] **Step 2: Lint + typecheck + format**
+
+Run: `pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format --check python/ tests/ && pixi run -e dev typecheck`
+Expected: PASS. Fix any issues, re-run.
+
+- [ ] **Step 3: abi3 wheel build**
+
+Run: `pixi run -e dev cargo-test` (already builds) and confirm a clean maturin build per the repo's build task.
+Expected: builds clean.
+
+- [ ] **Step 4: No-regression measurement on `chr22_geuv`**
+
+Build the corpus if absent: `pixi run -e dev python tests/benchmarks/data/build_realistic.py` (needs `/carter` or `GVL_BENCH_SOURCE`).
+Run haps mode (exercises get_diffs_sparse + choose_exonic_variants):
+`pixi run -e dev python tests/benchmarks/profiling/profile.py --mode haps`
+Compare to baseline **123.9 batch/s** — assert no regression (within noise).
+Run variants mode (exercises the flat gather/fill kernels):
+`pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants`
+Compare to baseline **145.3 batch/s** — assert no regression.
+Record both numbers (rust vs numba) for the roadmap. If a regression appears, profile and consider rayon on the hot kernel (allowed by the constraints only if needed).
+
+- [ ] **Step 5: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`:
+- Phase 2 header: set status 🚧→ (✅ when all gates green) + PR link.
+- Fix the double-count: change the `_genotypes.py` line to "assembly/selection kernels (`get_diffs_sparse`, `choose_exonic_variants`); reconstruction kernels moved to Phase 3"; tick the `_genotypes.py` and `_flat_variants.py` items.
+- Note `filter_af` deleted as dead (cross-reference the Phase 0 `splits_sum_le_value` precedent).
+- Add a dated entry to the decisions log summarizing: kernels ported, dead-code deletion, `(2,n)` offset normalization, dtype-dispatch for `compact_keep`/`fill_empty_*`, gate = parity + no regression, and the measured haps/variants throughput (rust vs numba).
+- Record measurements in the metrics narrative.
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "$(cat <<'EOF'
+docs(roadmap): Phase 2 genotype assembly + variant gather complete
+
+Ported get_diffs_sparse + choose_exonic_variants + 7 flat gather/fill kernels
+to Rust (parity-gated); deleted dead filter_af; fixed Phase 2/3 double-count.
+No getitem regression (haps/variants vs baseline).
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
+EOF
+)"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Port `get_diffs_sparse` → Task 2. ✅
+- Port `choose_exonic_variants` (+ inner) → Task 3 (inner kept as numba-only helper). ✅
+- Delete dead `filter_af` → Task 4. ✅
+- Port 7 flat kernels → Tasks 5 (`_gather_v_idxs`+`_ss` as `gather_rows`), 6 (`_gather_alleles`), 7 (`_compact_keep`), 8 (`_fill_empty_scalar`+`_fill_empty_fixed`), 9 (`_fill_empty_seq`). 2+1+1+2+1 = 7. ✅
+- `src/genotypes/` + `src/variants/` pure-ndarray cores, `src/ffi/` PyO3 only → Tasks 2/3 (genotypes), 5–9 (variants). ✅
+- Dispatch registry, default rust, numba retained as reference → every port task. ✅
+- Both offset forms via `(2,n)` normalization → Tasks 2/3/5. ✅
+- Sequential (no rayon) → cores written sequentially; rayon only if Task 11 finds a regression. ✅
+- Per-kernel hypothesis parity gates + variants-mode dataset backstop → Tasks 2–9 + Task 10. ✅
+- Gate = parity + no regression, haps 123.9 / variants 145.3 baselines → Task 11. ✅
+- Roadmap update incl. double-count fix → Task 11. ✅
+- Harness tuple support (needed because Phase 2 kernels return tuples) → Task 1. ✅
+
+**Placeholder scan:** Tasks 8 and 10 intentionally describe a repeated pattern (typed dtype wrappers / fixture wiring) rather than transcribing every near-identical variant — each names the exact functions, dtypes, signatures, and reference line numbers needed, and shows the generic Rust impl + one concrete strategy/test. This is pattern-repetition guidance, not a TBD; the int32 path is shown in full and float follows identically.
+
+**Type consistency:** `_as_starts_stops` defined in Task 2, imported in Tasks 3 and 5. `assert_kernel_parity_tuple` defined in Task 1, used in Tasks 2–9. `gather_rows` (Rust) ↔ `"gather_rows"` (registry) ↔ `_gather_rows` (Python) consistent. `compact_keep_i32`/`compact_keep_f32` names consistent across core/ffi/registry/test. OFFSET_TYPE confirmed int64 in Task 3 Step 1 before relying on i64 returns.
+
+**Open items the implementer MUST resolve (flagged inline, not deferred):**
+- Task 3 Step 1: confirm `OFFSET_TYPE == int64`.
+- Task 7 Step 1 / Task 8 Step 1: confirm production value dtypes for `_compact_keep` (dosage/ccf) and `_fill_empty_*` (start/ilen/dosage/flank_tokens); add a typed core if float64 appears (do NOT down-cast — would break parity).
+- Task 5: confirm `geno_v_idxs`/`self.genotypes.data` dtype is int32.
+- Task 10: confirm the `RaggedVariants` field names + add a variants-capable fixture if absent.
diff --git a/docs/superpowers/plans/2026-06-24-rust-migration-phase-3.md b/docs/superpowers/plans/2026-06-24-rust-migration-phase-3.md
new file mode 100644
index 00000000..831208e9
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-24-rust-migration-phase-3.md
@@ -0,0 +1,815 @@
+# Phase 3 — Reconstruction + Track Realignment Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Port the 8 numba-only read-path kernel groups (reference fetch, haplotype reconstruction, track realignment + insertion-fill, track→interval RLE) to Rust as byte-identical 1:1 parity twins behind dispatch, then fuse the haplotypes and tracks `__getitem__` read paths into single Rust boundary crossings.
+
+**Architecture:** Strangler-fig, identical to Phase 2. Each kernel becomes a pure-`ndarray`/`rayon` core in a new `src/` domain module, wrapped by a `#[pyfunction]` in `src/ffi/mod.rs`, registered in `src/lib.rs`, and wired into the existing `genvarloader._dispatch` registry (default `rust`; numba retained as parity reference). Parity is hard-gated (byte-identical); throughput is recorded only.
+
+**Tech Stack:** Rust (ndarray 0.17, rayon 1.12, pyo3 0.28 abi3-py310, numpy 0.28), maturin build, Python 3.10–3.13, numba (reference impls), hypothesis + pytest (parity), pixi (`-e dev`).
+
+## Global Constraints
+
+- **Parity is the hard gate.** Every ported kernel must be **byte-identical** (dtype + shape + values via `np.testing.assert_array_equal`) to its numba twin across hypothesis-generated inputs before it lands. Throughput is **recorded only** — no throughput gate this phase (per the 2026-06-24 decision; the throughput gate lives in Phase 5).
+- **Dispatch contract:** new kernels register via `genvarloader._dispatch.register(name, numba=<fn>, rust=<fn>, default="rust")`. `GVL_BACKEND=numba|rust` force-overrides all kernels (used by parity sweeps). Numba impls stay as the registered reference; they are deleted wholesale in Phase 5, **not** this phase.
+- **Type floors (confirmed at runtime in Phase 2):** `OFFSET_TYPE` = `int64`, genoray `V_IDX_TYPE` = `int32`, `DOSAGE_TYPE` = `float32`. Reference/haplotype bytes are `uint8` (viewed `S1`). Track values are `float32`. Insertion-fill `params` are `float64`; `strategy_ids` are `int8`; PRNG seeds are `uint64`.
+- **Numba-fidelity rule:** accumulate length sums in a wider int (`i64`) and truncate on store to mirror numpy's `int32`-slot assignment (Phase 2 precedent in `src/genotypes/mod.rs`). For unsigned PRNG arithmetic, use **wrapping** `u64` ops to mirror numba's `np.uint64` overflow semantics exactly.
+- **Offset normalization:** offsets may arrive 1-D `(n+1,)` or 2-D `(2, n)`. Reuse the established `_as_starts_stops` helper (`_genotypes.py:112`) so both backends consume the single `(2, n)` int64 form.
+- **abi3 wheels must keep building** across py310–313 × linux/macOS (standing CI invariant).
+- **Out of scope this phase:** `_insertion_fill.py:lower` and `_splice.py:build_splice_plan` stay plain Python; variant-flat/flank kernels (done Phase 2); wholesale numba deletion + crate consolidation (Phase 5); genoray IO (Phase 6).
+- **Test tmp filesystem:** dataset tests need pytest's tmp on the same filesystem as `tests/data` — run with `--basetemp=<repo>/.pytest_tmp` or the write-path `os.link` hardlink fails cross-device (Errno 18).
+- **Branch:** all work lands incrementally on `phase-3-reconstruction` (off `rust-migration`); the phase merges to `rust-migration` as ONE bundled PR. Commit after every kernel.
+
+---
+
+## The porting recipe (every kernel task in §3a–§3c follows this)
+
+This is the invariant mechanical loop. Each task below supplies only the parts that differ (numba source reference, Rust core signature, ffi signature, dispatch name + wiring location, cargo tests, parity strategy + assertion). The 9 steps are always:
+
+1. **Write the failing parity test** — add a hypothesis strategy to `tests/parity/strategies.py` and a `test_<name>_parity.py` under `tests/parity/` using the harness (`assert_kernel_parity` / `assert_kernel_parity_tuple` / `assert_inplace_kernel_parity`). Import the owning `_dataset` module so `register()` runs.
+2. **Run it, verify it FAILS** — `pixi run -e dev pytest tests/parity/test_<name>_parity.py -v`. Expected: `KeyError: no kernel registered as '<name>'` (rust not wired yet) or a `register()`-time failure. (Numba-only kernels aren't registered yet, so the test fails until both backends exist.)
+3. **Write the Rust core** in `src/<module>/mod.rs` (pure ndarray, no PyO3) translating the numba source **line-by-line**, honoring the numba-fidelity rule. Add `#[cfg(test)] mod tests` cargo unit tests covering the empty/boundary/typical cases listed in the task.
+4. **Run cargo tests** — `pixi run -e dev cargo-test` (or `cargo test -p genvarloader <name>`). Expected: PASS.
+5. **Add the ffi wrapper** — a `#[pyfunction] pub fn <name>` in `src/ffi/mod.rs` (`PyReadonlyArray*::as_array()` in, `Array::into_pyarray(py)` out, `as_array_mut()` for in-place buffers, `.row(0)/.row(1)` to split normalized offsets).
+6. **Register** in `src/lib.rs` — `m.add_function(wrap_pyfunction!(ffi::<name>, m)?)?;`.
+7. **Wire dispatch** in the owning `_dataset` module — add `_<name>_rust` thin binding calling `_gvl_rust.<name>(...)`, and a `register("<name>", numba=<numba_fn>, rust=_<name>_rust, default="rust")` call. Route the production call site through `get("<name>")(...)` (or keep the existing wrapper and add the rust branch).
+8. **Build + run parity on BOTH backends** — `pixi run -e dev maturin develop` then `GVL_BACKEND=rust pytest tests/parity/test_<name>_parity.py -v` and `GVL_BACKEND=numba …`. Expected: PASS both.
+9. **Commit** — `rtk git add … && rtk git commit -m "perf(<area>): port <name> numba->rust (parity)"`.
+
+The Phase 2 reference implementations to mirror for shape/idiom: `src/genotypes/mod.rs` (core), `src/ffi/mod.rs` (boundary), `tests/parity/_harness.py` + `tests/parity/test_get_diffs_sparse_parity.py` (tests), `_genotypes.py:112-167` (`_as_starts_stops` + wrapper + `register`).
+
+---
+
+## File structure
+
+**New Rust modules (created):**
+- `src/reference/mod.rs` — `padded_slice`, `get_reference` (par/ser selection inside the core via a `parallel: bool` flag).
+- `src/reconstruct/mod.rs` — `reconstruct_haplotype_from_sparse` (singular) + `reconstruct_haplotypes_from_sparse` (batch, rayon), with the optional annotation outputs.
+- `src/tracks/mod.rs` — `xorshift64`, `hash4`, `apply_insertion_fill`, `shift_and_realign_track_sparse` (singular) + `shift_and_realign_tracks_sparse` (batch, rayon), `tracks_to_intervals` (+ `scanned_mask`/`compact_mask`).
+
+**Modified:**
+- `src/ffi/mod.rs` — one `#[pyfunction]` per ported entry kernel.
+- `src/lib.rs` — `pub mod reference; pub mod reconstruct; pub mod tracks;` + `add_function` lines.
+- `python/genvarloader/_dataset/_reference.py`, `_genotypes.py`, `_tracks.py`, `_intervals.py` — `_<name>_rust` bindings + `register(...)` + call-site routing.
+- `python/genvarloader/_dataset/_utils.py` — `padded_slice` stays (numba reference) but its production callers move behind dispatch via `get_reference`.
+
+**New tests:**
+- `tests/parity/strategies.py` — extend with reference/reconstruct/track input strategies.
+- `tests/parity/test_get_reference_parity.py`, `test_reconstruct_haplotypes_parity.py`, `test_shift_and_realign_tracks_parity.py`, `test_tracks_to_intervals_parity.py`.
+- `tests/parity/test_dataset_parity.py` — extend the existing spy-guarded backstop with haplotypes-mode and tracks-mode (realign) `ds[:, :]` byte-identical checks + fused-path assertions.
+
+---
+
+# Sub-unit 3a — Reference path (warm-up, low parity risk)
+
+### Task 1: `padded_slice` Rust core
+
+Port the leaf used by all reference fetches. It is njit-internal (not a Python entry), so it gets **no** dispatch registration of its own — it is exercised through `get_reference` (Task 2). This task lands the Rust core + cargo tests only.
+
+**Files:**
+- Create: `src/reference/mod.rs`
+- Modify: `src/lib.rs` (add `pub mod reference;`)
+
+**Numba source to mirror:** `python/genvarloader/_dataset/_utils.py:14-48` (`padded_slice`).
+
+**Interfaces:**
+- Produces (consumed by Task 2): `pub fn padded_slice(arr: ArrayView1<u8>, start: i64, stop: i64, pad_val: u8, out: ArrayViewMut1<u8>)` — writes into `out` in place, mirroring the numba semantics: `start >= stop` → no-op; `stop < 0` → fill `pad_val`; otherwise copy `arr[start:stop]` with left/right padding where the slice runs past `[0, len(arr))`.
+
+- [ ] **Step 1: Write the Rust core + cargo tests**
+
+```rust
+//! Reference sequence assembly cores (pure ndarray). PyO3 lives in `crate::ffi`.
+use ndarray::{ArrayView1, ArrayViewMut1};
+
+/// Copy `arr[start:stop]` into `out`, padding with `pad_val` where the slice
+/// runs past `[0, arr.len())`. Mirrors numba `padded_slice`
+/// (`_dataset/_utils.py`). `out.len()` MUST equal `stop - start` for the
+/// in-bounds case (the caller guarantees this via out_offsets).
+pub fn padded_slice(
+    arr: ArrayView1<u8>,
+    start: i64,
+    stop: i64,
+    pad_val: u8,
+    mut out: ArrayViewMut1<u8>,
+) {
+    if start >= stop {
+        return;
+    }
+    if stop < 0 {
+        out.fill(pad_val);
+        return;
+    }
+    let len = arr.len() as i64;
+    let pad_left = (-start).max(0);
+    let pad_right = (stop - len).max(0);
+    if pad_left == 0 && pad_right == 0 {
+        // out[:] = arr[start:stop]
+        out.assign(&arr.slice(ndarray::s![start as usize..stop as usize]));
+        return;
+    }
+    let out_len = out.len() as i64;
+    if pad_left > 0 && pad_right > 0 {
+        let out_stop = out_len - pad_right;
+        out.slice_mut(ndarray::s![..pad_left as usize]).fill(pad_val);
+        out.slice_mut(ndarray::s![pad_left as usize..out_stop as usize])
+            .assign(&arr);
+        out.slice_mut(ndarray::s![out_stop as usize..]).fill(pad_val);
+    } else if pad_left > 0 {
+        // out[:pad_left] = pad; out[pad_left:] = arr[:stop]
+        out.slice_mut(ndarray::s![..pad_left as usize]).fill(pad_val);
+        out.slice_mut(ndarray::s![pad_left as usize..])
+            .assign(&arr.slice(ndarray::s![..stop as usize]));
+    } else {
+        // pad_right > 0: out[:out_stop] = arr[start:]; out[out_stop:] = pad
+        let out_stop = out_len - pad_right;
+        out.slice_mut(ndarray::s![..out_stop as usize])
+            .assign(&arr.slice(ndarray::s![start as usize..]));
+        out.slice_mut(ndarray::s![out_stop as usize..]).fill(pad_val);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::{arr1, Array1};
+
+    fn run(arr: &[u8], start: i64, stop: i64, pad: u8) -> Vec<u8> {
+        let a = arr1(arr);
+        let mut out = Array1::<u8>::zeros((stop - start).max(0) as usize);
+        padded_slice(a.view(), start, stop, pad, out.view_mut());
+        out.to_vec()
+    }
+
+    #[test]
+    fn in_bounds() {
+        assert_eq!(run(&[1, 2, 3, 4, 5], 1, 4, 0), vec![2, 3, 4]);
+    }
+    #[test]
+    fn pad_left_only() {
+        assert_eq!(run(&[1, 2, 3], -2, 2, 9), vec![9, 9, 1, 2]);
+    }
+    #[test]
+    fn pad_right_only() {
+        assert_eq!(run(&[1, 2, 3], 1, 5, 9), vec![2, 3, 9, 9]);
+    }
+    #[test]
+    fn pad_both() {
+        assert_eq!(run(&[1, 2], -1, 3, 9), vec![9, 1, 2, 9]);
+    }
+    #[test]
+    fn empty_when_start_ge_stop() {
+        assert_eq!(run(&[1, 2, 3], 2, 2, 9), Vec::<u8>::new());
+    }
+    #[test]
+    fn all_pad_when_stop_negative() {
+        let a = arr1(&[1u8, 2, 3]);
+        let mut out = Array1::<u8>::zeros(3);
+        padded_slice(a.view(), -5, -1, 7, out.view_mut());
+        // stop < 0 → numba returns early after filling pad_val on the whole out
+        assert_eq!(out.to_vec(), vec![7, 7, 7]);
+    }
+}
+```
+
+- [ ] **Step 2: Declare the module** — add `pub mod reference;` to the module list at the top of `src/lib.rs`.
+
+- [ ] **Step 3: Run cargo tests, verify PASS**
+
+Run: `pixi run -e dev cargo-test`
+Expected: the 6 `reference::tests::*` cases PASS (and the existing suite stays green).
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add src/reference/mod.rs src/lib.rs
+rtk git commit -m "perf(reference): port padded_slice numba->rust core (cargo-tested)"
+```
+
+---
+
+### Task 2: `get_reference` entry kernel (core + ffi + dispatch + parity)
+
+**Files:**
+- Modify: `src/reference/mod.rs` (add `get_reference`), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_reference.py` (`_get_reference_rust` + `register` + route `get_reference`)
+- Create: `tests/parity/test_get_reference_parity.py`; extend `tests/parity/strategies.py`
+
+**Numba source to mirror:** `_reference.py:685-723` (`_get_reference_par/_ser`, `_get_reference_row`) + `get_reference` Python entry. The kernel writes `out[out_offsets[i]:out_offsets[i+1]] = padded_slice(ref[c_s:c_e], start, end, pad_char)` for each region `i`, where `regions[i] = (c_idx, start, end)` and `c_s,c_e = ref_offsets[c_idx], ref_offsets[c_idx+1]`. Parallel vs serial is a pure scheduling choice (disjoint out-slices) selected by `should_parallelize(out_offsets[-1])` — **byte-identical regardless of scheduling**, so the Rust core takes a `parallel: bool` flag and uses rayon when true.
+
+**Interfaces:**
+- Produces: `pub fn get_reference(regions: ArrayView2<i32>, out_offsets: ArrayView1<i64>, reference: ArrayView1<u8>, ref_offsets: ArrayView1<i64>, pad_char: u8, parallel: bool) -> Array1<u8>` (length `out_offsets[-1]`).
+- ffi: `#[pyfunction] pub fn get_reference(py, regions: PyReadonlyArray2<i32>, out_offsets: PyReadonlyArray1<i64>, reference: PyReadonlyArray1<u8>, ref_offsets: PyReadonlyArray1<i64>, pad_char: u8, parallel: bool) -> Bound<PyArray1<u8>>`.
+- dispatch name: `"get_reference"`.
+
+- [ ] **Step 1: Add hypothesis strategy** to `tests/parity/strategies.py`
+
+```python
+@st.composite
+def get_reference_inputs(draw):
+    """Generate (regions, out_offsets, reference, ref_offsets, pad_char, parallel)
+    with regions whose [start,end) windows may run off either contig edge."""
+    import numpy as np
+    n_contigs = draw(st.integers(1, 3))
+    contig_lens = [draw(st.integers(1, 40)) for _ in range(n_contigs)]
+    ref_offsets = np.concatenate([[0], np.cumsum(contig_lens)]).astype(np.int64)
+    reference = draw(
+        arrays(np.uint8, int(ref_offsets[-1]), elements=st.integers(0, 255))
+    )
+    n_regions = draw(st.integers(1, 6))
+    regions = np.empty((n_regions, 3), np.int32)
+    lengths = []
+    for i in range(n_regions):
+        c = draw(st.integers(0, n_contigs - 1))
+        clen = contig_lens[c]
+        start = draw(st.integers(-5, clen + 5))
+        length = draw(st.integers(0, clen + 5))
+        regions[i] = (c, start, start + length)
+        lengths.append(length)
+    out_offsets = np.concatenate([[0], np.cumsum(lengths)]).astype(np.int64)
+    pad_char = draw(st.integers(0, 255))
+    parallel = draw(st.booleans())
+    return regions, out_offsets, reference, ref_offsets, np.uint8(pad_char), parallel
+```
+
+- [ ] **Step 2: Write the failing parity test** — `tests/parity/test_get_reference_parity.py`
+
+```python
+import pytest
+from hypothesis import given, settings
+
+from genvarloader._dataset import _reference  # noqa: F401  (triggers register())
+from tests.parity._harness import assert_kernel_parity
+from tests.parity.strategies import get_reference_inputs
+
+pytestmark = pytest.mark.parity
+
+
+@settings(deadline=None)
+@given(get_reference_inputs())
+def test_get_reference_parity(inputs):
+    regions, out_offsets, reference, ref_offsets, pad_char, parallel = inputs
+    assert_kernel_parity(
+        "get_reference", regions, out_offsets, reference, ref_offsets, pad_char, parallel
+    )
+```
+
+- [ ] **Step 3: Run it, verify FAIL**
+
+Run: `pixi run -e dev pytest tests/parity/test_get_reference_parity.py -q`
+Expected: FAIL — `KeyError: no kernel registered as 'get_reference'`.
+
+- [ ] **Step 4: Add the Rust core** to `src/reference/mod.rs`
+
+```rust
+use ndarray::{Array1, ArrayView1, ArrayView2};
+use rayon::prelude::*;
+
+/// Fetch padded reference rows for each region into one flat buffer.
+/// `regions[i] = (contig_idx, start, end)`. Mirrors numba
+/// `_get_reference_par/_ser` + `_get_reference_row`. Scheduling (rayon vs
+/// serial) does not affect output — out-slices are disjoint.
+pub fn get_reference(
+    regions: ArrayView2<i32>,
+    out_offsets: ArrayView1<i64>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+    parallel: bool,
+) -> Array1<u8> {
+    let total = out_offsets[out_offsets.len() - 1] as usize;
+    let mut out = Array1::<u8>::zeros(total);
+    let n = regions.nrows();
+
+    // Build disjoint mutable row slices so we can fill each region independently.
+    let row = |i: usize, dst: &mut [u8]| {
+        let c_idx = regions[[i, 0]] as usize;
+        let start = regions[[i, 1]] as i64;
+        let end = regions[[i, 2]] as i64;
+        let c_s = ref_offsets[c_idx] as usize;
+        let c_e = ref_offsets[c_idx + 1] as usize;
+        let contig = reference.slice(ndarray::s![c_s..c_e]);
+        let mut dst_view = ndarray::ArrayViewMut1::from(dst);
+        padded_slice(contig, start, end, pad_char, dst_view.view_mut());
+    };
+
+    // Partition `out` into per-region chunks by out_offsets, then fill.
+    let bounds: Vec<(usize, usize)> = (0..n)
+        .map(|i| (out_offsets[i] as usize, out_offsets[i + 1] as usize))
+        .collect();
+    let out_slice = out.as_slice_mut().unwrap();
+    if parallel {
+        // split_at_mut chain over sorted disjoint bounds via chunks_by indices
+        let mut chunks: Vec<&mut [u8]> = Vec::with_capacity(n);
+        let mut rest = out_slice;
+        let mut cursor = 0usize;
+        for &(s, e) in &bounds {
+            let (_, tail) = rest.split_at_mut(s - cursor);
+            let (mid, tail2) = tail.split_at_mut(e - s);
+            chunks.push(mid);
+            rest = tail2;
+            cursor = e;
+        }
+        chunks
+            .into_par_iter()
+            .enumerate()
+            .for_each(|(i, dst)| row(i, dst));
+    } else {
+        for (i, &(s, e)) in bounds.iter().enumerate() {
+            row(i, &mut out_slice[s..e]);
+        }
+    }
+    out
+}
+```
+
+Add cargo tests covering: a fully in-bounds region; a region straddling the left edge (`start < 0`); a region straddling the right edge (`end > contig_len`); two contigs with a region in each; `parallel=true` vs `false` produce identical buffers.
+
+- [ ] **Step 5: Run cargo tests, verify PASS** — `pixi run -e dev cargo-test`.
+
+- [ ] **Step 6: Add the ffi wrapper** to `src/ffi/mod.rs`
+
+```rust
+use crate::reference;
+
+#[pyfunction]
+pub fn get_reference<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,
+    out_offsets: PyReadonlyArray1<i64>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    parallel: bool,
+) -> Bound<'py, PyArray1<u8>> {
+    let out = reference::get_reference(
+        regions.as_array(),
+        out_offsets.as_array(),
+        reference.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        parallel,
+    );
+    out.into_pyarray(py)
+}
+```
+
+- [ ] **Step 7: Register** in `src/lib.rs` — add `m.add_function(wrap_pyfunction!(ffi::get_reference, m)?)?;`.
+
+- [ ] **Step 8: Wire dispatch** in `_reference.py`. Add the rust binding + registration and route the existing `get_reference` entry through dispatch:
+
+```python
+from genvarloader import _genvarloader as _gvl_rust  # match existing import alias
+from genvarloader._dispatch import register, get
+
+
+def _get_reference_numba(regions, out_offsets, reference, ref_offsets, pad_char, parallel):
+    out = np.empty(out_offsets[-1], np.uint8)
+    kernel = _get_reference_par if parallel else _get_reference_ser
+    return kernel(regions, out_offsets, reference, ref_offsets, pad_char, out)
+
+
+def _get_reference_rust(regions, out_offsets, reference, ref_offsets, pad_char, parallel):
+    return _gvl_rust.get_reference(
+        np.ascontiguousarray(regions, np.int32),
+        np.ascontiguousarray(out_offsets, np.int64),
+        np.ascontiguousarray(reference, np.uint8),
+        np.ascontiguousarray(ref_offsets, np.int64),
+        int(pad_char),
+        bool(parallel),
+    )
+
+
+register("get_reference", numba=_get_reference_numba, rust=_get_reference_rust, default="rust")
+
+
+def get_reference(regions, out_offsets, reference, ref_offsets, pad_char):
+    parallel = should_parallelize(int(out_offsets[-1]))
+    return get("get_reference")(regions, out_offsets, reference, ref_offsets, pad_char, parallel)
+```
+
+Note: `parallel` is computed in the Python entry (not inside the kernels) so both backends receive the identical flag — this keeps the numba twin byte-identical to today's behavior and makes the strategy's `parallel` field meaningful.
+
+- [ ] **Step 9: Build + run parity on both backends**
+
+Run:
+```bash
+pixi run -e dev maturin develop
+pixi run -e dev pytest tests/parity/test_get_reference_parity.py -q
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity/test_get_reference_parity.py -q
+```
+Expected: PASS (default rust) and PASS (forced numba).
+
+- [ ] **Step 10: Commit**
+
+```bash
+rtk git add src/reference/mod.rs src/ffi/mod.rs src/lib.rs \
+  python/genvarloader/_dataset/_reference.py \
+  tests/parity/test_get_reference_parity.py tests/parity/strategies.py
+rtk git commit -m "perf(reference): port get_reference numba->rust (parity, default rust)"
+```
+
+---
+
+### Task 3: spliced-reference parity backstop
+
+`_fetch_spliced_ref` (`_reference.py:728-755`) is plain Python that permutes regions via `SplicePlan` then calls `get_reference`. It needs **no** new kernel — Task 2 already covers its hot call. This task adds a dataset-level backstop proving the rust `get_reference` is byte-identical through the splice path.
+
+**Files:**
+- Modify: `tests/parity/test_dataset_parity.py`
+
+**Interfaces:**
+- Consumes: the `get_reference` dispatch from Task 2; the existing dataset fixtures + backend-forcing helper used by the Phase 0/2 backstops.
+
+- [ ] **Step 1: Add a spy-guarded reference-mode backstop test**
+
+Add a test that opens a reference-bearing dataset (reuse the existing parity fixtures), spies on `genvarloader._genvalloader.get_reference` (or the `_get_reference_rust` binding) to assert it is invoked, materializes `ds[:, :]` for a reference/spliced query under `GVL_BACKEND=rust` and `GVL_BACKEND=numba`, and asserts the two are byte-identical and non-trivially non-zero (the Phase 0 spy lesson — a vacuous pass must be impossible).
+
+```python
+def test_reference_mode_dataset_parity(parity_ref_dataset, force_backend, kernel_spy):
+    with kernel_spy("get_reference") as spy:
+        rust = materialize(parity_ref_dataset, backend="rust")
+    assert spy.called
+    numba = materialize(parity_ref_dataset, backend="numba")
+    assert_ragged_byte_identical(rust, numba)
+    assert rust.data.size > 0 and (rust.data != 0).any()
+```
+
+(Use the existing helpers in `test_dataset_parity.py`; the names above mirror its Phase 2 patterns — adapt to the actual fixture/spy utilities in that file.)
+
+- [ ] **Step 2: Run, verify PASS** — `pixi run -e dev pytest tests/parity/test_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+rtk git add tests/parity/test_dataset_parity.py
+rtk git commit -m "test(parity): reference-mode + spliced dataset backstop (spy-guarded)"
+```
+
+---
+
+# Sub-unit 3b — Haplotype reconstruction (core)
+
+### Task 4: `reconstruct_haplotype_from_sparse` (singular) Rust core
+
+The ~190-line workhorse. Port it first in isolation with exhaustive cargo tests **before** the batch driver, because every parity edge case lives here (negative `ref_start` padding, DEL spanning start, overlapping ALTs, shift consumption across ref+allele, right-pad with `pad_char`, and the annotation arrays `annot_v_idxs`/`annot_ref_pos`).
+
+**Files:**
+- Create: `src/reconstruct/mod.rs`
+- Modify: `src/lib.rs` (`pub mod reconstruct;`)
+
+**Numba source to mirror EXACTLY (line-by-line):** `_genotypes.py:277-465` (`reconstruct_haplotype_from_sparse`). Preserve every branch, including the `allele_start_idx == v_len` early-`continue`, the `out_idx + ref_len >= length` break, and the final unfilled/right-pad clause. Annotation writes: reference runs write `annot_v_idxs = -1` and `annot_ref_pos = arange(ref_idx, ref_idx+ref_len)`; allele runs write `annot_v_idxs = variant` and `annot_ref_pos = v_pos`; trailing pad writes `annot_v_idxs = -1` and `annot_ref_pos = i32::MAX` (note: the **leading** pad uses `-1` for ref_pos, the **trailing** pad uses `i32::MAX` — they differ; replicate exactly).
+
+**Interfaces:**
+- Produces: `pub fn reconstruct_haplotype_from_sparse(v_idxs: ArrayView1<i32>, v_starts: ArrayView1<i32>, ilens: ArrayView1<i32>, shift: i64, alt_alleles: ArrayView1<u8>, alt_offsets: ArrayView1<i64>, ref_: ArrayView1<u8>, ref_start: i64, out: ArrayViewMut1<u8>, pad_char: u8, keep: Option<ArrayView1<bool>>, annot_v_idxs: Option<ArrayViewMut1<i32>>, annot_ref_pos: Option<ArrayViewMut1<i32>>)`.
+
+- [ ] **Step 1: Port the core** to `src/reconstruct/mod.rs`, translating `_genotypes.py:277-465` statement-by-statement. Keep `ref_idx`, `out_idx`, `shifted` as `i64`/`usize` mirroring the numba ints; use `slice`/`assign`/`fill` for the block writes. Thread the two optional annotation views through with `if let Some(..)` guards at each write site.
+
+- [ ] **Step 2: Add cargo unit tests** covering, each as a named case with hand-computed expected bytes:
+  - No variants, `shift=0`, in-bounds → `out == ref[ref_start:ref_start+len]`.
+  - Negative `ref_start` → leading pad of `pad_char`, `annot_ref_pos == -1` over the pad.
+  - A single SNP (ilen 0) → one byte replaced, `annot_v_idxs == variant` at that base.
+  - A 2bp insertion (ilen +2) → allele bytes spliced in, downstream ref shifted.
+  - A deletion (ilen −2) → ref skipped, `ref_idx` advances to `v_ref_end`.
+  - DEL spanning `ref_start` (`v_pos < ref_start`, `v_diff < 0`, `v_ref_end >= ref_start`) → `ref_idx = v_ref_end`, variant not emitted.
+  - Overlapping ALTs at the same pos → only the first applied.
+  - `shift` consumed partly by ref + partly by allele (`allele = allele[allele_start_idx:]`).
+  - Right-pad clause: `out` longer than ref+variants → trailing `pad_char`, trailing `annot_ref_pos == i32::MAX`.
+  - Annotated vs non-annotated calls produce identical `out` bytes.
+
+- [ ] **Step 3: Run cargo tests, verify PASS** — `pixi run -e dev cargo-test`.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add src/reconstruct/mod.rs src/lib.rs
+rtk git commit -m "perf(reconstruct): port reconstruct_haplotype_from_sparse core (cargo-tested)"
+```
+
+---
+
+### Task 5: `reconstruct_haplotypes_from_sparse` (batch) + ffi + dispatch + parity
+
+**Files:**
+- Modify: `src/reconstruct/mod.rs` (batch driver), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_genotypes.py` (binding + `register`), `python/genvarloader/_dataset/_haps.py` (route both reconstruct methods through dispatch)
+- Create: `tests/parity/test_reconstruct_haplotypes_parity.py`; extend `strategies.py`
+
+**Numba source to mirror:** `_genotypes.py:158-275` (`reconstruct_haplotypes_from_sparse`). The batch driver loops `(query, hap)`, slices each region's reference (`ref[ref_offsets[c_idx]:ref_offsets[c_idx+1]]`), genotype variant indices (`geno_v_idxs[o_s:o_e]` via normalized offsets), per-(query,hap) keep slice, and the out / annotation sub-slices by `out_offsets[k_idx]:out_offsets[k_idx+1]`, then calls the singular kernel. Per-(query,hap) out-slices are disjoint → rayon-parallelizable, byte-identical to numba's `prange`.
+
+**Interfaces:**
+- Produces: `pub fn reconstruct_haplotypes_from_sparse(out: ArrayViewMut1<u8>, out_offsets, regions: ArrayView2<i32>, shifts: ArrayView2<i32>, geno_offset_idx: ArrayView2<i64>, geno_o_starts: ArrayView1<i64>, geno_o_stops: ArrayView1<i64>, geno_v_idxs: ArrayView1<i32>, v_starts, ilens, alt_alleles, alt_offsets, ref_, ref_offsets, pad_char, keep: Option<...>, keep_offsets: Option<...>, annot_v_idxs: Option<ArrayViewMut1<i32>>, annot_ref_pos: Option<ArrayViewMut1<i32>>)` — writes `out` (and optional annotation buffers) in place.
+- ffi: `#[pyfunction] pub fn reconstruct_haplotypes_from_sparse(...)` — takes the normalized `(2,n)` geno_offsets and splits with `.row(0)/.row(1)`; out + annotation buffers via `PyReadwriteArray1`; the two annotation params are `Option<PyReadwriteArray1<i32>>`.
+- dispatch name: `"reconstruct_haplotypes_from_sparse"`.
+
+> **Rayon + in-place annotation note:** because three buffers (`out`, `annot_v_idxs`, `annot_ref_pos`) are written by disjoint per-(query,hap) slices, parallelize by pre-splitting each buffer into disjoint chunks (same `split_at_mut` chaining as Task 2) and zipping the three chunk-vectors per work item. Keep a serial path for the non-annotated common case and verify both produce identical output in cargo tests.
+
+- [ ] **Step 1: Add the batch strategy** to `strategies.py` — `reconstruct_haplotypes_inputs()` generating a small reference (1–2 contigs), a handful of variants (SNP/ins/del mix) with `v_starts`/`ilens`/`alt_alleles`/`alt_offsets`, sparse genotype offsets, `regions`, `shifts` (0 and small positive), optional `keep`/`keep_offsets`, and out_offsets sized to the query windows. Yield the inputs in **both** annotated and non-annotated variants (a `annotate: bool` field), with the out + annotation buffers built by an `out_factory` for the in-place harness.
+
+- [ ] **Step 2: Write the failing parity test** — `tests/parity/test_reconstruct_haplotypes_parity.py` using `assert_inplace_kernel_parity("reconstruct_haplotypes_from_sparse", inputs, out_factory, out_index)` for the non-annotated case, plus a tuple variant asserting all three buffers (out + annot_v + annot_pos) byte-identical for the annotated case (build a small helper mirroring `assert_inplace_kernel_parity` that compares all three written buffers).
+
+- [ ] **Step 3: Run it, verify FAIL** — `KeyError: no kernel registered as 'reconstruct_haplotypes_from_sparse'`.
+
+- [ ] **Step 4: Implement the batch driver** in `src/reconstruct/mod.rs` (serial + rayon paths) calling the Task 4 singular kernel.
+
+- [ ] **Step 5: Run cargo tests, verify PASS** — include a cargo test asserting serial == parallel on a multi-region input.
+
+- [ ] **Step 6: Add the ffi wrapper** + register in `src/lib.rs`.
+
+- [ ] **Step 7: Wire dispatch** in `_genotypes.py` (mirror the `get_diffs_sparse` wrapper: a `register(...)` plus a public `reconstruct_haplotypes_from_sparse` wrapper that normalizes offsets via `_as_starts_stops` and dispatches). Update `_haps.py:_reconstruct_haplotypes` and `_reconstruct_annotated_haplotypes` to call the dispatched wrapper (they already pass the exact kwargs; only the import/callee changes — keep the `_Flat.from_offsets(...).view("S1")` wrapping unchanged).
+
+- [ ] **Step 8: Build + parity both backends** — `maturin develop`; run the parity test under default and `GVL_BACKEND=numba`. Expected PASS both.
+
+- [ ] **Step 9: Commit**
+
+```bash
+rtk git add src/reconstruct/mod.rs src/ffi/mod.rs src/lib.rs \
+  python/genvarloader/_dataset/_genotypes.py python/genvarloader/_dataset/_haps.py \
+  tests/parity/test_reconstruct_haplotypes_parity.py tests/parity/strategies.py
+rtk git commit -m "perf(reconstruct): port reconstruct_haplotypes_from_sparse batch (parity, default rust)"
+```
+
+---
+
+### Task 6: haplotypes-mode dataset backstop
+
+**Files:**
+- Modify: `tests/parity/test_dataset_parity.py`
+
+- [ ] **Step 1: Add a spy-guarded haplotypes-mode backstop** — spy on the `reconstruct_haplotypes_from_sparse` rust binding, materialize `ds[:, :]` for a haplotypes query (and a spliced-haplotypes query) under both backends, assert byte-identical haplotype bytes **and** (for the annotated path) the variant-index + ref-coord arrays. Assert non-trivial output.
+
+- [ ] **Step 2: Run, verify PASS** — `pytest tests/parity/test_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp`.
+
+- [ ] **Step 3: Commit** — `test(parity): haplotypes + spliced-haps dataset backstop (spy-guarded)`.
+
+---
+
+# Sub-unit 3c — Track realignment + RLE (hairiest; parity risks live here)
+
+### Task 7: PRNG (`xorshift64`, `hash4`) Rust core + direct parity
+
+The FlankSample fill is the highest parity risk. Lock the PRNG **before** the kernel that uses it, with a direct numba-vs-rust sequence comparison.
+
+**Files:**
+- Create: `src/tracks/mod.rs`
+- Modify: `src/lib.rs` (`pub mod tracks;`), `src/ffi/mod.rs` (temporary debug export, see below)
+- Create: `tests/parity/test_prng_parity.py`; expose a tiny numba helper in `_tracks.py`
+
+**Numba source to mirror:** `_tracks.py:37-53` (`_xorshift64`, `_hash4`). All ops are on `np.uint64` → use Rust `u64` **wrapping** shifts/xors: `x ^= x.wrapping_shl(13)` etc. (shifts by 13/7/17). `hash4(a,b,c,d) = xorshift64(xorshift64(xorshift64(a^b)^c)^d)`.
+
+**Interfaces:**
+- Produces: `pub fn xorshift64(x: u64) -> u64`, `pub fn hash4(a: u64, b: u64, c: u64, d: u64) -> u64`.
+
+- [ ] **Step 1: Implement + cargo-test** the two functions in `src/tracks/mod.rs` with a hardcoded expected vector (compute the first few outputs by hand / from the numba definition and assert).
+
+```rust
+/// One round of xorshift64 (wrapping, mirrors numba `_xorshift64` on np.uint64).
+#[inline(always)]
+pub fn xorshift64(mut x: u64) -> u64 {
+    x ^= x.wrapping_shl(13);
+    x ^= x >> 7;
+    x ^= x.wrapping_shl(17);
+    x
+}
+
+/// Hash four u64 into one (mirrors numba `_hash4`).
+#[inline(always)]
+pub fn hash4(a: u64, b: u64, c: u64, d: u64) -> u64 {
+    let mut h = a;
+    h = xorshift64(h ^ b);
+    h = xorshift64(h ^ c);
+    h = xorshift64(h ^ d);
+    h
+}
+```
+
+- [ ] **Step 2: Add a direct numba-vs-rust PRNG parity test.** Temporarily expose the rust `hash4` via a `#[pyfunction]` (e.g. `ffi::_debug_hash4`) and a numba `_hash4` accessor in `_tracks.py`, then over a hypothesis grid of `(a,b,c,d)` `uint64` quadruples assert `rust_hash4(a,b,c,d) == int(_hash4(a,b,c,d))`. This is the single most important guard for FlankSample byte-identity.
+
+```python
+@given(st.integers(0, 2**64 - 1), st.integers(0, 2**64 - 1),
+       st.integers(0, 2**64 - 1), st.integers(0, 2**64 - 1))
+def test_hash4_parity(a, b, c, d):
+    from genvarloader._dataset._tracks import _hash4
+    import numpy as np
+    exp = int(_hash4(np.uint64(a), np.uint64(b), np.uint64(c), np.uint64(d)))
+    assert _gvl_rust._debug_hash4(a, b, c, d) == exp
+```
+
+- [ ] **Step 3: Run both (cargo + pytest), verify PASS.**
+
+- [ ] **Step 4: Commit** — `perf(tracks): port xorshift64/hash4 PRNG (direct numba parity)`.
+
+---
+
+### Task 8: `apply_insertion_fill` (4 strategies) Rust core
+
+**Files:**
+- Modify: `src/tracks/mod.rs`
+
+**Numba source to mirror:** `_tracks.py:56-139` (`_apply_insertion_fill`). Strategy IDs (`src/tracks` mirrors `_insertion_fill.py`): `REPEAT_5P=0`, `REPEAT_5P_NORM=1`, `CONSTANT=2`, `FLANK_SAMPLE=3`, `INTERPOLATE=4`. **Float-parity risk lives in INTERPOLATE** — replicate the Lagrange evaluation in the *exact same operation order*: anchors built 5′ side first (`xs[j] = -j`, `ys[j] = track[max(v_rel_pos-j,0)]`) then 3′ side (`xs[k+j] = v_len + j`, `ys[k+j] = track[min(v_rel_pos+1+j, track_len-1)]`), and the per-output accumulation `acc += ys[a] * Π_{b≠a} (x - xs[b])/(xs[a] - xs[b])` with `x = i as f64`, looping `a` outer, `b` inner, skipping `b==a`. Keep all interpolation math in `f64` and store the final `acc` into the `f32` out (matching numba, where `out` is float32 and the arithmetic is float64).
+
+**Interfaces:**
+- Produces: `pub fn apply_insertion_fill(out: &mut ArrayViewMut1<f32>, out_idx: usize, writable_length: usize, v_len: i64, track: ArrayView1<f32>, v_rel_pos: i64, strategy_id: i64, params: ArrayView1<f64>, base_seed: u64, query: u64, hap: u64)`. FlankSample uses `hash4(base_seed, query, hap, (out_idx+i) as u64) % pool_size` for each position `i` (note: `query`/`hap` and `out_idx+i` are the per-position seed components — replicate the cast order exactly).
+
+- [ ] **Step 1: Implement** the four branches in `src/tracks/mod.rs`. For `REPEAT_5P_NORM` divide `track[v_rel_pos]` by `v_len as f32`... — **match the numba dtype**: numba computes `track[v_rel_pos] / v_len` where `track` is f32 and `v_len` is a python int → numpy promotes to f32 result? Confirm by reading the numba: the value is stored into f32 `out`; compute in the same precision numba uses (f32/f32 or f64). Mirror exactly; cargo-test against hand values.
+
+- [ ] **Step 2: Cargo-test each strategy** with a fixed `track`, `params`, `base_seed`: Repeat5pNorm (sum-preserving), Constant (params[0]), FlankSample (deterministic given seed — assert exact indices chosen), Interpolate order 1/2/3 (assert against hand-computed Lagrange values; order-1 endpoints must equal the two flanking track values).
+
+- [ ] **Step 3: Run cargo tests, verify PASS.**
+
+- [ ] **Step 4: Commit** — `perf(tracks): port apply_insertion_fill (4 strategies) core (cargo-tested)`.
+
+---
+
+### Task 9: `shift_and_realign_track[s]_sparse` + ffi + dispatch + parity
+
+**Files:**
+- Modify: `src/tracks/mod.rs` (singular + batch), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_tracks.py` (binding + `register`), `python/genvarloader/_dataset/_reconstruct.py` (route the call site at `_reconstruct.py:210-227`)
+- Create: `tests/parity/test_shift_and_realign_tracks_parity.py`; extend `strategies.py`
+
+**Numba source to mirror:** singular `_tracks.py:230-401`, batch `_tracks.py:141-228`. The singular kernel mirrors the haplotype reconstruct shift logic but on f32 track values, with three key differences: SNPs (`v_diff == 0`) are skipped (tracks match ref there); insertions route to `apply_insertion_fill` unless `strategy_id == REPEAT_5P` (which repeats `track[v_rel_pos]`); deletions/Repeat5p repeat `track[v_rel_pos]`; trailing fill pads with `0` (not `pad_char`). Batch driver loops `(query, hap)` with disjoint out-slices (rayon-safe) and passes `query`/`hap` indices through for the FlankSample seed.
+
+**Interfaces:**
+- Produces: `pub fn shift_and_realign_tracks_sparse(out: ArrayViewMut1<f32>, out_offsets, regions: ArrayView2<i32>, shifts: ArrayView2<i32>, geno_offset_idx: ArrayView2<i64>, geno_v_idxs: ArrayView1<i32>, geno_o_starts: ArrayView1<i64>, geno_o_stops: ArrayView1<i64>, v_starts, ilens, tracks: ArrayView1<f32>, track_offsets: ArrayView1<i64>, params: ArrayView1<f64>, keep: Option<...>, keep_offsets: Option<...>, strategy_id: i64, base_seed: u64)`.
+- ffi `#[pyfunction] pub fn shift_and_realign_tracks_sparse(...)` — `out` via `PyReadwriteArray1<f32>`; normalized `(2,n)` geno_offsets split with `.row()`; `params` is a 1-D `f64` slice (the per-track row already indexed Python-side as `strat_params[track_ofst]`).
+- dispatch name: `"shift_and_realign_tracks_sparse"`.
+
+- [ ] **Step 1: Add the batch strategy** to `strategies.py` — generate a track (f32), variants (SNP/ins/del mix), sparse genos, regions, shifts, optional keep, and for the fill strategy sample `strategy_id ∈ {0,1,2,3,4}` with matching `params` (Constant value; FlankSample width≥0; Interpolate order∈{1,2,3}) and a random `base_seed`. Provide an `out_factory` building the f32 out buffer.
+
+- [ ] **Step 2: Write the failing parity test** using `assert_inplace_kernel_parity("shift_and_realign_tracks_sparse", inputs, out_factory, out_index)`. Ensure the strategy exercises **all five** strategy IDs (especially FlankSample + Interpolate) so byte-identity is proven on the risky paths.
+
+- [ ] **Step 3: Run, verify FAIL** — kernel not registered.
+
+- [ ] **Step 4: Implement** singular + batch in `src/tracks/mod.rs` (calling Task 8's `apply_insertion_fill` and Task 7's `hash4`).
+
+- [ ] **Step 5: Cargo-test** singular kernel cases (no variants → `out = track[:length]`; deletion; insertion under each strategy; shift) + serial==parallel batch.
+
+- [ ] **Step 6: ffi wrapper + register** in `src/lib.rs`.
+
+- [ ] **Step 7: Wire dispatch** in `_tracks.py` (`register(...)` + a wrapper normalizing offsets) and route the `_reconstruct.py:210-227` call site through the dispatched wrapper (kwargs already match; keep the `_Flat.from_offsets(out, out_shape, out_offsets)` wrapping unchanged).
+
+- [ ] **Step 8: Build + parity both backends.** If Interpolate float-parity fails byte-identity after honest operation-order matching, apply the documented fallback: register a strategy-dispatched rust core that handles Repeat5p/Constant/FlankSample/Repeat5pNorm and falls back to numba for `INTERPOLATE` only — and record this in the roadmap decisions log. Attempt strict byte-identity first.
+
+- [ ] **Step 9: Commit** — `perf(tracks): port shift_and_realign_tracks_sparse (parity, default rust)`.
+
+---
+
+### Task 10: `tracks_to_intervals` RLE + ffi + dispatch + parity
+
+**Files:**
+- Modify: `src/tracks/mod.rs` (`tracks_to_intervals`, `scanned_mask`, `compact_mask`), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_intervals.py` (binding + `register` + route)
+- Create: `tests/parity/test_tracks_to_intervals_parity.py`; extend `strategies.py`
+
+**Numba source to mirror:** `_intervals.py:129-220` (`tracks_to_intervals`, `_scanned_mask`, `_compact_mask`). Returns `(all_starts: i32, all_ends: i32, all_values: f32, interval_offsets: i64)`. RLE: per query, `scanned_mask` = cumulative count of value changes (`backward_mask[0]=True`, `backward_mask[i] = track[i-1] != track[i]`); `compact_mask` recovers run-boundary indices; values are `track[boundaries[:-1]]`; starts/ends are boundaries shifted by `regions[query,1]`. Note `0`-value intervals **are** included (matches numba comment). Per-query work over disjoint output ranges → rayon-safe (but the two-pass cumsum/offsets must mirror numba's `n_intervals.cumsum()`).
+
+**Interfaces:**
+- Produces: `pub fn tracks_to_intervals(regions: ArrayView2<i32>, tracks: ArrayView1<f32>, track_offsets: ArrayView1<i64>) -> (Array1<i32>, Array1<i32>, Array1<f32>, Array1<i64>)`.
+- ffi returns a 4-tuple of `Bound<PyArray*>`.
+- dispatch name: `"tracks_to_intervals"`.
+
+- [ ] **Step 1: Strategy** — generate `regions` + a piecewise-constant `tracks` f32 buffer (draw run lengths + values so RLE has interesting structure, including a single all-constant query and an empty query) + `track_offsets`.
+
+- [ ] **Step 2: Failing parity test** with `assert_kernel_parity_tuple("tracks_to_intervals", regions, tracks, track_offsets)`.
+
+- [ ] **Step 3: Run, verify FAIL.**
+
+- [ ] **Step 4: Implement** in `src/tracks/mod.rs` (two-pass: count intervals per query → cumsum offsets → fill starts/ends/values). Cargo-test against a hand-built RLE example.
+
+- [ ] **Step 5: cargo-test, verify PASS.**
+
+- [ ] **Step 6: ffi + register.**
+
+- [ ] **Step 7: Wire dispatch** in `_intervals.py`; route the production call site through `get("tracks_to_intervals")`.
+
+- [ ] **Step 8: Build + parity both backends.**
+
+- [ ] **Step 9: Commit** — `perf(intervals): port tracks_to_intervals RLE numba->rust (parity, default rust)`.
+
+---
+
+### Task 11: tracks-mode dataset backstop
+
+**Files:**
+- Modify: `tests/parity/test_dataset_parity.py`
+
+- [ ] **Step 1: Add a spy-guarded tracks-mode backstop** — spy on `shift_and_realign_tracks_sparse`, materialize `ds[:, :]` for a tracks query that triggers realignment (indel-bearing regions) under both backends across **each** insertion-fill strategy, assert byte-identical realigned tracks + non-trivial output. Include a tracks_to_intervals round-trip check if a public path exercises it.
+
+- [ ] **Step 2: Run, verify PASS** — `--basetemp=$(pwd)/.pytest_tmp`.
+
+- [ ] **Step 3: Commit** — `test(parity): tracks-realign dataset backstop across fill strategies (spy-guarded)`.
+
+---
+
+# Sub-unit 3d — Consolidation (fuse hot read paths; throughput recorded, not gated)
+
+> Goal: collapse the per-kernel boundary crossings + redundant `np.ascontiguousarray` coercions Phase 2 profiling pinned at 62% of the variants loop, for the **haplotypes** and **tracks** read paths. Parity is still hard-gated (dataset-level, byte-identical); throughput is **recorded** in the roadmap.
+
+### Task 12: Audit the haplotypes + tracks `__getitem__` glue
+
+**Files:**
+- Create: `docs/roadmaps/phase-3-getitem-glue-audit.md` (scratch findings; can be deleted before merge or folded into the roadmap)
+
+- [ ] **Step 1: Trace + list** every `np.ascontiguousarray` / boundary crossing / intermediate numpy alloc on the live haplotypes path (`__getitem__` → `_haps._reconstruct_haplotypes` → `get_diffs_sparse` → `reconstruct_haplotypes_from_sparse`) and the tracks path (`__getitem__` → `_reconstruct` → `get_diffs_sparse` → `shift_and_realign_tracks_sparse` → `intervals_to_tracks`). Use `cProfile` on `chr22_geuv` (haplotypes + tracks modes, `NUMBA_NUM_THREADS=1`) per the Phase 0 `profile.py` to confirm the coercion hotspots.
+
+- [ ] **Step 2: Decide the fusion seam** per path — the minimal single ffi entry that takes the already-available arrays once and returns the final ragged buffers, dropping intermediate Python coercions. Document the chosen signatures.
+
+- [ ] **Step 3: Commit** the audit doc — `docs(phase-3): getitem glue audit for haps/tracks fusion`.
+
+### Task 13: Fused haplotypes `__getitem__` kernel
+
+**Files:**
+- Modify: `src/reconstruct/mod.rs` (or new `src/reconstruct/fused.rs`), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_haps.py` (call the fused entry on the default path)
+- Modify: `tests/parity/test_dataset_parity.py`
+
+**Interfaces:**
+- Produces: a fused ffi entry (e.g. `reconstruct_haps_fused`) that computes diffs → out_offsets → reconstruction in one crossing from the raw genotype/variant/reference arrays, returning `(out_data, out_offsets)` (and optional annotation buffers) without Python-side coercions between sub-steps.
+
+- [ ] **Step 1: Write a dataset-level parity test FIRST** — assert the fused-path `ds[:, :]` haplotype output is byte-identical to the current composed path under `GVL_BACKEND=numba` (the numba composed pipeline remains the oracle). This is the gate.
+
+- [ ] **Step 2: Run, verify FAIL** (fused entry not yet implemented / not wired).
+
+- [ ] **Step 3: Implement** the fused entry reusing the Task 4/5 cores (call `get_diffs_sparse` core + `reconstruct_haplotypes_from_sparse` core internally; allocate `out` from computed offsets in Rust). No new algorithm — pure plumbing of existing cores.
+
+- [ ] **Step 4: Wire** `_haps._reconstruct_haplotypes` (non-splice default path) to call the fused entry; keep the unfused dispatched kernels for the splice path and as the numba oracle.
+
+- [ ] **Step 5: Build + run dataset parity** both backends; verify PASS + spy confirms the fused entry ran.
+
+- [ ] **Step 6: Record throughput** — re-run `profile.py --mode haps` on `chr22_geuv`, capture batch/s + peak RSS, confirm via cProfile the `np.ascontiguousarray` glue is gone from the fused path. Note the numbers for the roadmap (Task 15).
+
+- [ ] **Step 7: Commit** — `perf(reconstruct): fused haplotypes __getitem__ kernel (dataset parity; throughput recorded)`.
+
+### Task 14: Fused tracks `__getitem__` kernel
+
+**Files:**
+- Modify: `src/tracks/mod.rs` (or `src/tracks/fused.rs`), `src/ffi/mod.rs`, `src/lib.rs`
+- Modify: `python/genvarloader/_dataset/_reconstruct.py` (tracks path)
+- Modify: `tests/parity/test_dataset_parity.py`
+
+**Interfaces:**
+- Produces: a fused ffi entry chaining `get_diffs_sparse` → `shift_and_realign_tracks_sparse` → `intervals_to_tracks` cores in one crossing, returning the final realigned ragged tracks buffer + offsets.
+
+- [ ] **Step 1: Dataset-level parity test FIRST** — fused tracks `ds[:, :]` byte-identical to the composed numba pipeline, across fill strategies. Verify FAIL.
+
+- [ ] **Step 2: Implement** the fused entry from the existing cores (plumbing only).
+
+- [ ] **Step 3: Wire** the tracks default path to the fused entry.
+
+- [ ] **Step 4: Build + dataset parity** both backends; spy confirms fused entry ran. PASS.
+
+- [ ] **Step 5: Record throughput** — `profile.py --mode tracks` on `chr22_geuv`; capture batch/s + peak RSS.
+
+- [ ] **Step 6: Commit** — `perf(tracks): fused tracks __getitem__ kernel (dataset parity; throughput recorded)`.
+
+---
+
+# Phase close-out
+
+### Task 15: Full-tree verification, roadmap update, skill check
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md`
+- Modify (if public API changed): `skills/genvarloader/SKILL.md`
+
+- [ ] **Step 1: Full tree, both backends.** Run, all green:
+```bash
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+pixi run -e dev cargo-test
+```
+Expected: PASS (rust default) and PASS (numba forced); cargo green.
+
+- [ ] **Step 2: Lint + types + build.**
+```bash
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev ruff format --check python/ tests/
+pixi run -e dev typecheck
+pixi run -e dev maturin build   # confirm abi3 wheel builds
+```
+Expected: clean.
+
+- [ ] **Step 3: Update the roadmap** (`docs/roadmaps/rust-migration.md`):
+  - Fix the stale Phase 3 `Gate:` line → "parity hard-gate; throughput recorded only".
+  - Tick all Phase 3 checkboxes; set the phase marker ⬜→✅ + the bundled PR link.
+  - Record the fused haplotypes + tracks throughput / peak RSS (Tasks 13–14) in a Phase 3 measurement block.
+  - Add a Notes & decisions log entry mirroring the Phase 2 entry (kernels ported, fusion seams, any Interpolate-fallback decision, env notes).
+
+- [ ] **Step 4: Skill check.** Phase 3 is internal (no public API change expected). Confirm `python/genvarloader/__init__.py:__all__`, `gvl.write`, `Dataset.open`, and `Dataset.with_*` signatures/defaults are unchanged; if anything public shifted, update `skills/genvarloader/SKILL.md` per CLAUDE.md. State the result explicitly.
+
+- [ ] **Step 5: Commit + open the bundled PR** into `rust-migration`.
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): Phase 3 complete — reconstruction+tracks ported, fused paths, throughput recorded"
+rtk git push -u origin phase-3-reconstruction
+rtk gh pr create --base rust-migration --title "Phase 3: reconstruction + track realignment (Rust)" --body "..."
+```
+
+---
+
+## Self-review notes (author)
+
+- **Spec coverage:** 3a reference (Tasks 1–3), 3b reconstruction incl. annotated (Tasks 4–6), 3c tracks realign + 4 fill strategies + RLE (Tasks 7–11), 3d fuse both haplotypes+tracks (Tasks 12–14), parity-hard/throughput-recorded gate + roadmap fix (Task 15). All spec sections mapped.
+- **Parity risks** (FlankSample PRNG, Interpolate float) are isolated to their own tasks (7, 8/9) with direct guards + a documented numba fallback for Interpolate only.
+- **Type consistency:** offsets normalized via `_as_starts_stops` everywhere; `i64`-accumulate-truncate for length sums; `u64` wrapping for PRNG; f64 interpolation stored to f32; annotation leading-pad ref_pos `-1` vs trailing-pad `i32::MAX` called out explicitly.
+- **njit-internal leaves** (`padded_slice`, `_get_reference_row`, `xorshift64`, `hash4`, `apply_insertion_fill`, `scanned_mask`, `compact_mask`) get **no** dispatch registration — they land inside their entry kernel's task and are covered through it, per the Phase 0 dispatch rule.
diff --git a/docs/superpowers/plans/2026-06-25-round3-instruction-level-kernel-tuning.md b/docs/superpowers/plans/2026-06-25-round3-instruction-level-kernel-tuning.md
new file mode 100644
index 00000000..91aae6dc
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-round3-instruction-level-kernel-tuning.md
@@ -0,0 +1,325 @@
+# Round-3 Instruction-Level Kernel Tuning Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Drive the Rust read-path kernels to rust ≥ numba single-threaded on all four read paths (tracks-only, haplotypes, variants, variant-windows) by tuning their generated machine code, using perf to localize and cargo-show-asm (+llvm-mca) to inspect and verify.
+
+**Architecture:** Profile-all-first to build one consolidated, aggregate-weighted target list, then run a fixed per-kernel tune loop (inspect asm → fix → confirm asm delta → confirm throughput → confirm parity → commit-or-revert) in descending target order. No format/API/semantic change; this round only changes the instruction sequences hot kernels compile to.
+
+**Tech Stack:** Rust (ndarray, PyO3, rayon present but unused this round), `cargo-show-asm` v0.2.61 (`cargo asm`), `perf`, `maturin`, `pixi`, `pytest` + `pytest-benchmark`, `hypothesis` (parity).
+
+**Spec:** `docs/superpowers/specs/2026-06-25-round3-instruction-level-kernel-tuning-design.md`
+
+## Global Constraints
+
+Every task implicitly includes these. Values copied verbatim from the spec.
+
+- **Parity is sacrosanct:** rust output must stay **byte-identical** to numba on both backends. The two documented numba-bug exclusions (the #242-family `intervals_to_tracks` start<query clip; the reconstruct trailing-under-write overshoot) stay **unchanged** — do not touch them.
+- **Gate = wall-clock throughput, not instruction count.** A change that drops instructions but does **not** improve (or at least hold) ms/batch is **reverted**. Instruction/llvm-mca deltas are recorded as evidence only.
+- **`unsafe` budget:** safe idioms first (slice hoisting, iterators, `assert!` bound hints, codegen attrs). Targeted `unsafe` (`get_unchecked` / explicit SIMD) only where the bound is provably safe but the optimizer keeps the check; every `unsafe` carries a `// SAFETY:` comment and is gated by passing parity.
+- **No scope creep:** no on-disk format change, no public API change, no new kernels, no rayon/batch parallelism (Phase 5), no numba deletion (Phase 5).
+- **Measurement env (every throughput/asm number):** corpus `tests/benchmarks/data/chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples, 82 neg / 83 pos strand), `with_len(16384)`, `BATCH=32`, `NUMBA_NUM_THREADS=1`, `maturin develop --release`, Carter HPC (AMD EPYC 7543, linux-64). **Report the rust ÷ numba ratio, not absolute batch/s** (shared-node load varies across sessions).
+- **Per-path gate harness:** tracks-only & haplotypes → `tests/benchmarks/test_e2e.py` pytest-benchmark **pedantic min** (ms/batch). variants & variant-windows → `tests/benchmarks/profiling/profile.py` **wall-clock average** (2000 batches) — `test_e2e_variants` is xfailed (`_FlatVariants.to_fixed` gap) so no pedantic min exists for those two.
+- **Gate numbers come only from the plain `--release` build.** The `[profile.profiling]` profile is for perf attribution only and is never the measured artifact.
+- **HPC note:** dataset/parity tests need `--basetemp=$(pwd)/.pytest_tmp` (avoids `os.link` cross-device Errno 18).
+- **Roadmap contract:** this work lands as "Optimization targets — round 3" under Phase 3 in `docs/roadmaps/rust-migration.md` (not a new phase); the roadmap must be updated as part of the work.
+
+---
+
+### Task 1: Worktree + fresh pixi env + release build smoke
+
+**Files:**
+- Create: new git worktree directory (outside the repo tree), branch `opt/round3-instruction-tuning` off `rust-migration`.
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: an isolated worktree with its **own** pixi env and a working `--release` build; all later tasks run here.
+
+- [ ] **Step 1: Create the worktree via the using-git-worktrees skill**
+
+Use the `superpowers:using-git-worktrees` skill to create a worktree for branch `opt/round3-instruction-tuning` based on `rust-migration`. Do **not** symlink `.pixi` into it — `maturin develop` repoints the shared env's `.pth`/`.so` and would corrupt the parent workspace (per the `gvl-parallel-worktrees-fresh-pixi-env` note).
+
+- [ ] **Step 2: Install a fresh dev pixi env in the worktree**
+
+Run (from the worktree root): `pixi install -e dev`
+Expected: a populated `.pixi/envs/dev` local to the worktree.
+
+- [ ] **Step 3: Release build + smoke the four profile modes**
+
+Run: `pixi run -e dev maturin develop --release`
+Then smoke each mode at a tiny batch count to confirm the corpus + build work:
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode tracks --n-batches 20`
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode haplotypes --n-batches 20`
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 20`
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variant-windows --n-batches 20`
+Expected: each prints a `done wall=... throughput=... batch/s` line, no exception. (If the corpus is missing, build it: `pixi run -e dev python tests/benchmarks/data/build_realistic.py`.)
+
+- [ ] **Step 4: Confirm `cargo asm` resolves a symbol against this build**
+
+Run: `cargo asm --simplify genvarloader::intervals::intervals_to_tracks 2>&1 | head -30`
+Expected: x86-64 assembly for the function prints (confirms cargo-show-asm v0.2.61 sees the release artifact and resolves the symbol). If it lists candidates instead, copy the exact mangled path it offers — that is the canonical symbol name for later tasks.
+
+- [ ] **Step 5: Commit (worktree marker)**
+
+No code change yet; nothing to commit. Proceed.
+
+---
+
+### Task 2: Add the `[profile.profiling]` profile
+
+**Files:**
+- Modify: `Cargo.toml` (append a profile section).
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: a `profiling` cargo profile for perf call-graph attribution (used in Task 3 only when flat self-time is ambiguous). Never the measured artifact.
+
+- [ ] **Step 1: Append the profile to `Cargo.toml`**
+
+Add at the end of `Cargo.toml`:
+
+```toml
+# Perf call-graph attribution only (`perf report --children`). Inherits release
+# codegen and adds line tables + frame pointers. NEVER the gate artifact — all
+# throughput/asm gate numbers come from the plain `--release` build.
+[profile.profiling]
+inherits = "release"
+debug = "line-tables-only"
+force-frame-pointers = true
+```
+
+- [ ] **Step 2: Verify it builds**
+
+Run: `pixi run -e dev cargo build --profile profiling 2>&1 | tail -5`
+Expected: `Finished` line, no error. (This validates the profile parses; the gate build remains `maturin develop --release`.)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add Cargo.toml
+git commit -m "build(rust): add [profile.profiling] for perf call-graph attribution"
+```
+
+---
+
+### Task 3: Fresh baseline + ranked aggregate target list
+
+**Files:**
+- Create: `docs/roadmaps/round3-profile-baseline.md` (the consolidated table; the roadmap round-3 section links to it).
+
+**Interfaces:**
+- Consumes: the release build from Task 1.
+- Produces: `round3-profile-baseline.md` containing (a) per-path rust ÷ numba starting ratios and (b) a consolidated flat-self-time table with an aggregate-weight column. **No tuning task starts until this file exists** — it determines target order and overrides the "expected targets" in the spec.
+
+- [ ] **Step 1: Capture per-path throughput baselines (rust vs numba)**
+
+tracks-only & haplotypes (pedantic min):
+Run: `pixi run -e dev pytest tests/benchmarks/test_e2e.py::test_e2e_tracks_only tests/benchmarks/test_e2e.py::test_e2e_haplotypes --benchmark-only -q`
+Run again with `GVL_BACKEND=numba` prefixed to get the numba min for the same two.
+
+variants & variant-windows (profile.py wall-clock avg, 2000 batches):
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000`
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variant-windows --n-batches 2000`
+Run each again with `GVL_BACKEND=numba` prefixed.
+
+Record the four rust ÷ numba ratios.
+
+- [ ] **Step 2: Capture flat self-time perf profiles for all four paths (rust)**
+
+For each `MODE` in `tracks haplotypes variants variant-windows`:
+
+```bash
+NUMBA_NUM_THREADS=1 perf record -F 999 -o p_$MODE.data -- \
+    .pixi/envs/dev/bin/python tests/benchmarks/profiling/profile.py --mode $MODE --n-batches 12000
+perf report --stdio --no-children -i p_$MODE.data > report_$MODE.txt
+```
+
+Expected: each `report_*.txt` lists symbols by self-time with `genvarloader::...` Rust symbols resolved. (12k batches drowns one-time import/JIT.)
+
+- [ ] **Step 3: Build the consolidated aggregate-weighted table**
+
+In `docs/roadmaps/round3-profile-baseline.md`, write a table: rows = Rust kernel symbols that appear in any path's top self-time, columns = self-time % per path, plus an **Aggregate** column = sum of self-time % across the paths the kernel appears in. Shared kernels (e.g. `intervals_to_tracks`, `shift_and_realign_tracks_sparse` appear in both tracks and haplotypes) rank by total read-path cost. Include the four starting ratios from Step 1 above the table.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add docs/roadmaps/round3-profile-baseline.md
+git commit -m "docs(roadmap): round-3 profiling baseline + aggregate target list"
+```
+
+---
+
+### Task 4: TUNE LOOP TEMPLATE — apply to each target in descending aggregate-weight order
+
+> **This is the procedure every tuning task follows.** The exact code fix **cannot** be pre-written — it is determined by reading the kernel's assembly (an instruction-count pass is asm-driven by definition; fabricating a diff here would be a lie). What IS fixed and concrete: the inspect commands, the asm→fix decision tree with worked examples from this codebase, and the three gates (asm delta recorded, throughput non-regression, parity byte-identical). Instantiate this loop as a **separate commit per kernel**, taking targets from Task 3's table in order. Tasks 5–7 list the expected targets with their real source anchors; Task 3's profile reorders/prunes them.
+
+For a target kernel `K` at `crate::module::K` in `src/<file>.rs`:
+
+- [ ] **Step 1: Record the asm baseline (evidence)**
+
+Run: `cargo asm --rust crate::module::K > asm_K_before.txt`
+Run: `cargo asm --mca crate::module::K > mca_K_before.txt`
+Note from `asm_K_before.txt`: total instruction count, and from `mca_K_before.txt`: llvm-mca "Total Cycles" / "Block RThroughput". Identify the dominant cost using the decision tree in Step 3.
+
+- [ ] **Step 2: Record the throughput baseline for K's path (gate)**
+
+Run K's path harness (see Global Constraints "Per-path gate harness") for **both** backends and record the rust ÷ numba ratio. This is the number the change must improve or hold.
+
+- [ ] **Step 3: Diagnose from the asm, pick a fix class**
+
+Map the asm symptom to a fix (worked examples are real transformations from this codebase / its history):
+
+  - **Per-element bounds check** (`cmp`/`jae` to a panic block around an indexed write in the hot loop) → hoist the slice once before the loop and index the raw `&mut [T]`. *Worked example (already landed as T5, `src/intervals.rs:29,69`):* `out.as_slice_mut().unwrap()` hoisted before the interval loop, inner body `out_slice[a..b].fill(value)` on `&mut [f32]` — dropped per-interval `SliceInfo` + bounds check, no `unsafe`. If the compiler still cannot prove `a..b` in range, add `assert!(b <= out_slice.len())` before the loop (one check feeds the optimizer), or as a last resort `out_slice.get_unchecked_mut(a..b)` with `// SAFETY: a,b are clamped to [0,length] and out_s+length == out_e <= out_slice.len()`.
+  - **Scalar byte loop that should vectorize** (e.g. `rc_flat_rows_inplace`'s `for b in row.iter_mut() { *b = COMP[*b as usize] }`, `src/reverse.rs:54-56`) → the gather through `COMP` blocks autovectorization. Try: process in fixed chunks, or split reverse+complement so the reverse is a `slice::reverse` (already SIMD) and the complement is a separate tight pass; inspect whether llvm vectorizes the complement after the split. Keep the COMP table semantics identical (parity).
+  - **Redundant copy / materialization** in the loop → eliminate the intermediate, write directly into the output slice.
+  - **Register spill** (stack `mov`s in the inner loop) → reduce live values, pull invariants out of the loop, or split the function so the hot loop monomorphizes tighter.
+  - **Integer width churn** (`movsxd`/`cdqe` from `as i64`/`as usize` per element) → compute loop-invariant casts once outside the loop.
+
+Apply the chosen fix to `src/<file>.rs`. Safe idiom first; `unsafe` only per the Global Constraints budget, always with a `// SAFETY:` comment.
+
+- [ ] **Step 4: Rebuild and confirm the asm delta (evidence)**
+
+Run: `pixi run -e dev maturin develop --release`
+Run: `cargo asm --rust crate::module::K > asm_K_after.txt` and `cargo asm --mca crate::module::K > mca_K_after.txt`
+Expected: lower instruction count and/or lower llvm-mca cycles vs the `*_before.txt`. Record the delta.
+
+- [ ] **Step 5: Confirm throughput (gate) — REVERT if no win**
+
+Re-run K's path harness for both backends; recompute the rust ÷ numba ratio.
+- If ms/batch **improved or held** and parity (Step 6) passes → keep.
+- If instructions dropped but ms/batch **did not improve** → **`git checkout -- src/<file>.rs`** and record in the roadmap that K is memory/branch-bound at this floor (honest non-result). Do not force it.
+
+- [ ] **Step 6: Confirm parity (byte-identical, both backends)**
+
+Run the kernel's parity suite (Task 5–7 name the exact file per kernel), e.g.:
+Run: `pixi run -e dev pytest tests/parity/<test_file>.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS. Then the relevant cargo unit tests:
+Run: `pixi run -e dev cargo test <module> 2>&1 | tail -5`
+Expected: `test result: ok`.
+
+- [ ] **Step 7: Commit (one kernel per commit)**
+
+```bash
+git add src/<file>.rs
+git commit -m "perf(rust): tune <K> — <instr before>→<after> instrs, <ratio before>→<after>"
+```
+
+---
+
+### Task 5: Tune the tracks/haplotypes shared kernels (expected highest aggregate weight)
+
+> Instantiate the Task-4 loop for each, in the order Task 3's aggregate column gives. Real source anchors and parity files below. Skip any whose Task-3 self-time is already negligible.
+
+**Files:**
+- Modify (as the asm dictates): `src/intervals.rs`, `src/tracks/mod.rs`, `src/reverse.rs`.
+- Test: `tests/parity/test_intervals_to_tracks_parity.py`, `tests/parity/test_fused_tracks_parity.py`, `tests/parity/test_shift_and_realign_tracks_parity.py`, `tests/parity/test_dataset_parity.py`.
+
+**Interfaces:**
+- Consumes: Task 3's ranked table.
+- Produces: tuned kernels with recorded asm + ratio deltas; tracks-only and tracks-seqs paths at/above numba.
+
+- [ ] **Step 1: `genvarloader::intervals::intervals_to_tracks`** (`src/intervals.rs:16`) — run the Task-4 loop. Hot inner loop already raw-slice (T5); look for residual per-interval `as i64`/`as usize` casts (`src/intervals.rs:52-53,67-68`) and the `out_slice.fill(0.0)` prelude. Parity: `test_intervals_to_tracks_parity.py` + `test_fused_tracks_parity.py`. Gate path: `test_e2e_tracks_only`.
+- [ ] **Step 2: `genvarloader::tracks::shift_and_realign_tracks_sparse`** (`src/tracks/mod.rs`) — run the Task-4 loop. Parity: `test_shift_and_realign_tracks_parity.py` + `test_fused_tracks_parity.py`. Gate path: `test_e2e_tracks_only` and `test_e2e_tracks` (shared).
+- [ ] **Step 3: `genvarloader::reverse::reverse_flat_rows_inplace`** (`src/reverse.rs:25`, the f32 track-reverse half) — run the Task-4 loop only if Task 3 shows it hot on the tracks path. Parity: `test_fused_tracks_parity.py`. Gate path: `test_e2e_tracks_only`.
+- [ ] **Step 4: Re-confirm both gate paths after all kept changes**
+
+Run: `pixi run -e dev pytest tests/benchmarks/test_e2e.py::test_e2e_tracks_only tests/benchmarks/test_e2e.py::test_e2e_tracks --benchmark-only -q` (rust, then `GVL_BACKEND=numba`).
+Expected: recorded rust ÷ numba ratio ≥ the Task-3 starting ratio for both.
+
+---
+
+### Task 6: Tune the haplotype kernels
+
+> Instantiate the Task-4 loop for each, in Task-3 aggregate order.
+
+**Files:**
+- Modify (as the asm dictates): `src/reconstruct/mod.rs`, `src/reverse.rs`.
+- Test: `tests/parity/test_reconstruct_haplotypes_parity.py`, `tests/parity/test_fused_haps_parity.py`, `tests/parity/test_haplotypes_dataset_parity.py`.
+
+**Interfaces:**
+- Consumes: Task 3's ranked table.
+- Produces: tuned haplotype kernels; haplotypes path at/above numba.
+
+- [ ] **Step 1: `genvarloader::reconstruct::reconstruct_haplotypes_from_sparse`** (`src/reconstruct/mod.rs`) — run the Task-4 loop. Parity: `test_reconstruct_haplotypes_parity.py` + `test_fused_haps_parity.py`. Gate path: `test_e2e_haplotypes`.
+- [ ] **Step 2: `genvarloader::reverse::rc_flat_rows_inplace`** (`src/reverse.rs:41`, the byte revcomp half) — run the Task-4 loop. Decision-tree hint: the `COMP[*b as usize]` gather (`src/reverse.rs:54-56`) blocks autovectorization; try splitting `row.reverse()` (already SIMD) from the complement pass and inspect whether the complement vectorizes. Parity: `test_fused_haps_parity.py` + `test_dataset_parity.py`. Gate path: `test_e2e_haplotypes`.
+- [ ] **Step 3: Re-confirm the gate path after all kept changes**
+
+Run: `pixi run -e dev pytest tests/benchmarks/test_e2e.py::test_e2e_haplotypes --benchmark-only -q` (rust, then `GVL_BACKEND=numba`).
+Expected: recorded rust ÷ numba ratio ≥ the Task-3 starting ratio.
+
+---
+
+### Task 7: Tune the variant-windows kernels
+
+> Instantiate the Task-4 loop for each, in Task-3 aggregate order. These are the T7 profile top.
+
+**Files:**
+- Modify (as the asm dictates): `src/variants/windows.rs`.
+- Test: `tests/parity/test_assemble_variant_buffers_parity.py`, `tests/parity/test_flat_variants_parity.py`, `tests/parity/test_variants_dataset_parity.py`.
+
+**Interfaces:**
+- Consumes: Task 3's ranked table.
+- Produces: tuned variant-window assembly kernels; variant-windows path further above numba.
+
+- [ ] **Step 1: `genvarloader::variants::windows::tokenize`** (`src/variants/windows.rs`, T7 top leaf ~28%) — run the Task-4 loop. Gate path (profile.py wall-clock avg, 2000 batches): `--mode variant-windows`.
+- [ ] **Step 2: `genvarloader::variants::windows::slice_flanks`** (`src/variants/windows.rs`, ~19%) — run the Task-4 loop.
+- [ ] **Step 3: `genvarloader::variants::windows::assemble_alt_window`** (`src/variants/windows.rs`, ~13%) — run the Task-4 loop.
+- [ ] **Step 4: Re-confirm the gate path after all kept changes**
+
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variant-windows --n-batches 2000` (rust, then `GVL_BACKEND=numba`).
+Expected: recorded rust ÷ numba ratio ≥ the Task-3 starting ratio (T7 baseline 1.83×).
+
+Parity for all three: `tests/parity/test_assemble_variant_buffers_parity.py` + `tests/parity/test_flat_variants_parity.py`.
+
+---
+
+### Task 8: Full-tree gate + roadmap update + finish
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (add the round-3 section).
+
+**Interfaces:**
+- Consumes: all kept tuning commits + their recorded deltas.
+- Produces: a landed, fully-verified round-3 pass with the roadmap updated per the migration contract.
+
+- [ ] **Step 1: Full tree, rust backend**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: all pass except the known pre-existing xfails (`test_e2e_variants`, `test_haps_property` ×2, `test_indexing::test_parse_idx[missing]`, `test_ref_ds::test_getitem[no_regions]`). 0 unexpected failures.
+
+- [ ] **Step 2: Full tree, numba backend**
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: same pass/xfail profile (byte-identical parity proven on both backends).
+
+- [ ] **Step 3: cargo tests + lint + format + typecheck + wheel build**
+
+Run: `pixi run -e dev cargo test 2>&1 | tail -5` → `test result: ok`
+Run: `pixi run -e dev ruff check python/ tests/` → clean
+Run: `pixi run -e dev ruff format --check python/ tests/` → clean
+Run: `pixi run -e dev typecheck` → clean
+Run: `pixi run -e dev maturin build 2>&1 | tail -3` → abi3 wheel builds
+
+- [ ] **Step 4: Write the round-3 roadmap section**
+
+In `docs/roadmaps/rust-migration.md`, under Phase 3's optimization-targets area, add an "Optimization targets — round 3 (instruction-level, profiled <date>)" subsection containing: the Task-3 starting ratios, the consolidated target table, a per-kernel row (symbol · instr before→after · llvm-mca cycles before→after · rust÷numba before→after · kept/reverted), and the final four-path ratio summary. Add a dated entry to the "Notes & decisions log" summarizing the round (tooling = cargo-show-asm; gate = throughput; unsafe = targeted/parity-gated; any honest non-results). Update the sequencing note to mark round-3 done and restate that rayon (Phase 5) is the next lever.
+
+- [ ] **Step 5: Commit the roadmap**
+
+```bash
+git add docs/roadmaps/rust-migration.md docs/roadmaps/round3-profile-baseline.md
+git commit -m "docs(roadmap): record round-3 instruction-level tuning results"
+```
+
+- [ ] **Step 6: Finish the branch**
+
+Use the `superpowers:finishing-a-development-branch` skill to choose how to integrate `opt/round3-instruction-tuning` into `rust-migration` (the roadmap uses per-target PRs into `rust-migration`, e.g. #248/#249/#250 — follow that precedent; **no squash merge**, per the `no-squash-merges` note).
+
+---
+
+## Notes for the implementer
+
+- **Why no pre-written fix diffs:** an instruction-count pass is asm-driven — the fix is whatever the disassembly reveals, discovered at execution. Task 4 gives the real decision tree (asm symptom → fix class → worked codebase example) and the three concrete gates. A fabricated diff would be a placeholder; the gates are the real deliverable.
+- **Always rebuild `--release` before any `cargo asm` / throughput measurement.** `cargo asm` reads the last build's artifact; a stale debug build gives misleading asm.
+- **One kernel per commit** so any reverted non-result is a clean, isolated revert.
+- **Ratios over absolutes:** the Carter node is shared; numba absolute times drift between sessions. Always re-measure numba in the same session as rust and report the ratio.
diff --git a/docs/superpowers/plans/2026-06-25-rust-variant-rc-fold.md b/docs/superpowers/plans/2026-06-25-rust-variant-rc-fold.md
new file mode 100644
index 00000000..e1b20079
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-rust-variant-rc-fold.md
@@ -0,0 +1,756 @@
+# Rust Variant-Allele Reverse-Complement Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Replace the per-batch Python object churn in the variant-allele reverse-complement post-pass with a thin gvl-owned Rust kernel (`rc_alleles_inplace`) operating on the raw `_FlatAlleles` buffers, byte-identical to the existing seqpro path.
+
+**Architecture:** A pure-`ndarray` core (`src/variants/mod.rs`) reuses the Target-6 `reverse::{rc_flat_rows_inplace, COMP}` primitives; a PyO3 in-place wrapper (`src/ffi/mod.rs`) exposes it; it is registered in `_dispatch` as `rc_alleles` (rust default, the existing seqpro implementation retained as the reference backend). The two Python RC methods (`_FlatAlleles.reverse_masked`, `RaggedVariants.rc_`) route their inner RC through the dispatched kernel. RC stays positioned **after** dummy-fill (same as today), so ordering is byte-identical even for custom non-palindromic dummy alleles.
+
+**Tech Stack:** Rust (PyO3 + ndarray), Python (numpy), pytest + hypothesis (parity), cargo test, pixi (`-e dev`).
+
+## Global Constraints
+
+- **Byte-identical parity** is the migration contract: the new rust kernel must produce output identical to the existing seqpro reference across the parity matrix. A unit only lands when parity holds.
+- **Do NOT delete the seqpro reference / numba backends.** `rust-migration` is not ready to merge; the reference is retained for parity + performance gating (deletion is Phase 5). Per `[[numba-oracle-bug-policy]]` and the roadmap.
+- **No on-disk format change.** No change to `_FlatVariantWindows` (still never RC'd). No change to `flank_tokens` (the post-pass RCs only `alt`/`ref`).
+- Dispatch registry API: `register(name, *, numba=, rust=, default=)`, `get(name)(...)`, `backends(name) -> (numba, rust)`. `GVL_BACKEND=numba|rust` force-overrides.
+- Complement LUT is `_COMP = np.frombuffer(bytes.maketrans(b"ACGT", b"TGCA"), np.uint8)` (Python) ≡ `crate::reverse::COMP` (Rust). Both reverse THEN complement per allele.
+- Mask broadcast convention (must match exactly): per-region mask → per-`(b*p)` row via `np.repeat(mask, ploidy)` (done Python-side) → per-allele via `np.repeat(per_bp, np.diff(var_offsets))` (done inside the kernel).
+- Dataset tests on the HPC need `--basetemp=$(pwd)/.pytest_tmp` (os.link cross-device Errno 18).
+- Build/test commands: `pixi run -e dev cargo test`, `pixi run -e dev pytest <path> -q`, `pixi run -e dev test` (full tree), `pixi run -e dev ruff check python/ tests/`, `pixi run -e dev ruff format python/ tests/`, `pixi run -e dev typecheck`.
+
+---
+
+### Task 1: Rust core `rc_alleles_inplace` + cargo unit tests
+
+**Files:**
+- Modify: `src/variants/mod.rs` (add `rc_alleles_inplace` after `gather_alleles` ~line 52; add tests to the existing `#[cfg(test)] mod tests` or create one)
+
+**Interfaces:**
+- Consumes: `crate::reverse::{rc_flat_rows_inplace, COMP}` (existing, from Target 6).
+- Produces: `pub fn rc_alleles_inplace(byte_data: &mut [u8], seq_offsets: ArrayView1<i64>, var_offsets: ArrayView1<i64>, to_rc_row: ArrayView1<bool>)`.
+  - `byte_data`: contiguous allele bytes, mutated in place.
+  - `seq_offsets`: per-allele byte boundaries, len `n_alleles + 1`.
+  - `var_offsets`: per-`(b*p)`-row allele boundaries, len `n_rows + 1`. `to_rc_row` has len `n_rows`.
+  - For each row `g` with `to_rc_row[g]==true`, every allele `a` in `var_offsets[g]..var_offsets[g+1]` is reverse-complemented over `seq_offsets[a]..seq_offsets[a+1]` via `COMP`.
+
+- [ ] **Step 1: Write the failing tests**
+
+Add to `src/variants/mod.rs` (inside the test module; if none exists, add `#[cfg(test)] mod rc_tests { use super::*; use ndarray::array; ... }`):
+
+```rust
+#[test]
+fn rc_alleles_rcs_only_masked_rows() {
+    // 2 rows. row0 (masked) has 2 alleles: "AC","G". row1 (unmasked): "TT".
+    // seq_offsets delimit alleles: [0,2,3,5]; var_offsets delimit rows: [0,2,3].
+    let mut data = b"ACGTT".to_vec();
+    let seq_offsets = ndarray::array![0i64, 2, 3, 5];
+    let var_offsets = ndarray::array![0i64, 2, 3];
+    let to_rc_row = ndarray::array![true, false];
+    rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+    // row0: "AC"->"GT", "G"->"C"; row1 "TT" untouched.
+    assert_eq!(&data, b"GTCTT");
+}
+
+#[test]
+fn rc_alleles_all_false_is_noop() {
+    let mut data = b"ACG".to_vec();
+    let seq_offsets = ndarray::array![0i64, 1, 3];
+    let var_offsets = ndarray::array![0i64, 2];
+    let to_rc_row = ndarray::array![false];
+    rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+    assert_eq!(&data, b"ACG");
+}
+
+#[test]
+fn rc_alleles_handles_empty_allele_and_n() {
+    // 1 masked row, 2 alleles: "" (empty) and "ACN".
+    let mut data = b"ACN".to_vec();
+    let seq_offsets = ndarray::array![0i64, 0, 3];
+    let var_offsets = ndarray::array![0i64, 2];
+    let to_rc_row = ndarray::array![true];
+    rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+    // "" stays ""; "ACN" -> revcomp -> "NGT".
+    assert_eq!(&data, b"NGT");
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pixi run -e dev cargo test --lib rc_alleles`
+Expected: FAIL — `rc_alleles_inplace` not found (cannot resolve function).
+
+- [ ] **Step 3: Implement the core**
+
+Add to `src/variants/mod.rs` (after `gather_alleles`). Ensure `use crate::reverse::{rc_flat_rows_inplace, COMP};` is available — `COMP` is unused directly here (delegated), so import only what is used:
+
+```rust
+/// Reverse-complement the alleles of mask-selected `(b*p)` rows, in place.
+///
+/// `byte_data`   contiguous allele bytes (mutated in place)
+/// `seq_offsets` per-allele byte boundaries (len n_alleles + 1)
+/// `var_offsets` per-(b*p)-row allele boundaries (len n_rows + 1)
+/// `to_rc_row`   per-(b*p)-row bool mask (len n_rows)
+///
+/// Expands the row mask to a per-allele mask via `var_offsets`, then delegates
+/// to `reverse::rc_flat_rows_inplace` (reverse + `COMP`), matching the Python
+/// `np.repeat(per_bp, np.diff(var_offsets))` expansion byte-for-byte.
+pub fn rc_alleles_inplace(
+    byte_data: &mut [u8],
+    seq_offsets: ndarray::ArrayView1<i64>,
+    var_offsets: ndarray::ArrayView1<i64>,
+    to_rc_row: ndarray::ArrayView1<bool>,
+) {
+    let n_alleles = seq_offsets.len() - 1;
+    let mut per_allele = vec![false; n_alleles];
+    for g in 0..to_rc_row.len() {
+        if !to_rc_row[g] {
+            continue;
+        }
+        let a0 = var_offsets[g] as usize;
+        let a1 = var_offsets[g + 1] as usize;
+        for a in a0..a1 {
+            per_allele[a] = true;
+        }
+    }
+    let per_allele = ndarray::Array1::from_vec(per_allele);
+    crate::reverse::rc_flat_rows_inplace(byte_data, seq_offsets, per_allele.view());
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `pixi run -e dev cargo test --lib rc_alleles`
+Expected: PASS (3 tests).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/variants/mod.rs
+rtk git commit -m "feat(rust): rc_alleles_inplace core for variant-allele RC
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 2: PyO3 wrapper `rc_alleles` + registration
+
+**Files:**
+- Modify: `src/ffi/mod.rs` (add `rc_alleles` pyfunction, follow the `intervals_to_tracks` in-place pattern ~line 67)
+- Modify: `src/lib.rs` (register `ffi::rc_alleles` in the `#[pymodule]`, after `assemble_variant_buffers_i32` ~line 38)
+
+**Interfaces:**
+- Consumes: `crate::variants::rc_alleles_inplace` (Task 1).
+- Produces: pyfunction `rc_alleles(byte_data: PyReadwriteArray1<u8>, seq_offsets: PyReadonlyArray1<i64>, var_offsets: PyReadonlyArray1<i64>, to_rc_row: PyReadonlyArray1<bool>)` — mutates `byte_data` in place, returns `None`.
+
+- [ ] **Step 1: Write the failing test (Python smoke via the rust symbol)**
+
+Create `tests/unit/test_rc_alleles_ffi.py`. The compiled extension is
+`genvarloader.genvarloader` (see `_flat_variants.py:20`, `from ..genvarloader import ...`):
+
+```python
+import numpy as np
+import genvarloader.genvarloader as _gvl  # compiled rust extension module
+
+
+def test_rc_alleles_ffi_inplace():
+    # 2 rows. row0 (masked): alleles "AC","G". row1 (unmasked): "TT".
+    data = np.frombuffer(b"ACGTT", np.uint8).copy()
+    seq_offsets = np.array([0, 2, 3, 5], np.int64)
+    var_offsets = np.array([0, 2, 3], np.int64)
+    to_rc_row = np.array([True, False], np.bool_)
+    _gvl.rc_alleles(data, seq_offsets, var_offsets, to_rc_row)
+    assert data.tobytes() == b"GTCTT"
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/unit/test_rc_alleles_ffi.py -v`
+Expected: FAIL — `module ... has no attribute 'rc_alleles'`.
+
+- [ ] **Step 3: Implement the wrapper**
+
+In `src/ffi/mod.rs` (mirror `intervals_to_tracks`):
+
+```rust
+/// In-place reverse-complement of the alleles of mask-selected `(b*p)` rows.
+/// See `crate::variants::rc_alleles_inplace`.
+#[pyfunction]
+pub fn rc_alleles(
+    mut byte_data: PyReadwriteArray1<u8>,
+    seq_offsets: PyReadonlyArray1<i64>,
+    var_offsets: PyReadonlyArray1<i64>,
+    to_rc_row: PyReadonlyArray1<bool>,
+) {
+    crate::variants::rc_alleles_inplace(
+        byte_data.as_slice_mut().unwrap(),
+        seq_offsets.as_array(),
+        var_offsets.as_array(),
+        to_rc_row.as_array(),
+    );
+}
+```
+
+In `src/lib.rs`, after line 38 (`assemble_variant_buffers_i32`):
+
+```rust
+    m.add_function(wrap_pyfunction!(ffi::rc_alleles, m)?)?;
+```
+
+- [ ] **Step 4: Rebuild + run to verify it passes**
+
+Run: `pixi run -e dev pytest tests/unit/test_rc_alleles_ffi.py -v`
+(pixi rebuilds the extension via maturin automatically.)
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/ffi/mod.rs src/lib.rs tests/unit/test_rc_alleles_ffi.py
+rtk git commit -m "feat(rust): rc_alleles PyO3 wrapper + registration
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: `rc_alleles` dispatch entry (rust default + seqpro reference)
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_flat_variants.py` (add the dispatch shims + `register("rc_alleles", ...)` near the existing `register("assemble_variant_buffers", ...)` ~line 931)
+
+**Interfaces:**
+- Consumes: the rust `rc_alleles` pyfunction (Task 2); `_dispatch.register`; `genvarloader._ragged.reverse_complement_masked` + `seqpro.rag.Ragged` (reference).
+- Produces: registry entry `"rc_alleles"` with signature `(byte_data, seq_offsets, var_offsets, to_rc_row)`, both backends mutating `byte_data` in place and returning `None`. `default="rust"`.
+  - `byte_data`: `uint8` array. `seq_offsets`/`var_offsets`: `int64`. `to_rc_row`: per-`(b*p)` bool mask (already ploidy-broadcast by the caller).
+
+- [ ] **Step 1: Write the failing parity test**
+
+Create `tests/parity/test_rc_alleles_parity.py`:
+
+```python
+import numpy as np
+import pytest
+from hypothesis import given, settings
+from hypothesis import strategies as st
+
+from genvarloader._dataset import _flat_variants  # noqa: F401  (registers rc_alleles)
+from genvarloader import _dispatch
+
+_ACGTN = np.frombuffer(b"ACGTN", np.uint8)
+
+
+@st.composite
+def _allele_batch(draw):
+    n_rows = draw(st.integers(1, 4))
+    alleles_per_row = [draw(st.integers(0, 3)) for _ in range(n_rows)]
+    var_offsets = np.concatenate([[0], np.cumsum(alleles_per_row)]).astype(np.int64)
+    n_alleles = int(var_offsets[-1])
+    lens = [draw(st.integers(0, 5)) for _ in range(n_alleles)]
+    seq_offsets = np.concatenate([[0], np.cumsum(lens)]).astype(np.int64)
+    total = int(seq_offsets[-1])
+    data = _ACGTN[draw(st.lists(st.integers(0, 4), min_size=total, max_size=total))] \
+        if total else np.zeros(0, np.uint8)
+    data = np.ascontiguousarray(data, np.uint8)
+    mask = np.array([draw(st.booleans()) for _ in range(n_rows)], np.bool_)
+    return data, seq_offsets, var_offsets, mask
+
+
+@settings(max_examples=200, deadline=None)
+@given(batch=_allele_batch())
+def test_rc_alleles_rust_matches_reference(batch):
+    data, seq_offsets, var_offsets, mask = batch
+    numba_fn, rust_fn = _dispatch.backends("rc_alleles")
+    a = data.copy()
+    b = data.copy()
+    numba_fn(a, seq_offsets, var_offsets, mask)
+    rust_fn(b, seq_offsets, var_offsets, mask)
+    assert a.tobytes() == b.tobytes()
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/parity/test_rc_alleles_parity.py -q`
+Expected: FAIL — `KeyError: no kernel registered as 'rc_alleles'`.
+
+- [ ] **Step 3: Implement the shims + registration**
+
+In `python/genvarloader/_dataset/_flat_variants.py`, near the `assemble_variant_buffers` registration (~line 931), add:
+
+```python
+def _rc_alleles_reference(byte_data, seq_offsets, var_offsets, to_rc_row):
+    """Reference backend: seqpro reverse_complement_masked on a flat allele view.
+
+    `to_rc_row` is the per-(b*p) row mask (already ploidy-broadcast); expand to
+    per-allele via `var_offsets`, then RC each masked allele in place. Mutates
+    `byte_data` in place; byte-identical to `rc_alleles_inplace`.
+    """
+    from seqpro.rag import Ragged
+
+    from .._ragged import reverse_complement_masked
+
+    seq_off = np.ascontiguousarray(seq_offsets, np.int64)
+    var_off = np.ascontiguousarray(var_offsets, np.int64)
+    row_mask = np.ascontiguousarray(to_rc_row, np.bool_).reshape(-1)
+    if not row_mask.any():
+        return
+    per_allele = np.repeat(row_mask, np.diff(var_off))
+    n_alleles = len(seq_off) - 1
+    view = Ragged.from_offsets(byte_data.view("S1"), (n_alleles, None), seq_off)
+    reverse_complement_masked(view, per_allele)  # mutates byte_data in place
+
+
+def _rc_alleles_rust(byte_data, seq_offsets, var_offsets, to_rc_row):
+    _rc_alleles_rust_kernel(
+        np.ascontiguousarray(byte_data, np.uint8),  # in-place: see note below
+        np.ascontiguousarray(seq_offsets, np.int64),
+        np.ascontiguousarray(var_offsets, np.int64),
+        np.ascontiguousarray(to_rc_row, np.bool_),
+    )
+
+
+register(
+    "rc_alleles",
+    numba=_rc_alleles_reference,
+    rust=_rc_alleles_rust,
+    default="rust",
+)
+```
+
+> **In-place caveat:** `np.ascontiguousarray` returns the SAME object when input is already contiguous `uint8`, but a COPY otherwise — which would silently drop the in-place mutation. The callers (Task 4) pass contiguous `uint8` `byte_data` directly, so guard it: assert contiguity instead of coercing. Replace the `_rc_alleles_rust` body with:
+> ```python
+> def _rc_alleles_rust(byte_data, seq_offsets, var_offsets, to_rc_row):
+>     assert byte_data.dtype == np.uint8 and byte_data.flags.c_contiguous, (
+>         "rc_alleles requires a contiguous uint8 byte_data for in-place RC"
+>     )
+>     _rc_alleles_rust_kernel(
+>         byte_data,
+>         np.ascontiguousarray(seq_offsets, np.int64),
+>         np.ascontiguousarray(var_offsets, np.int64),
+>         np.ascontiguousarray(to_rc_row, np.bool_),
+>     )
+> ```
+
+Add the rust import at the top of `_flat_variants.py`, alongside the existing
+`assemble_variant_buffers_*` imports (~lines 20–24, which use `from ..genvarloader import ...`):
+
+```python
+from ..genvarloader import rc_alleles as _rc_alleles_rust_kernel
+```
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev pytest tests/parity/test_rc_alleles_parity.py -q`
+Expected: PASS (200 examples).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_flat_variants.py tests/parity/test_rc_alleles_parity.py
+rtk git commit -m "feat: register rc_alleles dispatch (rust default, seqpro reference)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 4: Route `_FlatAlleles.reverse_masked` + `RaggedVariants.rc_` through dispatch
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_flat_variants.py` (`_FlatAlleles.reverse_masked`, ~lines 119-142)
+- Modify: `python/genvarloader/_dataset/_rag_variants.py` (`RaggedVariants.rc_`, ~lines 296-351; replace only the inner `_sp_reverse_complement` call)
+
+**Interfaces:**
+- Consumes: `get("rc_alleles")` (Task 3).
+- Produces: unchanged public signatures `_FlatAlleles.reverse_masked(self, mask) -> _FlatAlleles` and `RaggedVariants.rc_(self, to_rc=None) -> RaggedVariants`; output byte-identical to before, now backend-dispatched.
+
+- [ ] **Step 1: Write the failing test (behavior pin on the rust backend)**
+
+Add to `tests/parity/test_rc_alleles_parity.py`:
+
+```python
+def test_flat_alleles_reverse_masked_uses_rc_alleles(monkeypatch):
+    """_FlatAlleles.reverse_masked must call the dispatched rc_alleles kernel."""
+    from genvarloader._dataset._flat_variants import _FlatAlleles
+    from genvarloader._dataset import _flat_variants as fv
+
+    calls = {"n": 0}
+    real = _dispatch.get
+
+    def spy(name):
+        if name == "rc_alleles":
+            calls["n"] += 1
+        return real(name)
+
+    monkeypatch.setattr(fv, "get", spy)
+
+    # one row (b=1, ploidy=1), two alleles "AC","G".
+    byte_data = np.frombuffer(b"ACG", np.uint8).copy()
+    seq_offsets = np.array([0, 2, 3], np.int64)
+    var_offsets = np.array([0, 2], np.int64)
+    fa = _FlatAlleles(byte_data, seq_offsets, var_offsets, (1, 1, None))
+    fa.reverse_masked(np.array([True], np.bool_))
+    assert calls["n"] == 1
+    # "AC"->"GT", "G"->"C"
+    assert fa.byte_data.tobytes() == b"GTC"
+```
+
+> Confirm `get` is imported into `_flat_variants.py` as a module-level name (it is used by the `assemble_variant_buffers` call site at ~line 1085 via `get("assemble_variant_buffers")`). If it is imported as `from .._dispatch import get`, the monkeypatch target `fv.get` is correct.
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/parity/test_rc_alleles_parity.py::test_flat_alleles_reverse_masked_uses_rc_alleles -q`
+Expected: FAIL — `calls["n"] == 0` (still calls seqpro directly).
+
+- [ ] **Step 3: Implement the routing**
+
+Replace `_FlatAlleles.reverse_masked` body (`_flat_variants.py` ~lines 119-142) with:
+
+```python
+    def reverse_masked(self, mask: NDArray[np.bool_]) -> "_FlatAlleles":
+        """DNA reverse-complement the mask-selected rows' alleles, in place.
+
+        ``mask`` is one entry per region (length ``b``); broadcast across ploidy
+        to a per-(b*p) row mask, then expanded per-allele inside the dispatched
+        ``rc_alleles`` kernel (rust default, seqpro reference).
+        """
+        m = np.ascontiguousarray(mask, np.bool_).reshape(-1)
+        per_bp = np.repeat(m, self.ploidy)  # per-(b*p) row mask
+        get("rc_alleles")(
+            self.byte_data,
+            np.asarray(self.seq_offsets, np.int64),
+            np.asarray(self.var_offsets, np.int64),
+            per_bp,
+        )
+        return self
+```
+
+In `RaggedVariants.rc_` (`_rag_variants.py` ~line 333), replace the single line:
+
+```python
+                _sp_reverse_complement(view, _COMP, mask=allele_mask, copy=False)
+```
+
+with a call to the dispatched kernel on the same `data` buffer. Two details:
+1. `data` is `S1` dtype (`chars.data.copy()`), but `rc_alleles` requires `uint8` — pass
+   `data.view(np.uint8)` (shares the buffer, so the in-place RC propagates back into
+   `data`, which `Ragged.from_offsets(data, ...)` then consumes at the next line).
+2. `rc_` already computed the per-allele `allele_mask` (length `n_alleles`), so make each
+   allele its own row via `var_offsets = arange(n_alleles+1)` — the kernel's row→allele
+   expansion is then the identity, reproducing the prior `mask=allele_mask` semantics:
+
+```python
+                get("rc_alleles")(
+                    data.view(np.uint8),
+                    np.asarray(char_off, np.int64),
+                    np.arange(n_alleles + 1, dtype=np.int64),
+                    allele_mask,
+                )
+```
+
+Remove the now-unused `from seqpro.rag import reverse_complement as _sp_reverse_complement`
+import at the top of `rc_` if it has no other use in that method (keep `_COMP` import
+only if still referenced; otherwise drop it). Add `from .._dispatch import get` and
+`import numpy as np` if not already imported at module scope in `_rag_variants.py`.
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev pytest tests/parity/test_rc_alleles_parity.py -q`
+Expected: PASS (all, incl. the new spy test).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_flat_variants.py python/genvarloader/_dataset/_rag_variants.py tests/parity/test_rc_alleles_parity.py
+rtk git commit -m "refactor: route variant-allele RC through dispatched rc_alleles kernel
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 5: Remove the dead spliced variant guard in `_query.py`
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_query.py` (`_getitem_spliced`, ~lines 306-321)
+
+**Interfaces:**
+- Consumes: nothing new.
+- Produces: `_getitem_spliced` no longer references `_VARIANT_TYPES_S`; spliced RC post-pass remains for the seq/annotated kinds only (the only kinds reachable on the spliced path).
+
+- [ ] **Step 1: Write the failing test (assert the guard is gone / spliced variants still rejected)**
+
+Add to `tests/dataset/test_query_spliced.py` (create if absent; otherwise append):
+
+```python
+import inspect
+
+from genvarloader._dataset import _query
+
+
+def test_spliced_has_no_dead_variant_guard():
+    src = inspect.getsource(_query._getitem_spliced)
+    assert "_VARIANT_TYPES_S" not in src, (
+        "spliced variant RC guard is unreachable (spliced variants are rejected "
+        "upstream) and must be removed"
+    )
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/dataset/test_query_spliced.py -q`
+Expected: FAIL — `_VARIANT_TYPES_S` still present in source.
+
+- [ ] **Step 3: Implement the removal**
+
+In `_getitem_spliced` (`_query.py` ~lines 306-321), replace the backend-split block:
+
+```python
+    if view.rc_neg and to_rc_per_elem is not None:
+        if _active_backend() == "numba":
+            # Numba: RC handled entirely by post-pass for all kinds.
+            recon = tuple(reverse_complement_ragged(r, to_rc_per_elem) for r in recon)
+        else:
+            # Rust: flat-seq kinds folded RC in-kernel (or Python-side inside the
+            # reconstructor).  Spliced output is never a variant type, so this
+            # branch is effectively a no-op, but we keep the guard symmetric
+            # with the unspliced path for correctness.
+            _VARIANT_TYPES_S = (RaggedVariants, _FlatVariants, _FlatVariantWindows)
+            recon = tuple(
+                reverse_complement_ragged(r, to_rc_per_elem)
+                if isinstance(r, _VARIANT_TYPES_S)
+                else r
+                for r in recon
+            )
+```
+
+with:
+
+```python
+    if view.rc_neg and to_rc_per_elem is not None:
+        # Spliced output is never a variant type (spliced variants are rejected
+        # upstream in Haps.__call__). On numba the post-pass RCs the seq/annotated
+        # kinds; on rust those kinds fold RC in-kernel, so this is a no-op there.
+        if _active_backend() == "numba":
+            recon = tuple(reverse_complement_ragged(r, to_rc_per_elem) for r in recon)
+```
+
+Then remove any now-unused imports in `_query.py` that were referenced ONLY by the
+deleted branch (`_FlatVariants`, `RaggedVariants`, `_FlatVariantWindows` may still be
+used by the unspliced path / overloads — check with `rg` before deleting; only drop
+truly unused names).
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev pytest tests/dataset/test_query_spliced.py -q && pixi run -e dev ruff check python/genvarloader/_dataset/_query.py`
+Expected: PASS; ruff clean (no unused-import error).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_query.py tests/dataset/test_query_spliced.py
+rtk git commit -m "refactor: drop unreachable spliced variant-RC guard
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 6: End-to-end neg-strand variants parity + dummy-fill / custom-allele coverage
+
+**Files:**
+- Modify: `tests/parity/test_variants_dataset_parity.py` (add neg-strand variant-RC cases + `rc_alleles` spy)
+
+**Context (read before writing):** the existing `tests/parity/test_dataset_parity.py::test_neg_strand_parity` already proves byte-identical neg-strand output across backends for `["reference","haplotypes","annotated","tracks","tracks-seqs","haps-tracks"]` — but **not `variants`**. That is the gap this task fills, reusing the same fixture (`tests/parity/_fixtures.py::build_strand_mixed_dataset`, which has −strand regions at indices 1 and 3) and the `_compare_ragged_field` helper already in `test_variants_dataset_parity.py`.
+
+**Design note (why dummy-fill is NOT a divergence risk here):** RC is applied via the dispatched `rc_alleles` kernel at the **same call site on both backends** (the `_query.py` post-pass → `reverse_masked`), which runs **after** dummy-fill. So dummy alleles are RC'd identically by rust and reference. The custom non-palindromic dummy case below is therefore regression-locking coverage (rust kernel handles dummy-filled buffers exactly like the seqpro reference), not a hunt for an ordering bug.
+
+**Interfaces:**
+- Consumes: `build_strand_mixed_dataset` (`tests/parity/_fixtures.py`); `synthetic_case` fixture (provides `.svar_path`, `.ref_path`); `_compare_ragged_field` (same file); `DummyVariant` (`genvarloader._dataset._flat_variants`); `_dispatch._REGISTRY` / `backends` (spy pattern, mirror `test_variants_getitem_parity_and_kernels_invoked`).
+- Produces: byte-identical alt/ref assertions (rust vs reference) for a neg-strand variants read, with a non-vacuity guard that `rc_alleles` actually fires, plus a custom-dummy variant case.
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/parity/test_variants_dataset_parity.py` (imports at top: add
+`from genvarloader._dataset._flat_variants import DummyVariant` and
+`from ._fixtures import build_strand_mixed_dataset` — match the import style already
+used by `test_dataset_parity.py:33`):
+
+```python
+def _read_variants_both_backends(ds, monkeypatch):
+    """Read ds[:, :] under numba then rust; return (out_numba, out_rust)."""
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = ds[:, :]
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = ds[:, :]
+    return out_numba, out_rust
+
+
+def test_neg_strand_variants_rc_parity_and_kernel_invoked(
+    tmp_path, synthetic_case, monkeypatch
+):
+    """variants-mode neg-strand RC is byte-identical across backends, and the
+    rust rc_alleles kernel actually fires on the live read (non-vacuous)."""
+    import genvarloader as gvl
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = gvl.Dataset.open(ds_dir, reference=ref).with_tracks(False).with_seqs("variants")
+
+    # Non-vacuity: fixture must carry −strand regions (rc_neg defaults True).
+    assert np.any(ds._full_regions[:, 3] == -1), "fixture has no −strand regions"
+
+    # Spy on the rust rc_alleles to prove it runs on the live neg-strand path.
+    numba_fn, rust_fn = _dispatch.backends("rc_alleles")
+    calls = {"n": 0}
+
+    def _spy_rust(*a, **k):
+        calls["n"] += 1
+        return rust_fn(*a, **k)
+
+    orig_entry = dict(_dispatch._REGISTRY["rc_alleles"])
+    _dispatch.register("rc_alleles", numba=numba_fn, rust=_spy_rust, default="rust")
+    try:
+        out_numba, out_rust = _read_variants_both_backends(ds, monkeypatch)
+    finally:
+        _dispatch._REGISTRY["rc_alleles"] = orig_entry
+
+    assert calls["n"] > 0, (
+        "rust rc_alleles was never invoked on the neg-strand variants read — "
+        "the backstop is vacuous. Confirm a variant overlaps a −strand region; if "
+        "the synthetic variant set does not, extend build_strand_mixed_dataset with a "
+        "−strand region positioned over a known variant."
+    )
+    for field_name in out_numba.fields:
+        _compare_ragged_field(out_numba[field_name], out_rust[field_name], field_name)
+
+
+def test_neg_strand_variants_custom_dummy_parity(tmp_path, synthetic_case, monkeypatch):
+    """A custom non-palindromic dummy (alt/ref = b'AC') filled into empty groups on
+    a −strand read is RC'd identically by rust and the seqpro reference."""
+    import genvarloader as gvl
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref)
+        .with_tracks(False)
+        .with_seqs("variants")
+        .with_settings(dummy_variant=DummyVariant(alt=b"AC", ref=b"AC"))
+    )
+    assert np.any(ds._full_regions[:, 3] == -1), "fixture has no −strand regions"
+
+    out_numba, out_rust = _read_variants_both_backends(ds, monkeypatch)
+    for field_name in out_numba.fields:
+        _compare_ragged_field(out_numba[field_name], out_rust[field_name], field_name)
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/parity/test_variants_dataset_parity.py -k "neg_strand_variants" -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: with Tasks 1-4 already landed this should PASS; run it FIRST against the
+pre-Task-4 state to confirm it would fail (e.g. temporarily on the prior commit it
+errors on the missing `rc_alleles` registry entry). If both already pass because
+Tasks 1-4 are merged, treat this task as adding the missing live-path coverage and
+proceed to Step 4. If `calls["n"] == 0`, apply the fixture fallback in the assert msg.
+
+- [ ] **Step 3: (only if vacuous) extend the fixture**
+
+If the spy reports 0 calls, the synthetic variant set has no variant over a −strand
+region. In `tests/parity/_fixtures.py::build_strand_mixed_dataset`, add a −strand BED
+row positioned over a known variant from `synthetic_case` (e.g. the GAGA→G chr1
+deletion region is at +; mirror its coordinates as a −strand region) so a −strand
+group is non-empty. Re-run Step 2. (No production code changes.)
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev pytest tests/parity/test_variants_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (existing tests + the two new neg-strand cases).
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add tests/parity/test_variants_dataset_parity.py tests/parity/_fixtures.py
+rtk git commit -m "test(parity): e2e neg-strand variants RC + custom-dummy, rc_alleles live spy
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 7: Full-tree verification + roadmap update
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (Target 6 section: tick the deferred variant-RC follow-up; record the new gvl `rc_alleles` kernel + retained seqpro reference)
+
+**Interfaces:**
+- Consumes: all prior tasks.
+- Produces: green full tree on both backends; roadmap reflecting reality.
+
+- [ ] **Step 1: Lint, format, typecheck**
+
+Run:
+```bash
+pixi run -e dev ruff format python/ tests/
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev typecheck
+```
+Expected: all clean (format may rewrite the new test files — re-stage if so).
+
+- [ ] **Step 2: cargo tests**
+
+Run: `pixi run -e dev cargo test`
+Expected: all pass (incl. the 3 new `rc_alleles_inplace` tests).
+
+- [ ] **Step 3: Full pytest tree on BOTH backends**
+
+Run:
+```bash
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: both green (same passed/xfailed counts as the Target-7 baseline `967 passed / 21 skipped / 4 xfailed`, modulo the new tests added here). Investigate any new failure before proceeding — do NOT claim success without reading the output.
+
+- [ ] **Step 4: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`, under Target 6 (~lines 468-489), add a follow-up note (and tick the deferred variant-RC item):
+
+```markdown
+   **✅ Variant-allele RC folded (follow-up, 2026-06-25).** The two deferred kinds
+   (`RaggedVariants` + `_FlatVariants`) no longer route variant-allele RC through the
+   seqpro post-pass with per-batch ragged object churn; a gvl rust kernel
+   (`variants::rc_alleles_inplace`, FFI `rc_alleles`, dispatch `rc_alleles` default
+   rust) RCs the raw `_FlatAlleles` buffers in place, applied AFTER dummy-fill so
+   ordering stays byte-identical (custom non-palindromic dummy alleles covered). The
+   seqpro implementation is retained as the registered reference backend (parity + perf
+   gating; deletion is Phase 5). `_FlatVariantWindows` remains never-RC'd. Plan:
+   `docs/superpowers/plans/2026-06-25-rust-variant-rc-fold.md`.
+```
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): variant-allele RC folded onto gvl rust kernel (Target 6 follow-up)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Notes for the implementer
+
+- **Extension import path:** the compiled rust module is `genvarloader.genvarloader`,
+  imported in `_flat_variants.py` (line ~20) as `from ..genvarloader import <name>`. Reuse
+  that verbatim for `rc_alleles`; tests import `genvarloader.genvarloader` directly.
+- **In-place is load-bearing:** `rc_alleles` mutates `byte_data`. Never wrap the caller's
+  `byte_data` in `np.ascontiguousarray` on a path that could copy (non-contiguous/non-uint8)
+  — assert contiguity instead (Task 3). The `_FlatAlleles.byte_data` buffer is contiguous
+  `uint8` by construction.
+- **The reference IS the oracle:** there is no numba `rc_helper`; the seqpro path is the
+  byte-identical reference. Parity tests compare rust vs that reference, not vs a numba
+  kernel.
+- **Don't touch `flank_tokens` or windows:** RC applies only to `alt`/`ref` allele bytes,
+  matching the current post-pass exactly.
+```
diff --git a/docs/superpowers/plans/2026-06-25-target-5-tracks-intervals-slice.md b/docs/superpowers/plans/2026-06-25-target-5-tracks-intervals-slice.md
new file mode 100644
index 00000000..47c758ce
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-target-5-tracks-intervals-slice.md
@@ -0,0 +1,342 @@
+# Target 5 — tracks-only intervals slice optimization — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Drop per-interval `SliceInfo` construction from `intervals_to_tracks` so the tracks-only read path runs ≥ 1.0× numba, byte-identically.
+
+**Architecture:** Address the contiguous `out` buffer as a raw `&mut [f32]` via one hoisted `as_slice_mut()`, replacing `out.slice_mut(s![a..b]).fill(value)` with `out_slice[a..b].fill(value)`. Pure-Rust refactor under the existing cargo tests; same arithmetic, same write order, same values. Unsafe `get_unchecked_mut` is a measured contingency only if the safe form misses the perf gate.
+
+**Tech Stack:** Rust (`ndarray`, PyO3/maturin), Python (pytest, pytest-benchmark, numba oracle), pixi (`-e dev`).
+
+**Spec:** `docs/superpowers/specs/2026-06-25-target-5-tracks-intervals-slice-design.md`
+
+## Global Constraints
+
+- Branch: `opt/target-5-intervals-slice` off `rust-migration` (already created and checked out).
+- **Byte-identical** to the numba oracle — non-negotiable landing gate.
+- **Only** `src/intervals.rs` changes (the kernel body; one added test only if the unsafe fallback lands). No Python, no FFI-signature, no oracle changes.
+- **Keep the `out.fill(0.0)` zero prelude** — tracks-only relies on inter-interval gaps reading 0.
+- The 8 existing cargo tests in `src/intervals.rs` must stay green **untouched**.
+- Measure with `NUMBA_NUM_THREADS=1`; compare the **min** of `pedantic(iterations=10, rounds=50)`.
+- Release build before any perf measurement: `pixi run -e dev maturin develop --release`.
+- HPC: dataset tests need `--basetemp=$(pwd)/.pytest_tmp` (cross-device `os.link` fails with Errno 18 otherwise).
+- Per CLAUDE.md, prefix shell commands with `rtk`.
+
+---
+
+### Task 1: Establish green baseline + record starting ratio
+
+**Files:**
+- Read only: `src/intervals.rs`
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: a recorded baseline tracks-only `min rust ÷ min numba` ratio (expected ≈ 0.63×) used to confirm improvement in Task 4.
+
+- [ ] **Step 1: Confirm clean tree on the right branch**
+
+Run: `rtk git status && rtk git branch --show-current`
+Expected: branch `opt/target-5-intervals-slice`, only the untracked handoff + the committed spec/plan present.
+
+- [ ] **Step 2: Release build**
+
+Run: `pixi run -e dev maturin develop --release`
+Expected: builds `genvarloader.abi3.so` with no errors.
+
+- [ ] **Step 3: Run the cargo unit tests (baseline green)**
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS, including the 8 `intervals_to_tracks` tests (`test_basic_paint`, `test_empty_intervals`, `test_end_clamp`, `test_break_on_start_ge_length`, `test_interval_starts_before_query_full_cover`, `test_interval_starts_before_query_partial`, `test_interval_fully_left_of_query`, `test_multi_query_disjoint`).
+
+- [ ] **Step 4: Capture the baseline tracks-only ratio**
+
+Run: `NUMBA_NUM_THREADS=1 pixi run -e dev pytest tests/benchmarks/test_e2e.py -k tracks --basetemp=$(pwd)/.pytest_tmp -q`
+Expected: completes; note the tracks-only min rust and min numba times. Record the ratio (≈ 0.63×) in scratch — this is the before-number for the roadmap.
+
+No commit (measurement only).
+
+---
+
+### Task 2: Refactor `intervals_to_tracks` to a raw contiguous slice
+
+**Files:**
+- Modify: `src/intervals.rs:23-69` (the function body)
+
+**Interfaces:**
+- Consumes: the existing `intervals_to_tracks` signature — unchanged.
+- Produces: identical output buffer; no signature change. Later tasks rely on the public signature staying exactly as-is.
+
+- [ ] **Step 1: Confirm the tests already pin the contract (no new test needed)**
+
+The 8 cargo tests in `src/intervals.rs:72-219` exhaust the behavior (paint, empty, end-clamp, break, the three #242 jitter cases, multi-query). This is a byte-identical refactor, so they ARE the failing/passing gate — do not add or edit them.
+
+- [ ] **Step 2: Apply the refactor**
+
+Replace the body from the zero-prelude through the inner write. Change `out.fill(0.0)` and the per-interval `out.slice_mut(...)` to operate on a hoisted raw slice:
+
+```rust
+    // Step 1: zero the whole output buffer, exactly like `out[:] = 0.0`.
+    // The out buffer is freshly allocated and contiguous; address it as a raw
+    // &mut [f32] so per-interval writes avoid ndarray SliceInfo construction.
+    let out_slice = out.as_slice_mut().unwrap();
+    out_slice.fill(0.0);
+
+    let n_queries = starts.len();
+
+    for query in 0..n_queries {
+        let idx = offset_idxs[query] as usize;
+        let itv_s = itv_offsets[idx] as usize;
+        let itv_e = itv_offsets[idx + 1] as usize;
+
+        if itv_s == itv_e {
+            // No intervals for this query — out slice stays 0.
+            continue;
+        }
+
+        let out_s = out_offsets[query] as usize;
+        let out_e = out_offsets[query + 1] as usize;
+        // length as i64 to do signed arithmetic below.
+        let length = (out_e - out_s) as i64;
+        let query_start = starts[query] as i64;
+
+        for interval in itv_s..itv_e {
+            // start/end computed in i64 (avoids i32 overflow for large coords).
+            let start = itv_starts[interval] as i64 - query_start;
+            let end = itv_ends[interval] as i64 - query_start;
+            let value = itv_values[interval];
+
+            if start >= length {
+                // start >= length: intervals are sorted, all remaining are
+                // also out of range — break.
+                break;
+            }
+            // Clip to the query window. Intervals may start before query_start
+            // (jitter-expanded interval storage vs. the per-read query origin;
+            // see issue #242) or end past it. No negative-index wrap.
+            let s = start.max(0);
+            let e = end.min(length);
+            if e > s {
+                let a = out_s + s as usize;
+                let b = out_s + e as usize;
+                out_slice[a..b].fill(value);
+            }
+        }
+    }
+```
+
+Note: `out` is now bound only to produce `out_slice`; the `mut out: ArrayViewMut1<f32>` parameter stays as-is. The doc comment at `src/intervals.rs:3-15` remains accurate (semantics unchanged) — leave it.
+
+- [ ] **Step 3: Run the cargo tests (must stay green, untouched)**
+
+Run: `pixi run -e dev cargo-test`
+Expected: PASS — all 8 `intervals_to_tracks` tests green, identical to Task 1 Step 3.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add src/intervals.rs
+rtk git commit -m "perf(intervals): paint tracks via raw contiguous slice
+
+Hoist out.as_slice_mut() once and write out_slice[a..b].fill(value)
+per interval, dropping per-interval ndarray SliceInfo construction
+(~20.5% self-time on the tracks-only read path). Byte-identical:
+same arithmetic, same write order, zero prelude retained.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: Parity gate on both backends
+
+**Files:**
+- Read only: `tests/parity/`
+
+**Interfaces:**
+- Consumes: the refactored kernel from Task 2.
+- Produces: proof of byte-identical output vs the numba oracle on the live `__getitem__` path.
+
+- [ ] **Step 1: Rebuild release (Task 2 changed Rust)**
+
+Run: `pixi run -e dev maturin develop --release`
+Expected: builds cleanly.
+
+- [ ] **Step 2: Parity — rust default backend**
+
+Run: `pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS, including the `intervals_to_tracks` hypothesis parity gate and the tracks dataset backstop (`tests/parity/test_dataset_parity.py`) that spies on the kernel to prove it runs.
+
+- [ ] **Step 3: Parity — numba oracle backend**
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (byte-identical to Step 2).
+
+No commit (verification only). If either fails, the refactor diverged — return to Task 2; do not proceed.
+
+---
+
+### Task 4: Perf gate — re-measure, escalate to unsafe only if short
+
+**Files:**
+- Modify (conditional): `src/intervals.rs` inner write + one added test, **only if** the safe form misses ≥ 1.0×.
+
+**Interfaces:**
+- Consumes: the refactored kernel.
+- Produces: the recorded post-change tracks-only ratio for the roadmap.
+
+- [ ] **Step 1: Re-measure tracks-only**
+
+Run: `NUMBA_NUM_THREADS=1 pixi run -e dev pytest tests/benchmarks/test_e2e.py -k tracks --basetemp=$(pwd)/.pytest_tmp -q`
+Expected: completes. Compute `min rust ÷ min numba`.
+
+- [ ] **Step 2: Branch on the result**
+
+- **If ≥ 1.0×** → gate cleared. Skip Steps 3–5; record the ratio for Task 5.
+- **If < 1.0×** → proceed to Step 3 (unsafe fallback).
+
+- [ ] **Step 3 (conditional): Escalate the inner write to `get_unchecked_mut`**
+
+In `src/intervals.rs`, replace the safe inner write with:
+
+```rust
+            if e > s {
+                let a = out_s + s as usize;
+                let b = out_s + e as usize;
+                // SAFETY: 0 <= s <= e <= length, and out_s + length == out_e,
+                // where out_offsets is a valid CSR layout over out_slice
+                // (out_e <= out_slice.len()). Hence out_s <= a <= b <= out_e
+                // <= out_slice.len(), so a..b is in bounds.
+                unsafe { out_slice.get_unchecked_mut(a..b).fill(value); }
+            }
+```
+
+- [ ] **Step 4 (conditional): Add a test pinning the SAFETY invariant**
+
+Append to the `tests` module in `src/intervals.rs`:
+
+```rust
+    /// SAFETY invariant: a painted interval never writes past its query's
+    /// out slice end (b <= out_e), even when the interval end far exceeds it.
+    #[test]
+    fn test_paint_never_exceeds_query_slice() {
+        // Two adjacent queries; query 0's interval ends at 1000 but its slice
+        // is out[0..5]; query 1's slice (out[5..10]) must remain untouched
+        // except by its own interval.
+        let result = run(
+            &[0, 1],
+            &[0, 0],
+            &[2, 0],
+            &[1000, 1],
+            &[7.0, 9.0],
+            &[0, 1, 2],
+            10,
+            &[0, 5, 10],
+        );
+        // query 0: out[2..5]=7.0 (clamped at 5, no spill into query 1)
+        // query 1: out[5..6]=9.0
+        assert_eq!(
+            result,
+            vec![0.0, 0.0, 7.0, 7.0, 7.0, 9.0, 0.0, 0.0, 0.0, 0.0]
+        );
+    }
+```
+
+- [ ] **Step 5 (conditional): Rebuild, retest, re-measure**
+
+Run: `pixi run -e dev maturin develop --release && pixi run -e dev cargo-test`
+Expected: PASS (9 tests now).
+Then re-run Step 1's benchmark; confirm ≥ 1.0×.
+
+- [ ] **Step 6 (conditional): Commit the fallback**
+
+```bash
+rtk git add src/intervals.rs
+rtk git commit -m "perf(intervals): elide bounds-check on per-interval paint
+
+Safe slice indexing fell short of numba on tracks-only; use
+get_unchecked_mut with a proven SAFETY invariant (a..b within the
+query's CSR out slice) plus a test pinning no cross-query spill.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 5: Full-tree gate, lint, roadmap update, PR
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (round-2 block: tick Target 5, record ratio, set PR link)
+
+**Interfaces:**
+- Consumes: the green kernel + recorded ratio.
+- Produces: the landed, documented workstream + PR.
+
+- [ ] **Step 1: Full tree — rust default**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (covers `tests/unit/` which scoped runs skip).
+
+- [ ] **Step 2: Full tree — numba oracle**
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 3: Lint / format / typecheck**
+
+Run: `pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format --check python/ tests/ && pixi run -e dev typecheck`
+Expected: clean (no Python changed, but the project gates on it).
+
+- [ ] **Step 4: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`, in the round-2 optimization block: tick Target 5, set its phase marker, and record the re-measured tracks-only ratio (before ≈ 0.63× → after, from Task 4 Step 1) plus whether the safe or unsafe form landed. Add the PR link once opened (Step 6).
+
+- [ ] **Step 5: Commit the roadmap**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): tick Target 5, record tracks-only ratio
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+- [ ] **Step 6: Push and open the parity-gated PR**
+
+```bash
+rtk git push -u origin opt/target-5-intervals-slice
+rtk gh pr create --base rust-migration --title "perf(intervals): tracks-only raw-slice paint (Target 5)" --body "$(cat <<'EOF'
+Closes Target 5 of the Phase 5 read-path optimization (handoff
+docs/handoffs/2026-06-25-phase5-getitem-optimization.md).
+
+Byte-identical refactor of intervals_to_tracks to drop per-interval
+ndarray SliceInfo construction. tracks-only min rust ÷ min numba:
+<BEFORE 0.63x> → <AFTER>.
+
+Parity: green on both backends (rust default + GVL_BACKEND=numba),
+incl. the intervals_to_tracks hypothesis gate and tracks dataset
+backstop. Full tree green both backends.
+
+🤖 Generated with [Claude Code](https://claude.com/claude-code)
+EOF
+)"
+```
+
+Then edit the roadmap PR-link placeholder (Step 4) to the real URL and amend Step 5's commit, or push a follow-up.
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Problem / SliceInfo cost → Task 2 (the refactor). ✓
+- Keep zero prelude → Task 2 Step 2 comment + Global Constraints. ✓
+- Byte-identical parity, both backends, hypothesis gate + dataset backstop → Task 3. ✓
+- Existing 8 cargo tests stay green untouched → Task 1 Step 3, Task 2 Step 3. ✓
+- Perf gate ≥ 1.0×, min-of-pedantic, NUMBA_NUM_THREADS=1 → Task 1 Step 4, Task 4. ✓
+- Unsafe fallback with SAFETY proof + added test → Task 4 Steps 3–6. ✓
+- Full tree both backends + lint/format/typecheck → Task 5 Steps 1–3. ✓
+- Roadmap update (tick, ratio, PR link) → Task 5 Steps 4–5. ✓
+- Branch off rust-migration, parity-gated PR → Global Constraints, Task 5 Step 6. ✓
+
+**Placeholder scan:** `<BEFORE 0.63x>` / `<AFTER>` in the PR body and roadmap are intentional runtime-measured values, filled from Task 4's measurement — not unspecified work. No "TBD"/"add error handling"/"write tests for the above" left.
+
+**Type consistency:** `intervals_to_tracks` signature untouched throughout; the test helper `run(...)` argument order in Task 4's added test matches the existing helper at `src/intervals.rs:77-100` (offset_idxs, starts, itv_starts, itv_ends, itv_values, itv_offsets, out_len, out_offsets). `out_slice` / `a` / `b` names consistent across Task 2 and Task 4.
diff --git a/docs/superpowers/plans/2026-06-25-target6-kernel-rc.md b/docs/superpowers/plans/2026-06-25-target6-kernel-rc.md
new file mode 100644
index 00000000..e50be270
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-target6-kernel-rc.md
@@ -0,0 +1,749 @@
+# Target 6 — Kernel Reverse-Complement Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Emit negative-strand read-path output already reverse-complemented from the Rust fused kernels, removing the cold batch-wide seqpro RC post-pass for the rust backend while keeping the numba path (the parity oracle) byte-identical.
+
+**Architecture:** Add two generic in-place primitives in a new `src/reverse.rs` that reverse (optionally complement) each masked row of a flat `(data, offsets)` buffer. Thread an optional per-row `to_rc` mask into each fused kernel; when present, the kernel RC's each negative-strand query/element's slice **in place, immediately after it is written, inside the existing per-query loop** (hot in cache). Python computes the mask (reusing the existing strand and splice-permutation logic) and, on the rust backend only, stops applying the Python RC post-pass to the five flat output kinds. The numba composed path keeps the existing `reverse_complement_ragged` post-pass unchanged. `RaggedVariants` RC is deferred to Target 7 and continues to use the Python post-pass on both backends.
+
+**Tech Stack:** Rust (PyO3, ndarray) for kernels; Python (numpy) for orchestration; pixi for env/build (`maturin develop`); pytest + cargo for tests.
+
+## Global Constraints
+
+- Spec: `docs/superpowers/specs/2026-06-25-target6-kernel-rc-design.md` (read before starting).
+- Roadmap: `docs/roadmaps/rust-migration.md` — Phase 5, round-2 optimization block. Tick Target 6, record re-measured ratios, set PR link, set the "Target 6 must merge before rayon" marker as part of this work.
+- **Parity is the landing gate: output must be byte-identical between backends.** Run both:
+  `pixi run -e dev pytest tests/parity -q` (rust default) and `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q` (oracle).
+- `_COMP` LUT contract (reproduce exactly from `python/genvarloader/_ragged.py:330`, `bytes.maketrans(b"ACGT", b"TGCA")`): a `[u8; 256]` that is **identity for everything** except `A(0x41)↔T(0x54)` and `C(0x43)↔G(0x47)` (uppercase only). `N`, IUPAC codes, and lowercase `a/c/g/t` are pass-through.
+- Scope: five flat-buffer kinds (haplotypes, reference, tracks, annotated, splice). **Out of scope:** `RaggedVariants` (deferred to Target 7), `variant-windows`/`intervals` (no-op).
+- Do **not** delete `reverse_complement_ragged` or its `_query.py`/`_reference.py` call — it remains the numba oracle. It becomes backend-and-kind-conditional only.
+- Do not reintroduce per-batch `np.ascontiguousarray` on sample-scale memmaps (keeps `tests/integration/test_scale_guard.py` green).
+- Build before any test run in this worktree: `pixi run -e dev maturin develop --release` (the shared `.pixi` env's installed extension points at the original checkout until rebuilt here).
+- HPC: run pytest with `--basetemp=$(pwd)/.pytest_tmp` so the write path's `os.link` hardlink does not fail cross-device (Errno 18).
+- Commit message style: conventional commits; end with the `Co-Authored-By` trailer.
+- TDD order across kernels: reference → haplotypes → tracks → annotated → splice.
+
+---
+
+## File Structure
+
+**Rust (create):**
+- `src/reverse.rs` — the two in-place primitives + the `_COMP` LUT + cargo unit tests. One responsibility: reverse/reverse-complement masked rows of a flat buffer. Registered as a module in `src/lib.rs`.
+
+**Rust (modify):**
+- `src/ffi/mod.rs` — add an optional `to_rc` param to 5 fused kernels and call the primitive after the write.
+- `src/reference/mod.rs` — `get_reference` core: accept `to_rc` and apply primitive (covers reference, spliced reference).
+- Reconstruct/track cores under `src/{reconstruct,tracks}/` are **not** modified — RC is applied at the FFI layer over the assembled flat buffer, after the core returns, so cores stay untouched.
+
+**Python (modify):**
+- `python/genvarloader/_dataset/_query.py` — compute `to_rc`, thread it into `view.recon(...)`, make the post-pass backend-and-kind-conditional.
+- `python/genvarloader/_dataset/_reference.py`, `_ref.py` — thread `to_rc` into `get_reference`/`_fetch_spliced_ref`; make the standalone RefDataset RC backend-conditional.
+- `python/genvarloader/_dataset/_haps.py` — pass `to_rc` into the three haplotype fused kernels.
+- `python/genvarloader/_dataset/_reconstruct.py` — pass `to_rc` into the track fused kernel; thread `to_rc` through `SeqsTracks`/`HapsTracks`/`Tracks.__call__`.
+- `python/genvarloader/_dataset/_protocol.py` — add `to_rc` to the `Reconstructor.__call__` protocol signature.
+- `python/genvarloader/_dataset/_ref.py` — `Ref.__call__` / wherever `get_reference` is called for an in-Dataset reference reconstructor.
+
+**Tests (create/modify):**
+- `src/reverse.rs` `#[cfg(test)]` — primitive unit tests.
+- Per-kernel cargo tests in `src/ffi/` or alongside cores — synthetic reconstruct-then-RC checks (where the core is callable in pure Rust).
+- `tests/parity/test_dataset_parity.py` — new strand=−1 fixtures + non-vacuity assertions for every in-scope kind.
+
+---
+
+## Task 1: `src/reverse.rs` in-place primitives + `_COMP` LUT
+
+**Files:**
+- Create: `src/reverse.rs`
+- Modify: `src/lib.rs` (add `mod reverse;`)
+- Test: `src/reverse.rs` `#[cfg(test)]`
+
+**Interfaces:**
+- Produces:
+  - `pub const COMP: [u8; 256]` — ACGT↔TGCA, identity elsewhere.
+  - `pub fn reverse_flat_rows_inplace<T: Copy>(data: &mut [T], offsets: ndarray::ArrayView1<i64>, to_rc: ndarray::ArrayView1<bool>)` — reverses element order within each masked row.
+  - `pub fn rc_flat_rows_inplace(data: &mut [u8], offsets: ndarray::ArrayView1<i64>, to_rc: ndarray::ArrayView1<bool>)` — reverses **and** complements bytes via `COMP`.
+- Contract: `offsets.len() == to_rc.len() + 1`. Row `i` spans `data[offsets[i]..offsets[i+1]]`. When `to_rc[i]` is false the row is untouched. Empty rows (`offsets[i] == offsets[i+1]`) are no-ops.
+
+- [ ] **Step 1: Write the failing tests**
+
+```rust
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::array;
+
+    #[test]
+    fn comp_lut_matches_maketrans() {
+        // identity except ACGT<->TGCA uppercase
+        assert_eq!(COMP[b'A' as usize], b'T');
+        assert_eq!(COMP[b'T' as usize], b'A');
+        assert_eq!(COMP[b'C' as usize], b'G');
+        assert_eq!(COMP[b'G' as usize], b'C');
+        assert_eq!(COMP[b'N' as usize], b'N');
+        assert_eq!(COMP[b'a' as usize], b'a'); // lowercase pass-through
+        assert_eq!(COMP[b'c' as usize], b'c');
+        assert_eq!(COMP[b'R' as usize], b'R'); // IUPAC pass-through
+        assert_eq!(COMP[0u8 as usize], 0u8);
+    }
+
+    #[test]
+    fn rc_reverses_and_complements_masked_rows_only() {
+        // two rows: "ACGT" (rc -> "ACGT") and "AACG" (not rc)
+        let mut data = b"ACGTAACG".to_vec();
+        let offsets = array![0i64, 4, 8];
+        let to_rc = array![true, false];
+        rc_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(&data[0..4], b"ACGT"); // revcomp of ACGT is ACGT
+        assert_eq!(&data[4..8], b"AACG"); // untouched
+    }
+
+    #[test]
+    fn rc_handles_odd_length_and_n() {
+        let mut data = b"ACN".to_vec(); // revcomp -> "NGT"
+        let offsets = array![0i64, 3];
+        let to_rc = array![true];
+        rc_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(&data, b"NGT");
+    }
+
+    #[test]
+    fn reverse_only_no_complement_f32() {
+        let mut data = vec![1.0f32, 2.0, 3.0, 9.0];
+        let offsets = array![0i64, 3, 4];
+        let to_rc = array![true, false];
+        reverse_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(data, vec![3.0, 2.0, 1.0, 9.0]);
+    }
+
+    #[test]
+    fn reverse_only_i32_for_annot_arrays() {
+        let mut data = vec![10i32, 11, 12];
+        let offsets = array![0i64, 3];
+        let to_rc = array![true];
+        reverse_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(data, vec![12, 11, 10]);
+    }
+
+    #[test]
+    fn empty_row_and_all_false_are_noops() {
+        let mut data = b"AC".to_vec();
+        let offsets = array![0i64, 0, 2]; // first row empty
+        rc_flat_rows_inplace(&mut data, offsets.view(), array![true, false].view());
+        assert_eq!(&data, b"AC");
+    }
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pixi run -e dev cargo test --lib reverse`
+Expected: FAIL — `reverse.rs` / functions not defined (compile error).
+
+- [ ] **Step 3: Write minimal implementation**
+
+```rust
+//! In-place reverse / reverse-complement of masked rows in a flat (data, offsets)
+//! buffer. Used by the read-path kernels to emit negative-strand output already
+//! reverse-complemented, replacing the Python RC post-pass on the rust backend.
+
+use ndarray::ArrayView1;
+
+/// ACGT<->TGCA complement, identity for every other byte. Mirrors
+/// `bytes.maketrans(b"ACGT", b"TGCA")` (python/genvarloader/_ragged.py).
+pub const COMP: [u8; 256] = {
+    let mut t = [0u8; 256];
+    let mut i = 0usize;
+    while i < 256 {
+        t[i] = i as u8;
+        i += 1;
+    }
+    t[b'A' as usize] = b'T';
+    t[b'T' as usize] = b'A';
+    t[b'C' as usize] = b'G';
+    t[b'G' as usize] = b'C';
+    t
+};
+
+/// Reverse element order within each masked row (no complement). Generic over
+/// element width so it serves f32 tracks and i32/i64 annotation arrays.
+pub fn reverse_flat_rows_inplace<T: Copy>(
+    data: &mut [T],
+    offsets: ArrayView1<i64>,
+    to_rc: ArrayView1<bool>,
+) {
+    for i in 0..to_rc.len() {
+        if !to_rc[i] {
+            continue;
+        }
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        data[s..e].reverse();
+    }
+}
+
+/// Reverse AND complement bytes within each masked row via `COMP`.
+pub fn rc_flat_rows_inplace(
+    data: &mut [u8],
+    offsets: ArrayView1<i64>,
+    to_rc: ArrayView1<bool>,
+) {
+    for i in 0..to_rc.len() {
+        if !to_rc[i] {
+            continue;
+        }
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        let row = &mut data[s..e];
+        row.reverse();
+        for b in row.iter_mut() {
+            *b = COMP[*b as usize];
+        }
+    }
+}
+```
+
+Add `mod reverse;` to `src/lib.rs` near the other `mod` declarations.
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `pixi run -e dev cargo test --lib reverse`
+Expected: PASS (6 tests).
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/reverse.rs src/lib.rs
+git commit -m "feat(rust): in-place reverse/reverse-complement primitives for read path
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 2: thread `to_rc` into the reference kernel (`get_reference`)
+
+**Files:**
+- Modify: `src/reference/mod.rs` (core `get_reference`), `src/ffi/mod.rs:728` (pyfunction)
+- Test: `src/reference/mod.rs` `#[cfg(test)]`
+
+**Interfaces:**
+- Consumes: `reverse::rc_flat_rows_inplace`, `COMP` from Task 1.
+- Produces: `get_reference` (core + pyfunction) gains a trailing optional `to_rc: Option<ArrayView1<bool>>` (core) / `to_rc: Option<PyReadonlyArray1<bool>>` (pyfunction). When `Some`, after building the output buffer the core calls `rc_flat_rows_inplace(out, out_offsets, to_rc)`. `None` ⇒ unchanged behavior.
+
+- [ ] **Step 1: Write the failing test (core)**
+
+```rust
+// in src/reference/mod.rs #[cfg(test)]
+#[test]
+fn get_reference_applies_rc_when_masked() {
+    // contig "ACGTAA" at offset 0; one region [0,4) -> "ACGT"
+    let reference = ndarray::array![b'A', b'C', b'G', b'T', b'A', b'A'];
+    let ref_offsets = ndarray::array![0i64, 6];
+    let regions = ndarray::array![[0i32, 0, 4]];
+    let out_offsets = ndarray::array![0i64, 4];
+    let to_rc = ndarray::array![true];
+    let out = get_reference(
+        regions.view(), out_offsets.view(), reference.view(),
+        ref_offsets.view(), b'N', false, Some(to_rc.view()),
+    );
+    // forward "ACGT" -> revcomp "ACGT"; use a non-palindrome to be sure:
+    // region [0,3) "ACG" -> revcomp "CGT"
+    assert_eq!(out.to_vec(), b"ACGT".to_vec());
+}
+```
+
+(Adjust the assertion region to a non-palindrome, e.g. `[0,3)` → expect `b"CGT"`, so the test is non-vacuous.)
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev cargo test --lib reference`
+Expected: FAIL — `get_reference` arity mismatch (no `to_rc` param).
+
+- [ ] **Step 3: Implement**
+
+In `src/reference/mod.rs`, add the trailing param and apply after the buffer is built:
+
+```rust
+pub fn get_reference(
+    regions: ArrayView2<i32>,
+    out_offsets: ArrayView1<i64>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+    parallel: bool,
+    to_rc: Option<ArrayView1<bool>>,
+) -> Array1<u8> {
+    let mut out = /* ...existing buffer build... */;
+    if let Some(to_rc) = to_rc {
+        crate::reverse::rc_flat_rows_inplace(
+            out.as_slice_mut().unwrap(),
+            out_offsets,
+            to_rc,
+        );
+    }
+    out
+}
+```
+
+In `src/ffi/mod.rs:728`, add `to_rc: Option<PyReadonlyArray1<bool>>` as the trailing param and forward `to_rc.as_ref().map(|a| a.as_array())`. Update the Python caller `python/genvarloader/_dataset/_reference.py:686-695` (`_get_reference_rust`) to accept and pass `to_rc=None` for now (no behavior change — real mask wired in Task 7).
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev cargo test --lib reference`
+Expected: PASS.
+
+- [ ] **Step 5: Build + smoke the Python boundary**
+
+Run: `pixi run -e dev maturin develop --release && pixi run -e dev python -c "import genvarloader"`
+Expected: import OK (signature change accepted).
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add src/reference/mod.rs src/ffi/mod.rs python/genvarloader/_dataset/_reference.py
+git commit -m "feat(rust): optional in-kernel RC for get_reference
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 3: thread `to_rc` into `reconstruct_haplotypes_fused`
+
+**Files:**
+- Modify: `src/ffi/mod.rs:393-500`
+- Test: `src/ffi/mod.rs` or a reconstruct core test module
+
+**Interfaces:**
+- Consumes: `reverse::rc_flat_rows_inplace`.
+- Produces: `reconstruct_haplotypes_fused` gains trailing `to_rc: Option<PyReadonlyArray1<bool>>` (one bool per `(query, hap)` work item, length `n_work`). Applied to `out_data` against `out_offsets_vec` after Step 4 (the reconstruct write), before `into_pyarray`.
+
+- [ ] **Step 1: Write the failing test**
+
+Add a Rust test that drives the **reconstruct core** directly (it is pure Rust): reconstruct a tiny haplotype with no variants so output equals the reference window, then apply `rc_flat_rows_inplace` and assert the bytes equal the hand-computed revcomp. (Tests the exact call the kernel will make.)
+
+```rust
+#[test]
+fn haplotype_buffer_rc_is_revcomp_of_forward() {
+    let mut out = b"ACGTA".to_vec(); // pretend reconstructed forward bytes
+    let offsets = ndarray::array![0i64, 5];
+    let to_rc = ndarray::array![true];
+    crate::reverse::rc_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+    assert_eq!(&out, b"TACGT"); // revcomp(ACGTA)
+}
+```
+
+- [ ] **Step 2: Run to verify it fails / compiles red**
+
+Run: `pixi run -e dev cargo test --lib`
+Expected: FAIL until the kernel param is added (and this guard test passes once `reverse` is wired — it already exists from Task 1, so this step mainly guards the kernel arity change; verify the kernel signature change makes Python smoke fail first).
+
+- [ ] **Step 3: Implement**
+
+In `reconstruct_haplotypes_fused`, add trailing `to_rc: Option<PyReadonlyArray1<bool>>`. After Step 4 (`reconstruct::reconstruct_haplotypes_from_sparse(...)`), before `into_pyarray`:
+
+```rust
+if let Some(to_rc) = to_rc.as_ref() {
+    crate::reverse::rc_flat_rows_inplace(
+        out_data.as_slice_mut().unwrap(),
+        out_offsets_vec.view(),
+        to_rc.as_array(),
+    );
+}
+```
+
+Update the Python caller `_haps.py:828` to pass `to_rc=None` for now.
+
+- [ ] **Step 4: Run tests + build**
+
+Run: `pixi run -e dev cargo test --lib && pixi run -e dev maturin develop --release && pixi run -e dev python -c "import genvarloader"`
+Expected: PASS + import OK.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/ffi/mod.rs python/genvarloader/_dataset/_haps.py
+git commit -m "feat(rust): optional in-kernel RC for reconstruct_haplotypes_fused
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 4: thread `to_rc` into `intervals_and_realign_track_fused` (reverse-only f32)
+
+**Files:**
+- Modify: `src/ffi/mod.rs:848` (and the f32 out buffer handling)
+- Test: `src/ffi/mod.rs` `#[cfg(test)]`
+
+**Interfaces:**
+- Consumes: `reverse::reverse_flat_rows_inplace::<f32>`.
+- Produces: `intervals_and_realign_track_fused` gains trailing `to_rc: Option<PyReadonlyArray1<bool>>` (one bool per `(query, hap)` row, length matching `out_offsets`). **Reverse only, no complement** (tracks are numeric). The `out` buffer is an in/out `PyReadwriteArray1<f32>`; apply over its slice against `out_offsets` after the realign write.
+
+- [ ] **Step 1: Write the failing test**
+
+```rust
+#[test]
+fn track_buffer_rc_is_reverse_only() {
+    let mut out = vec![1.0f32, 2.0, 3.0];
+    let offsets = ndarray::array![0i64, 3];
+    let to_rc = ndarray::array![true];
+    crate::reverse::reverse_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+    assert_eq!(out, vec![3.0, 2.0, 1.0]); // no value transform
+}
+```
+
+- [ ] **Step 2: Run to verify red on kernel arity**
+
+Run: `pixi run -e dev cargo test --lib` then `maturin develop` smoke.
+Expected: Python smoke fails on arity until param added.
+
+- [ ] **Step 3: Implement**
+
+Add trailing `to_rc: Option<PyReadonlyArray1<bool>>`. After the realign write into `out`:
+
+```rust
+if let Some(to_rc) = to_rc.as_ref() {
+    crate::reverse::reverse_flat_rows_inplace(
+        out.as_slice_mut().unwrap(),
+        out_offsets.as_array(),
+        to_rc.as_array(),
+    );
+}
+```
+
+Update the Python caller `_reconstruct.py:227` to pass `to_rc=None` for now.
+
+- [ ] **Step 4: Run tests + build**
+
+Run: `pixi run -e dev cargo test --lib && pixi run -e dev maturin develop --release && pixi run -e dev python -c "import genvarloader"`
+Expected: PASS + import OK.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/ffi/mod.rs python/genvarloader/_dataset/_reconstruct.py
+git commit -m "feat(rust): optional in-kernel reverse for track realign kernel
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 5: thread `to_rc` into `reconstruct_annotated_haplotypes_fused` (3 buffers in lockstep)
+
+**Files:**
+- Modify: `src/ffi/mod.rs:604-723`
+- Test: `src/ffi/mod.rs` `#[cfg(test)]`
+
+**Interfaces:**
+- Consumes: `reverse::rc_flat_rows_inplace` (bytes) + `reverse::reverse_flat_rows_inplace::<i32>` (annotation arrays).
+- Produces: trailing `to_rc: Option<PyReadonlyArray1<bool>>` (length `n_work`). Applies, per masked row over the shared `out_offsets_vec`: `rc_flat_rows_inplace(out_data)` (reverse+complement), `reverse_flat_rows_inplace(annot_v)` (reverse only), `reverse_flat_rows_inplace(annot_pos)` (reverse only) — all using the same offsets so the three stay aligned, matching `_FlatAnnotatedHaps.reverse_masked` (bytes complemented; `var_idxs`/`ref_coords` reversed without complement).
+
+- [ ] **Step 1: Write the failing test**
+
+```rust
+#[test]
+fn annotated_rc_complements_bytes_reverses_indices() {
+    let mut bytes = b"ACG".to_vec();          // revcomp -> "CGT"
+    let mut vidx = vec![5i32, 6, 7];          // reverse -> [7,6,5]
+    let mut rpos = vec![100i32, 101, 102];    // reverse -> [102,101,100]
+    let offsets = ndarray::array![0i64, 3];
+    let m = ndarray::array![true];
+    crate::reverse::rc_flat_rows_inplace(&mut bytes, offsets.view(), m.view());
+    crate::reverse::reverse_flat_rows_inplace(&mut vidx, offsets.view(), m.view());
+    crate::reverse::reverse_flat_rows_inplace(&mut rpos, offsets.view(), m.view());
+    assert_eq!(&bytes, b"CGT");
+    assert_eq!(vidx, vec![7, 6, 5]);
+    assert_eq!(rpos, vec![102, 101, 100]);
+}
+```
+
+- [ ] **Step 2: Run to verify red on kernel arity**
+
+Run: `pixi run -e dev cargo test --lib` + `maturin develop` smoke.
+Expected: arity failure until added.
+
+- [ ] **Step 3: Implement**
+
+Add trailing `to_rc`. After Step 4 (reconstruct with annotation buffers), before returning:
+
+```rust
+if let Some(to_rc) = to_rc.as_ref() {
+    let m = to_rc.as_array();
+    crate::reverse::rc_flat_rows_inplace(out_data.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+    crate::reverse::reverse_flat_rows_inplace(annot_v.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+    crate::reverse::reverse_flat_rows_inplace(annot_pos.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+}
+```
+
+Update the Python caller `_haps.py:984` to pass `to_rc=None` for now.
+
+- [ ] **Step 4: Run tests + build**
+
+Run: `pixi run -e dev cargo test --lib && pixi run -e dev maturin develop --release && pixi run -e dev python -c "import genvarloader"`
+Expected: PASS + import OK.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/ffi/mod.rs python/genvarloader/_dataset/_haps.py
+git commit -m "feat(rust): optional in-kernel RC for annotated haplotype kernel
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 6: thread `to_rc` into `reconstruct_haplotypes_spliced_fused` (permuted per-element)
+
+**Files:**
+- Modify: `src/ffi/mod.rs:521-577`
+- Test: `src/ffi/mod.rs` `#[cfg(test)]`
+
+**Interfaces:**
+- Consumes: `reverse::rc_flat_rows_inplace`.
+- Produces: trailing `to_rc: Option<PyReadonlyArray1<bool>>` — **already permuted per spliced element** (length = number of permuted elements = `out_offsets.len() - 1`). Applied over `out_offsets_a` (the permuted per-element offsets) so each masked element is RC'd in its own byte range, matching today's `to_rc_per_elem`. Assert in the caller (Task 7) that `to_rc.len() == out_offsets.len() - 1`.
+
+- [ ] **Step 1: Write the failing test**
+
+```rust
+#[test]
+fn spliced_rc_applies_per_element_over_permuted_offsets() {
+    // two permuted elements: "ACG" (rc) and "TTT" (not rc)
+    let mut out = b"ACGTTT".to_vec();
+    let offsets = ndarray::array![0i64, 3, 6];
+    let to_rc = ndarray::array![true, false];
+    crate::reverse::rc_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+    assert_eq!(&out[0..3], b"CGT"); // revcomp(ACG)
+    assert_eq!(&out[3..6], b"TTT"); // untouched
+}
+```
+
+- [ ] **Step 2: Run to verify red on kernel arity**
+
+Run: `pixi run -e dev cargo test --lib` + smoke.
+Expected: arity failure until added.
+
+- [ ] **Step 3: Implement**
+
+Add trailing `to_rc`. After `reconstruct_haplotypes_from_sparse(...)`, before `into_pyarray`:
+
+```rust
+if let Some(to_rc) = to_rc.as_ref() {
+    crate::reverse::rc_flat_rows_inplace(
+        out_data.as_slice_mut().unwrap(),
+        out_offsets_a,
+        to_rc.as_array(),
+    );
+}
+```
+
+Update the Python caller `_haps.py:894` to pass `to_rc=None` for now.
+
+- [ ] **Step 4: Run tests + build**
+
+Run: `pixi run -e dev cargo test --lib && pixi run -e dev maturin develop --release && pixi run -e dev python -c "import genvarloader"`
+Expected: PASS + import OK.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/ffi/mod.rs python/genvarloader/_dataset/_haps.py
+git commit -m "feat(rust): optional in-kernel RC for spliced haplotype kernel
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 7: strand=−1 parity fixtures + non-vacuity assertions (safety net BEFORE wiring)
+
+**Files:**
+- Modify: `tests/parity/test_dataset_parity.py`
+
+**Interfaces:**
+- Consumes: existing dataset parity harness + kernel-spy backstop.
+- Produces: parameterized fixtures with a **mix of `+` and `−`** strand regions covering haplotypes, reference, tracks, annotated, and the spliced variant of each; plus a non-vacuity assertion. These must **pass on the current (pre-wiring) code** (rust == numba, both via the post-pass), establishing the regression net that Task 8 must keep green.
+
+- [ ] **Step 1: Write the strand=−1 parity fixtures**
+
+Add a fixture that builds a dataset whose `input_regions` BED includes negative-strand rows (strand column `-1`) interleaved with positive ones, `max_jitter=0`. Parameterize over kinds `["haplotypes", "reference", "tracks", "tracks-seqs", "annotated"]` and spliced/unspliced. Assert byte-identical output between the two backends using the existing harness, and add:
+
+```python
+def test_negative_strand_actually_reverse_complements(neg_strand_dataset):
+    # Non-vacuity: a '-' region's bytes differ from the '+'-oriented bytes.
+    ds = neg_strand_dataset
+    out = ds[neg_region_idx, sample_idx]
+    fwd = forward_oriented_reference(ds, neg_region_idx, sample_idx)  # helper
+    assert out.tobytes() != fwd.tobytes()  # RC genuinely fired
+    assert out.tobytes() == revcomp(fwd).tobytes()  # and is the exact RC
+```
+
+(Use the spy backstop to assert the kernel ran on the live `__getitem__` path.)
+
+- [ ] **Step 2: Run on current code, both backends**
+
+Run:
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests/parity/test_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity/test_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS on both (net established; the wiring isn't done yet, so both paths still use the post-pass).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/parity/test_dataset_parity.py
+git commit -m "test(parity): strand=-1 fixtures + non-vacuity RC assertions
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 8: Python wiring — thread real `to_rc`, make post-pass backend-and-kind-conditional
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_query.py` (`_getitem_unspliced` ~`:188`, `_getitem_spliced` ~`:259`), `_protocol.py`, `_reconstruct.py` (`SeqsTracks`/`HapsTracks`/`Tracks.__call__` + track kernel call), `_haps.py` (three kernel calls), `_reference.py` (`_get_reference_rust`, `_fetch_spliced_ref`, standalone RefDataset RC `:438`), `_ref.py` (`Ref.__call__` get_reference call).
+- Test: `tests/parity/test_dataset_parity.py` (Task 7 fixtures stay green).
+
+**Interfaces:**
+- Consumes: every kernel's `to_rc` param (Tasks 2-6); Task 7 fixtures.
+- Produces:
+  - A helper `_active_backend() -> str` (returns `os.environ.get("GVL_BACKEND", "rust")`) so `_query.py`'s guard matches what the recon methods used. Place it next to the recon dispatch (e.g. `_reconstruct.py` or `_query.py`).
+  - `to_rc` flows: `_query.py` computes the mask → `view.recon(..., to_rc=...)` → reconstructors forward it to the rust fused kernels (numba branch ignores it).
+  - Post-pass becomes: numba ⇒ RC all kinds (unchanged); rust ⇒ RC only `RaggedVariants`.
+
+- [ ] **Step 1: Add `to_rc` to the Reconstructor protocol + all `__call__`s**
+
+In `_protocol.py`, add `to_rc: NDArray[np.bool_] | None = None` to `Reconstructor.__call__`. Mirror the param (trailing, default `None`) in `SeqsTracks.__call__`, `HapsTracks.__call__`, `Tracks.__call__`, `Ref.__call__`, `Haps.__call__`, and any kind variants. Each forwards `to_rc` to the fused kernel call on the rust branch only; the numba branch leaves it unused. For composite reconstructors (`SeqsTracks`, `HapsTracks`) forward the same `to_rc` to each sub-call.
+
+- [ ] **Step 2: Pass `to_rc` into the rust kernels**
+
+Replace the `to_rc=None` placeholders added in Tasks 2-6 with the forwarded `to_rc` (converted to a contiguous bool array on the rust branch: `None if to_rc is None else np.ascontiguousarray(to_rc, np.bool_)`). For tracks, the mask is per `(query, hap)` row — replicate the per-query mask across ploidy the same way `out_offsets` is laid out (mirror the existing `reverse_masked` broadcast: `np.repeat`/broadcast in C order to match `out_offsets` rows).
+
+- [ ] **Step 3: Rewire `_query.py` post-pass (the core change)**
+
+In `_getitem_unspliced`:
+
+```python
+to_rc = view.full_regions[r_idx, 3] == -1 if view.rc_neg else None
+recon = view.recon(..., to_rc=to_rc)
+if not isinstance(recon, tuple):
+    recon = (recon,)
+if view.rc_neg:
+    if _active_backend() == "numba":
+        recon = tuple(reverse_complement_ragged(r, to_rc) for r in recon)
+    else:
+        # rust folded flat-seq kinds in-kernel; only the deferred RaggedVariants
+        # (Target 7) still needs the Python pass.
+        recon = tuple(
+            reverse_complement_ragged(r, to_rc) if isinstance(r, RaggedVariants) else r
+            for r in recon
+        )
+```
+
+In `_getitem_spliced`: keep the existing `to_rc_per_elem` computation, pass it into `view.recon(..., to_rc=to_rc_per_elem)`, and apply the identical numba-vs-rust guard. (Spliced output is never `RaggedVariants`, so the rust branch is a no-op there.)
+
+- [ ] **Step 4: Rewire reference RC sites**
+
+In `_reference.py`: thread `to_rc` into `_get_reference_rust`/`get_reference`. For the standalone RefDataset spliced path (`:438-444`), apply the same backend guard — on rust pass `to_rc_perm` into `_fetch_spliced_ref`→`get_reference` and skip `per_elem.reverse_masked`; on numba keep `per_elem.reverse_masked(to_rc_perm, comp=_COMP)`. In `_ref.py`, pass `to_rc` into the unspliced `get_reference` call on the rust branch.
+
+- [ ] **Step 5: Confirm no other callers regressed**
+
+Run: `grep -rn "reverse_complement_ragged\|reverse_masked" python/`
+Expected: callers are only the numba-guarded post-pass + the RaggedVariants rust branch + the numba RefDataset branch. No stray unconditional RC remains on the rust path.
+
+- [ ] **Step 6: Run the parity net + cargo, both backends**
+
+Run:
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev cargo test --lib
+pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS on both backends (Task 7 fixtures now exercise rust in-kernel RC vs numba post-pass and stay byte-identical).
+
+- [ ] **Step 7: Full tree, both backends**
+
+Run:
+```bash
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format --check python/ tests/ && pixi run -e dev typecheck
+```
+Expected: PASS / clean.
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add python/genvarloader/_dataset/
+git commit -m "feat: fold strand RC into rust kernels; numba post-pass retained as oracle
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 9: perf re-measure + roadmap update
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md`
+
+**Interfaces:**
+- Consumes: the de-noised `tests/benchmarks/test_e2e.py` harness + `tests/benchmarks/profiling/profile.py`.
+
+- [ ] **Step 1: Re-measure rust÷numba ratios**
+
+Run (release build already done):
+```bash
+pixi run -e dev pytest tests/benchmarks/test_e2e.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Compare the **min** per-batch for `haplotypes`, `tracks-only`, `tracks-seqs`, `annotated` against the starting points (haplotypes 0.94×, tracks-only 0.63×, etc.).
+
+- [ ] **Step 2: Confirm RC self-time is gone from the rust profile**
+
+Run:
+```bash
+NUMBA_NUM_THREADS=1 perf record -F 999 -o p.data -- .pixi/envs/dev/bin/python \
+    tests/benchmarks/profiling/profile.py --mode haplotypes --n-batches 12000
+perf report --stdio --no-children -i p.data | head -40
+```
+Expected: no `reverse_complement_*` / seqpro RC frame in the rust flat profile.
+
+- [ ] **Step 3: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md` round-2 block: tick Target 6, record the re-measured ratios under the Phase 5 checkpoint, set the PR link, and set/confirm the marker that **Target 6 must merge before rayon**.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add docs/roadmaps/rust-migration.md
+git commit -m "docs(roadmap): record Target 6 RC fold results; gate rayon on 5+6+7
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Two primitives + `_COMP` LUT → Task 1. ✓
+- Five flat kinds in-kernel RC → Tasks 2 (reference), 3 (haplotypes), 4 (tracks, reverse-only), 5 (annotated, 3 buffers), 6 (splice, permuted). ✓
+- Mask computed in Python, threaded as `Option<bool>`; `None` fast path → Task 8 steps 1-2 + each kernel's `Option`. ✓
+- Insertion/trailing-fill ordering preserved (RC after forward write) → enforced by applying the primitive after the reconstruct core in every kernel task. ✓
+- Backend-conditional post-pass; numba oracle unchanged; `reverse_complement_ragged` retained → Task 8 step 3 (corrects the spec's "delete" wording per the approved decision). ✓
+- Third RC site `_reference.py:438` → Task 8 step 4. ✓
+- `RaggedVariants` deferred to Target 7; still post-passed on both backends → Task 8 step 3 (rust branch RaggedVariants-only). ✓
+- Vacuous-pass guard: strand=−1 fixtures + non-vacuity assertion → Task 7. ✓
+- Parity both backends + full tree + lint/typecheck → Task 8 steps 6-7. ✓
+- Perf re-measure + roadmap → Task 9. ✓
+- Scale guard not regressed: no `ascontiguousarray` added on memmaps (only on small mask/region arrays) → respected in Task 8 step 2. ✓
+
+**Type consistency:** `to_rc` is `Option<PyReadonlyArray1<bool>>` (pyfunction) / `Option<ArrayView1<bool>>` (core) / `NDArray[np.bool_] | None` (Python) throughout. Primitives named `reverse_flat_rows_inplace` / `rc_flat_rows_inplace` consistently. `_active_backend()` defined once (Task 8) and referenced in `_query.py`/`_reference.py`.
+
+**Note on numba kernel test red/green:** the per-kernel cargo tests (Tasks 2-6) validate the primitive call against hand-computed revcomp on synthetic buffers; the kernel-arity change is smoke-checked via `maturin develop` + import. End-to-end RC correctness is gated by the Task 7 fixtures across the Task 8 flip. If a reconstruct core is not directly callable in a pure-Rust test for a given kernel, rely on the primitive's Task-1 unit tests + the Task 7 parity net (documented per task).
diff --git a/docs/superpowers/plans/2026-06-25-target7-variant-windows-rust-assembly.md b/docs/superpowers/plans/2026-06-25-target7-variant-windows-rust-assembly.md
new file mode 100644
index 00000000..9353664f
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-target7-variant-windows-rust-assembly.md
@@ -0,0 +1,1669 @@
+# Target 7 — variant-windows/variants assembly in one Rust call — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Collapse the per-batch object/numpy-temporary churn on the `variants` + `variant-windows` flat-output read path into one flag-driven Rust call that owns the reference fetch + LUT tokenize + flank/window assembly and returns flat `(data, offsets)` buffers, so Python builds the wrapper objects once.
+
+**Architecture:** A new Rust module `src/variants/windows.rs` holds small pure cores (`tokenize`, `slice_flanks`, `assemble_alt_window`, `fetch_windows`) and two mode orchestrators (`assemble_variants_mode`, `assemble_windows_mode`) generic over the token type. Two FFI pyfunctions (`assemble_variant_buffers_u8`, `assemble_variant_buffers_i32`) monomorphize the token type and return a `dict[str, (data, seq_offsets)]`. Python keeps the cheap, dtype-polymorphic front-end (v_idxs gather / AF filter / scalar-field gather) and the `fill_empty_groups` post-pass; only the ragged byte/token assembly tail moves to Rust, behind the dispatch registry with the existing Python/numba helpers retained as the parity oracle.
+
+**Tech Stack:** Rust (`ndarray`, `numpy`/PyO3), Python (numpy, numba oracle), `pixi` for env/build/test, `maturin` for the Rust↔Python build, hypothesis + pytest parity harness.
+
+## Global Constraints
+
+- Branch `opt/target-7-windows-rust-assembly` off `zero-copy-scale-safe-readpath` (do NOT branch off `master`/`rust-migration`).
+- Byte-identical parity is the landing gate: the Rust output must equal the existing Python/numba assembly (dtype, shape, values) for both `variants` and `variant-windows`, across the full `ref`/`alt` ∈ {window, allele} mode matrix, empty groups, and the `flank_tokens` ride-along.
+- Front edge is **assembly tail only**: the v_idxs gather / AF filter / compaction / scalar-field gather stay in Python; the issue-#231 custom-FORMAT dtype-polymorphic numba fallback must remain intact (never route a custom-dtype field through the new typed Rust call).
+- `fill_empty_groups` stays a separate Python post-pass over the existing `fill_empty_seq/scalar/fixed` Rust cores — do NOT fold it into the new call.
+- Do NOT delete the numba/numpy assembly helpers (`compute_windows`, `compute_ref_window`, `compute_alt_window`, `tokenize_alleles`, `compute_flank_tokens`); they become the registered parity oracle.
+- Do NOT reintroduce per-batch `np.ascontiguousarray` on sample-scale memmaps (keep `tests/integration/test_scale_guard.py` green). The mega-call's globals come from `Haps.ffi_static` (sub-linear, already cached) + the variant `ref`-allele bytes.
+- Build after every Rust change: `pixi run -e dev maturin develop --release`. Rust unit tests: `pixi run -e dev cargo-test`. Python tests need `--basetemp=$(pwd)/.pytest_tmp` (HPC cross-device `os.link` Errno 18 guard).
+- `test_e2e_variants` is a **pre-existing xfail** (`_FlatVariants.to_fixed` missing) — confirm it xfails identically at base; not a regression introduced here.
+- Conventional commits; commit at the end of every task. End commit messages with the `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>` trailer.
+
+---
+
+## File Structure
+
+- **Create** `src/variants/windows.rs` — pure cores (`tokenize`, `slice_flanks`, `assemble_alt_window`, `fetch_windows`) + mode orchestrators (`assemble_variants_mode`, `assemble_windows_mode`) + the `VariantBufs<Tok>` return struct + Rust unit tests.
+- **Modify** `src/variants/mod.rs` — add `pub mod windows;` and re-export nothing else (cores stay in the submodule).
+- **Modify** `src/ffi/mod.rs` — two pyfunctions `assemble_variant_buffers_u8` / `assemble_variant_buffers_i32` returning a `PyDict`.
+- **Modify** `src/lib.rs` — `add_function` for both pyfunctions.
+- **Modify** `python/genvarloader/_dataset/_flat_flanks.py` — add `_assemble_variant_buffers_numba` (the oracle that composes existing helpers into the dict contract) — keeps all current helpers.
+- **Modify** `python/genvarloader/_dataset/_flat_variants.py` — register `assemble_variant_buffers`, add the Rust shim that selects the u8/i32 monomorphization, and rewrite the `get_variants_flat` assembly tail to call `get("assemble_variant_buffers")` and wrap the returned dict once.
+- **Modify** `tests/parity/_harness.py` — add `assert_kernel_parity_dict`.
+- **Create** `tests/parity/test_assemble_variant_buffers_parity.py` — mode-matrix + empty + flank parity.
+- **Modify** `tests/parity/test_dataset_parity.py` — spy that the kernel runs on the live windows/variants `__getitem__` path.
+- **Modify** `docs/roadmaps/rust-migration.md` — tick target 7, record re-measured ratios, set PR link.
+
+---
+
+### Task 1: Rust pure cores — `tokenize`, `slice_flanks`, `assemble_alt_window`
+
+**Files:**
+- Create: `src/variants/windows.rs`
+- Modify: `src/variants/mod.rs:1` (add `pub mod windows;`)
+- Test: cargo unit tests inside `src/variants/windows.rs`
+
+**Interfaces:**
+- Produces:
+  - `pub fn tokenize<Tok: Copy>(bytes: ArrayView1<u8>, lut: ArrayView1<Tok>) -> Array1<Tok>`
+  - `pub fn slice_flanks(data: ArrayView1<u8>, rw_off: ArrayView1<i64>, flank_len: usize) -> (Array1<u8>, Array1<u8>)` — each `(n*flank_len,)`, variant-major: `f5[i*L+k] = data[rw_off[i]+k]`, `f3[i*L+k] = data[rw_off[i+1]-L+k]`
+  - `pub fn assemble_alt_window(f5: ArrayView1<u8>, f3: ArrayView1<u8>, alt_data: ArrayView1<u8>, alt_seq_off: ArrayView1<i64>, flank_len: usize) -> (Array1<u8>, Array1<i64>)`
+
+- [ ] **Step 1: Create the module file with the three cores**
+
+Create `src/variants/windows.rs`:
+
+```rust
+//! Variant-windows / variants flat-buffer assembly cores (pure ndarray).
+//! PyO3 lives in `crate::ffi`. Mirrors the Python helpers in
+//! `_dataset/_flat_flanks.py` (`tokenize_alleles`, `_slice_flanks`,
+//! `_assemble_alt_windows`, `compute_*`) — byte-identical by construction.
+use ndarray::{Array1, ArrayView1};
+
+/// Apply a 256-entry byte->token lookup table. `out[i] = lut[bytes[i]]`.
+/// Mirrors numpy `lut[bytes]`. `Tok` is the token dtype (u8 or i32).
+pub fn tokenize<Tok: Copy>(bytes: ArrayView1<u8>, lut: ArrayView1<Tok>) -> Array1<Tok> {
+    let n = bytes.len();
+    let mut out: Vec<Tok> = Vec::with_capacity(n);
+    for i in 0..n {
+        out.push(lut[bytes[i] as usize]);
+    }
+    Array1::from_vec(out)
+}
+
+/// Derive per-variant (f5, f3) fixed-`flank_len` flanks from a contiguous
+/// per-variant window read `[start-L, end+L)`. `f5` = first `L` bytes of each
+/// row, `f3` = last `L`. Both returned flat `(n*L,)`, variant-major. Mirrors
+/// `_slice_flanks` (`f5 = data[rw_off[:-1,None]+cols]`,
+/// `f3 = data[rw_off[1:,None]-L+cols]`).
+pub fn slice_flanks(
+    data: ArrayView1<u8>,
+    rw_off: ArrayView1<i64>,
+    flank_len: usize,
+) -> (Array1<u8>, Array1<u8>) {
+    let n = rw_off.len() - 1;
+    let mut f5: Vec<u8> = Vec::with_capacity(n * flank_len);
+    let mut f3: Vec<u8> = Vec::with_capacity(n * flank_len);
+    for i in 0..n {
+        let s = rw_off[i] as usize;
+        let e = rw_off[i + 1] as usize;
+        for k in 0..flank_len {
+            f5.push(data[s + k]);
+        }
+        for k in 0..flank_len {
+            f3.push(data[e - flank_len + k]);
+        }
+    }
+    (Array1::from_vec(f5), Array1::from_vec(f3))
+}
+
+/// Concatenate `flank5 . alt . flank3` per variant into a flat byte buffer.
+/// `f5`/`f3` are `(n*flank_len,)` variant-major. Mirrors numba
+/// `_assemble_alt_windows`. Returns `(out_bytes, out_offsets)`.
+pub fn assemble_alt_window(
+    f5: ArrayView1<u8>,
+    f3: ArrayView1<u8>,
+    alt_data: ArrayView1<u8>,
+    alt_seq_off: ArrayView1<i64>,
+    flank_len: usize,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = alt_seq_off.len() - 1;
+    let mut out_off = Array1::<i64>::zeros(n + 1);
+    for i in 0..n {
+        let alt_len = alt_seq_off[i + 1] - alt_seq_off[i];
+        out_off[i + 1] = out_off[i] + 2 * flank_len as i64 + alt_len;
+    }
+    let total = out_off[n] as usize;
+    let mut out: Vec<u8> = Vec::with_capacity(total);
+    for i in 0..n {
+        for k in 0..flank_len {
+            out.push(f5[i * flank_len + k]);
+        }
+        for k in alt_seq_off[i] as usize..alt_seq_off[i + 1] as usize {
+            out.push(alt_data[k]);
+        }
+        for k in 0..flank_len {
+            out.push(f3[i * flank_len + k]);
+        }
+    }
+    (Array1::from_vec(out), out_off)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::arr1;
+
+    #[test]
+    fn test_tokenize_u8() {
+        // lut maps byte 65('A')->0, 67('C')->1, everything else->9 (unknown).
+        let mut lut = vec![9u8; 256];
+        lut[65] = 0;
+        lut[67] = 1;
+        let lut = Array1::from_vec(lut);
+        let bytes = arr1(&[65u8, 67, 78]); // A, C, N(unknown)
+        let out = tokenize(bytes.view(), lut.view());
+        assert_eq!(out.to_vec(), vec![0u8, 1, 9]);
+    }
+
+    #[test]
+    fn test_tokenize_i32() {
+        // i32 tokens (alphabet larger than 255 forces i32 in Python).
+        let mut lut = vec![999i32; 256];
+        lut[71] = 300; // 'G' -> 300
+        let lut = Array1::from_vec(lut);
+        let bytes = arr1(&[71u8, 84]); // G, T(unknown)
+        let out = tokenize(bytes.view(), lut.view());
+        assert_eq!(out.to_vec(), vec![300i32, 999]);
+    }
+
+    #[test]
+    fn test_slice_flanks() {
+        // 2 variants, L=2. var0 window=[1,2,3,4,5] (len 5), var1=[6,7,8,9] (len 4).
+        // rw_off = [0, 5, 9].
+        let data = arr1(&[1u8, 2, 3, 4, 5, 6, 7, 8, 9]);
+        let rw_off = arr1(&[0i64, 5, 9]);
+        let (f5, f3) = slice_flanks(data.view(), rw_off.view(), 2);
+        // f5: first 2 of each = [1,2 | 6,7]; f3: last 2 of each = [4,5 | 8,9]
+        assert_eq!(f5.to_vec(), vec![1u8, 2, 6, 7]);
+        assert_eq!(f3.to_vec(), vec![4u8, 5, 8, 9]);
+    }
+
+    #[test]
+    fn test_assemble_alt_window() {
+        // L=1. f5=[10|20], f3=[11|21]. alt: var0="A"(65), var1="CG"(67,71).
+        let f5 = arr1(&[10u8, 20]);
+        let f3 = arr1(&[11u8, 21]);
+        let alt_data = arr1(&[65u8, 67, 71]);
+        let alt_seq_off = arr1(&[0i64, 1, 3]);
+        let (out, off) = assemble_alt_window(
+            f5.view(),
+            f3.view(),
+            alt_data.view(),
+            alt_seq_off.view(),
+            1,
+        );
+        // var0: 10, 65, 11  (2*1 + 1 = 3 bytes)
+        // var1: 20, 67,71, 21  (2*1 + 2 = 4 bytes)
+        assert_eq!(out.to_vec(), vec![10u8, 65, 11, 20, 67, 71, 21]);
+        assert_eq!(off.to_vec(), vec![0i64, 3, 7]);
+    }
+}
+```
+
+- [ ] **Step 2: Wire the module in**
+
+Add to `src/variants/mod.rs` as the first line after the module doc comment (line 1):
+
+```rust
+pub mod windows;
+```
+
+- [ ] **Step 3: Run the cores' unit tests to verify they pass**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: the four new `windows::tests::*` tests PASS; existing tests still pass.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add src/variants/windows.rs src/variants/mod.rs
+rtk git commit -m "feat(variants): add tokenize/slice_flanks/assemble_alt_window cores
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 2: Rust `fetch_windows` helper (reference window reads)
+
+**Files:**
+- Modify: `src/variants/windows.rs`
+- Test: cargo unit test inside `src/variants/windows.rs`
+
+**Interfaces:**
+- Consumes: `crate::reference::get_reference(regions: ArrayView2<i32>, out_offsets: ArrayView1<i64>, reference: ArrayView1<u8>, ref_offsets: ArrayView1<i64>, pad_char: u8, parallel: bool) -> Array1<u8>`
+- Produces: `pub fn fetch_windows(v_contigs: ArrayView1<i32>, starts_v: ArrayView1<i32>, ilens_v: ArrayView1<i32>, flank_len: i64, reference: ArrayView1<u8>, ref_offsets: ArrayView1<i64>, pad_char: u8) -> (Array1<u8>, Array1<i64>)` — the per-variant `[start-L, end+L)` read flat buffer + its per-variant offsets (`rw_off`, len `n+1`). `ends = starts - min(ilen,0) + 1`.
+
+- [ ] **Step 1: Write the failing test**
+
+Add to the `tests` module in `src/variants/windows.rs`:
+
+```rust
+    #[test]
+    fn test_fetch_windows() {
+        use ndarray::Array1 as A1;
+        // Single contig reference: bytes 0..20.
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        // 1 variant, contig 0, start=5, ilen=0 (SNP) → end = 5 - 0 + 1 = 6.
+        // L=2 → read [start-L, end+L) = [3, 8) → bytes [3,4,5,6,7].
+        let v_contigs = arr1(&[0i32]);
+        let starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let (data, rw_off) = fetch_windows(
+            v_contigs.view(),
+            starts.view(),
+            ilens.view(),
+            2,
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        assert_eq!(data.to_vec(), vec![3u8, 4, 5, 6, 7]);
+        assert_eq!(rw_off.to_vec(), vec![0i64, 5]);
+    }
+
+    #[test]
+    fn test_fetch_windows_deletion_widens() {
+        use ndarray::Array1 as A1;
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        // ilen=-2 (2bp deletion) → end = start - (-2) + 1 = start + 3.
+        // start=5, L=1 → read [4, 9) → bytes [4,5,6,7,8] (len 5).
+        let v_contigs = arr1(&[0i32]);
+        let starts = arr1(&[5i32]);
+        let ilens = arr1(&[-2i32]);
+        let (data, rw_off) = fetch_windows(
+            v_contigs.view(),
+            starts.view(),
+            ilens.view(),
+            1,
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        assert_eq!(data.to_vec(), vec![4u8, 5, 6, 7, 8]);
+        assert_eq!(rw_off.to_vec(), vec![0i64, 5]);
+    }
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: FAIL — `cannot find function fetch_windows in this scope`.
+
+- [ ] **Step 3: Implement `fetch_windows`**
+
+Add to `src/variants/windows.rs` (above the `#[cfg(test)]` module). Note the `use` additions at the top of the file — change the import line to:
+
+```rust
+use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
+```
+
+Then add:
+
+```rust
+/// Fetch the per-variant reference window `[start-L, end+L)` into one flat
+/// buffer, with `ends = starts - min(ilen, 0) + 1`. Returns `(data, rw_off)`
+/// where `rw_off` are per-variant byte boundaries (len `n+1`). Reuses
+/// `reference::get_reference`'s padded core (absolute-coordinate OOB padding).
+/// Mirrors `reference.fetch(v_contigs, starts-L, ends+L)`.
+pub fn fetch_windows(
+    v_contigs: ArrayView1<i32>,
+    starts_v: ArrayView1<i32>,
+    ilens_v: ArrayView1<i32>,
+    flank_len: i64,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = starts_v.len();
+    let mut regions = Array2::<i32>::zeros((n, 3));
+    let mut rw_off = Array1::<i64>::zeros(n + 1);
+    for i in 0..n {
+        let start = starts_v[i] as i64;
+        let ilen = ilens_v[i] as i64;
+        let end = start - ilen.min(0) + 1;
+        let rstart = start - flank_len;
+        let rend = end + flank_len;
+        regions[[i, 0]] = v_contigs[i];
+        regions[[i, 1]] = rstart as i32;
+        regions[[i, 2]] = rend as i32;
+        rw_off[i + 1] = rw_off[i] + (rend - rstart);
+    }
+    let data = crate::reference::get_reference(
+        regions.view(),
+        rw_off.view(),
+        reference,
+        ref_offsets,
+        pad_char,
+        false, // serial: disjoint output already; this is per-variant fanout
+    );
+    (data, rw_off)
+}
+```
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: `windows::tests::test_fetch_windows` and `..._deletion_widens` PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/variants/windows.rs
+rtk git commit -m "feat(variants): add fetch_windows reference-read helper
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: Rust `assemble_variants_mode` orchestrator (byte alleles + flank_tokens)
+
+**Files:**
+- Modify: `src/variants/windows.rs`
+- Test: cargo unit test inside `src/variants/windows.rs`
+
+**Interfaces:**
+- Consumes: `crate::variants::gather_alleles(v_idxs, allele_bytes, allele_offsets) -> (Array1<u8>, Array1<i64>)`; Task 1/2 cores.
+- Produces:
+  - `pub struct VariantBufs<Tok> { pub byte_bufs: Vec<(&'static str, Array1<u8>, Array1<i64>)>, pub tok_bufs: Vec<(&'static str, Array1<Tok>, Array1<i64>)> }`
+  - `pub fn assemble_variants_mode<Tok: Copy>(...) -> VariantBufs<Tok>` (signature in Step 3)
+
+- [ ] **Step 1: Write the failing test**
+
+Add to the `tests` module in `src/variants/windows.rs`:
+
+```rust
+    #[test]
+    fn test_assemble_variants_mode_alt_and_flank() {
+        use ndarray::Array1 as A1;
+        // Global alleles: v0="A"(65), v1="CG"(67,71). offsets [0,1,3].
+        let alt_global = arr1(&[65u8, 67, 71]);
+        let alt_off = arr1(&[0i64, 1, 3]);
+        // Select v_idxs [1, 0] in one row.
+        let v_idxs = arr1(&[1i32, 0]);
+        let row_offsets = arr1(&[0i64, 2]);
+        // Reference 0..20, single contig. v_starts/ilens are GLOBAL (indexed by v_idx).
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32, 8]); // global per-variant
+        let ilens = arr1(&[0i32, 0]);
+        let v_contigs = arr1(&[0i32, 0]); // per-selected-variant contig
+        // L=1, token LUT: identity-ish u8 (byte value -> itself for the test).
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect());
+
+        let bufs = assemble_variants_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            alt_global.view(),
+            alt_off.view(),
+            None, // no ref alleles
+            None,
+            true, // want_flank
+            1,    // flank_len
+            Some(lut.view()),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        // byte_bufs: only "alt". v_idxs [1,0] → "CG" then "A" → [67,71,65], off [0,2,3].
+        assert_eq!(bufs.byte_bufs.len(), 1);
+        let (name, data, off) = &bufs.byte_bufs[0];
+        assert_eq!(*name, "alt");
+        assert_eq!(data.to_vec(), vec![67u8, 71, 65]);
+        assert_eq!(off.to_vec(), vec![0i64, 2, 3]);
+        // tok_bufs: only "flank_tokens". Each variant: [f5(1) | f3(1)] = 2 tokens.
+        // var0 = v_idx 1: start=8, ilen=0 → end=9, read [7,10) = [7,8,9]; f5=[7], f3=[9].
+        // var1 = v_idx 0: start=5, ilen=0 → end=6, read [4,7) = [4,5,6]; f5=[4], f3=[6].
+        // tokens (identity lut) = [7,9, 4,6]; offsets = row_offsets [0,2].
+        assert_eq!(bufs.tok_bufs.len(), 1);
+        let (tname, tdata, toff) = &bufs.tok_bufs[0];
+        assert_eq!(*tname, "flank_tokens");
+        assert_eq!(tdata.to_vec(), vec![7u8, 9, 4, 6]);
+        assert_eq!(toff.to_vec(), vec![0i64, 2]);
+    }
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: FAIL — `cannot find function assemble_variants_mode` / `cannot find struct VariantBufs`.
+
+- [ ] **Step 3: Implement the struct + orchestrator**
+
+Add to `src/variants/windows.rs` (above the `#[cfg(test)]` module):
+
+```rust
+/// Assembled flat buffers returned by the mode orchestrators. `byte_bufs` carry
+/// raw allele bytes (u8); `tok_bufs` carry LUT-applied tokens (`Tok`). Each
+/// tuple is `(field_name, data, seq_offsets)`.
+pub struct VariantBufs<Tok> {
+    pub byte_bufs: Vec<(&'static str, Array1<u8>, Array1<i64>)>,
+    pub tok_bufs: Vec<(&'static str, Array1<Tok>, Array1<i64>)>,
+}
+
+/// Gather per-selected-variant `start`/`ilen` from the GLOBAL arrays via `v_idxs`.
+fn gather_starts_ilens(
+    v_idxs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+) -> (Array1<i32>, Array1<i32>) {
+    let n = v_idxs.len();
+    let mut s = Array1::<i32>::zeros(n);
+    let mut il = Array1::<i32>::zeros(n);
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        s[i] = v_starts[v];
+        il[i] = ilens[v];
+    }
+    (s, il)
+}
+
+/// Plain-`variants` assembly tail: raw alt bytes (always), raw ref bytes
+/// (optional), `flank_tokens` ride-along (optional). Mirrors the variants tail
+/// of `get_variants_flat` (gather_alleles + compute_flank_tokens).
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variants_mode<Tok: Copy>(
+    v_idxs: ArrayView1<i32>,
+    row_offsets: ArrayView1<i64>,
+    alt_global: ArrayView1<u8>,
+    alt_off_global: ArrayView1<i64>,
+    ref_global: Option<ArrayView1<u8>>,
+    ref_off_global: Option<ArrayView1<i64>>,
+    want_flank: bool,
+    flank_len: i64,
+    lut: Option<ArrayView1<Tok>>,
+    v_contigs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> VariantBufs<Tok> {
+    let mut byte_bufs = Vec::new();
+    let mut tok_bufs = Vec::new();
+
+    let (alt_data, alt_seq_off) =
+        crate::variants::gather_alleles(v_idxs, alt_global, alt_off_global);
+    byte_bufs.push(("alt", alt_data, alt_seq_off));
+
+    if let (Some(rg), Some(ro)) = (ref_global, ref_off_global) {
+        let (ref_data, ref_seq_off) = crate::variants::gather_alleles(v_idxs, rg, ro);
+        byte_bufs.push(("ref", ref_data, ref_seq_off));
+    }
+
+    if want_flank {
+        let lut = lut.expect("flank tokens requested but no token LUT supplied");
+        let (starts_v, ilens_v) = gather_starts_ilens(v_idxs, v_starts, ilens);
+        let (rw_data, rw_off) = fetch_windows(
+            v_contigs, starts_v.view(), ilens_v.view(), flank_len, reference, ref_offsets,
+            pad_char,
+        );
+        let l = flank_len as usize;
+        let (f5, f3) = slice_flanks(rw_data.view(), rw_off.view(), l);
+        // Concatenate [f5 | f3] per variant (2L tokens, variant-major), tokenize.
+        let n = f5.len() / l;
+        let mut flank_bytes: Vec<u8> = Vec::with_capacity(n * 2 * l);
+        for i in 0..n {
+            for k in 0..l {
+                flank_bytes.push(f5[i * l + k]);
+            }
+            for k in 0..l {
+                flank_bytes.push(f3[i * l + k]);
+            }
+        }
+        let fb = Array1::from_vec(flank_bytes);
+        let tok = tokenize(fb.view(), lut);
+        // flank_tokens offsets are the variant-level row_offsets (fixed 2L inner
+        // axis carried separately Python-side as a trailing regular dim).
+        tok_bufs.push(("flank_tokens", tok, row_offsets.to_owned()));
+    }
+
+    VariantBufs { byte_bufs, tok_bufs }
+}
+```
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: `test_assemble_variants_mode_alt_and_flank` PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/variants/windows.rs
+rtk git commit -m "feat(variants): assemble_variants_mode (alt/ref bytes + flank tokens)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 4: Rust `assemble_windows_mode` orchestrator (token windows)
+
+**Files:**
+- Modify: `src/variants/windows.rs`
+- Test: cargo unit test inside `src/variants/windows.rs`
+
+**Interfaces:**
+- Consumes: Task 1/2/3 cores + `gather_alleles`.
+- Produces: `pub fn assemble_windows_mode<Tok: Copy>(...) -> VariantBufs<Tok>` (signature in Step 3). `ref_mode`/`alt_mode`: `1` = window (flanked, tokenized), `2` = allele (bare tokenized). Field names: `ref_window`/`alt_window` for mode 1, `ref`/`alt` for mode 2.
+
+- [ ] **Step 1: Write the failing test**
+
+Add to the `tests` module in `src/variants/windows.rs`:
+
+```rust
+    #[test]
+    fn test_assemble_windows_mode_both_windows() {
+        use ndarray::Array1 as A1;
+        // Global alt alleles: v0="A"(65). offsets [0,1].
+        let alt_global = arr1(&[65u8]);
+        let alt_off = arr1(&[0i64, 1]);
+        let v_idxs = arr1(&[0i32]);
+        let row_offsets = arr1(&[0i64, 1]);
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let v_contigs = arr1(&[0i32]);
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect()); // identity
+
+        let bufs = assemble_windows_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            1, // ref_mode = window
+            1, // alt_mode = window
+            alt_global.view(),
+            alt_off.view(),
+            None,
+            None,
+            1, // flank_len
+            lut.view(),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        // SNP start=5 ilen=0 → end=6; read [4,7) = [4,5,6]. L=1.
+        // ref_window tokens (identity) = [4,5,6], off [0,3].
+        // alt_window = f5[4] . alt[65] . f3[6] = [4,65,6], off [0,3].
+        assert_eq!(bufs.byte_bufs.len(), 0);
+        let names: Vec<&str> = bufs.tok_bufs.iter().map(|t| t.0).collect();
+        assert_eq!(names, vec!["ref_window", "alt_window"]);
+        assert_eq!(bufs.tok_bufs[0].1.to_vec(), vec![4u8, 5, 6]);
+        assert_eq!(bufs.tok_bufs[0].2.to_vec(), vec![0i64, 3]);
+        assert_eq!(bufs.tok_bufs[1].1.to_vec(), vec![4u8, 65, 6]);
+        assert_eq!(bufs.tok_bufs[1].2.to_vec(), vec![0i64, 3]);
+    }
+
+    #[test]
+    fn test_assemble_windows_mode_bare_alleles() {
+        use ndarray::Array1 as A1;
+        // alt v0="AC"(65,67); ref v0="G"(71).
+        let alt_global = arr1(&[65u8, 67]);
+        let alt_off = arr1(&[0i64, 2]);
+        let ref_global = arr1(&[71u8]);
+        let ref_off = arr1(&[0i64, 1]);
+        let v_idxs = arr1(&[0i32]);
+        let row_offsets = arr1(&[0i64, 1]);
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let v_contigs = arr1(&[0i32]);
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect());
+
+        let bufs = assemble_windows_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            2, // ref_mode = allele (bare)
+            2, // alt_mode = allele (bare)
+            alt_global.view(),
+            alt_off.view(),
+            Some(ref_global.view()),
+            Some(ref_off.view()),
+            1,
+            lut.view(),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        let names: Vec<&str> = bufs.tok_bufs.iter().map(|t| t.0).collect();
+        assert_eq!(names, vec!["ref", "alt"]);
+        // bare ref tokens = [71], off [0,1]; bare alt tokens = [65,67], off [0,2].
+        assert_eq!(bufs.tok_bufs[0].1.to_vec(), vec![71u8]);
+        assert_eq!(bufs.tok_bufs[0].2.to_vec(), vec![0i64, 1]);
+        assert_eq!(bufs.tok_bufs[1].1.to_vec(), vec![65u8, 67]);
+        assert_eq!(bufs.tok_bufs[1].2.to_vec(), vec![0i64, 2]);
+    }
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: FAIL — `cannot find function assemble_windows_mode`.
+
+- [ ] **Step 3: Implement `assemble_windows_mode`**
+
+Add to `src/variants/windows.rs` (above the `#[cfg(test)]` module):
+
+```rust
+/// `variant-windows` assembly tail. `ref_mode`/`alt_mode`: 1 = flanked window
+/// (`[start-L,end+L)` for ref; `flank5.alt.flank3` for alt), 2 = bare tokenized
+/// allele. Produces only token buffers (scalar fields are handled Python-side).
+/// Mirrors the windows branch of `get_variants_flat` (incl. the single fused
+/// fetch shared by ref_window + alt_window).
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_windows_mode<Tok: Copy>(
+    v_idxs: ArrayView1<i32>,
+    _row_offsets: ArrayView1<i64>,
+    ref_mode: i64,
+    alt_mode: i64,
+    alt_global: ArrayView1<u8>,
+    alt_off_global: ArrayView1<i64>,
+    ref_global: Option<ArrayView1<u8>>,
+    ref_off_global: Option<ArrayView1<i64>>,
+    flank_len: i64,
+    lut: ArrayView1<Tok>,
+    v_contigs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> VariantBufs<Tok> {
+    let mut tok_bufs = Vec::new();
+    let l = flank_len as usize;
+
+    // alt alleles are always gathered (needed for alt window or bare alt).
+    let (alt_data, alt_seq_off) =
+        crate::variants::gather_alleles(v_idxs, alt_global, alt_off_global);
+
+    // One fused fetch if either side needs a window read.
+    let need_fetch = ref_mode == 1 || alt_mode == 1;
+    let fetched = if need_fetch {
+        let (starts_v, ilens_v) = gather_starts_ilens(v_idxs, v_starts, ilens);
+        Some(fetch_windows(
+            v_contigs, starts_v.view(), ilens_v.view(), flank_len, reference, ref_offsets,
+            pad_char,
+        ))
+    } else {
+        None
+    };
+
+    // ref side (ordered first to match Python field insertion order).
+    if ref_mode == 1 {
+        let (rw_data, rw_off) = fetched.as_ref().expect("ref window needs a fetch");
+        let tok = tokenize(rw_data.view(), lut);
+        tok_bufs.push(("ref_window", tok, rw_off.clone()));
+    } else if ref_mode == 2 {
+        let rg = ref_global.expect("bare ref allele needs ref byte buffer");
+        let ro = ref_off_global.expect("bare ref allele needs ref offsets");
+        let (ref_data, ref_seq_off) = crate::variants::gather_alleles(v_idxs, rg, ro);
+        let tok = tokenize(ref_data.view(), lut);
+        tok_bufs.push(("ref", tok, ref_seq_off));
+    }
+
+    // alt side.
+    if alt_mode == 1 {
+        let (rw_data, rw_off) = fetched.as_ref().expect("alt window needs a fetch");
+        let (f5, f3) = slice_flanks(rw_data.view(), rw_off.view(), l);
+        let (alt_bytes, alt_off) = assemble_alt_window(
+            f5.view(),
+            f3.view(),
+            alt_data.view(),
+            alt_seq_off.view(),
+            l,
+        );
+        let tok = tokenize(alt_bytes.view(), lut);
+        tok_bufs.push(("alt_window", tok, alt_off));
+    } else if alt_mode == 2 {
+        let tok = tokenize(alt_data.view(), lut);
+        tok_bufs.push(("alt", tok, alt_seq_off));
+    }
+
+    VariantBufs { byte_bufs: Vec::new(), tok_bufs }
+}
+```
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: both `test_assemble_windows_mode_*` PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/variants/windows.rs
+rtk git commit -m "feat(variants): assemble_windows_mode (token windows + bare alleles)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 5: FFI pyfunctions + registration
+
+**Files:**
+- Modify: `src/ffi/mod.rs`
+- Modify: `src/lib.rs:36` (after the last `add_function` for variants)
+- Test: Python smoke import (Step 5)
+
+**Interfaces:**
+- Produces two Python-callable functions, importable as
+  `from genvarloader.genvarloader import assemble_variant_buffers_u8, assemble_variant_buffers_i32`.
+- Signature (identical for both; the suffix names the token dtype `Tok`):
+  ```
+  assemble_variant_buffers_<tok>(
+      mode: int,                # 0 = variants, 1 = windows
+      v_idxs: i32[n],
+      row_offsets: i64[b*p+1],
+      alt_global: u8[],
+      alt_off_global: i64[],
+      ref_global: Optional[u8[]],
+      ref_off_global: Optional[i64[]],
+      want_ref_bytes: bool,     # variants mode: emit raw "ref" bytes
+      want_flank: bool,         # variants mode: emit "flank_tokens"
+      ref_mode: int,            # windows mode: 1 window / 2 allele
+      alt_mode: int,            # windows mode: 1 window / 2 allele
+      flank_len: int,
+      lut: Optional[<tok>[256]],
+      v_contigs: i32[n],
+      v_starts: i32[],          # global per-variant
+      ilens: i32[],             # global per-variant
+      reference: u8[],
+      ref_offsets: i64[],       # contig offsets
+      pad_char: int,
+  ) -> dict[str, tuple[np.ndarray, np.ndarray]]   # name -> (data, seq_offsets)
+  ```
+
+- [ ] **Step 1: Add the shared dict-builder + two pyfunctions**
+
+Add to the top imports of `src/ffi/mod.rs` (extend the existing `use` lines):
+
+```rust
+use numpy::PyArrayMethods;
+use pyo3::types::PyDict;
+use crate::variants::windows::{assemble_variants_mode, assemble_windows_mode, VariantBufs};
+```
+
+Add these functions to `src/ffi/mod.rs` (near the other variants pyfunctions):
+
+```rust
+/// Build the `{name: (data, seq_offsets)}` dict from assembled buffers.
+fn bufs_to_pydict<'py, Tok: numpy::Element + Copy>(
+    py: Python<'py>,
+    bufs: VariantBufs<Tok>,
+) -> Bound<'py, PyDict> {
+    let d = PyDict::new(py);
+    for (name, data, off) in bufs.byte_bufs {
+        d.set_item(name, (data.into_pyarray(py), off.into_pyarray(py)))
+            .unwrap();
+    }
+    for (name, data, off) in bufs.tok_bufs {
+        d.set_item(name, (data.into_pyarray(py), off.into_pyarray(py)))
+            .unwrap();
+    }
+    d
+}
+
+/// Monomorphized assembly entry. `Tok` is the token dtype; `mode` selects
+/// variants (0) vs windows (1). See module docs in `variants::windows`.
+#[allow(clippy::too_many_arguments)]
+fn assemble_variant_buffers_impl<'py, Tok: numpy::Element + Copy>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<Tok>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    let rg = ref_global.as_ref().map(|a| a.as_array());
+    let ro = ref_off_global.as_ref().map(|a| a.as_array());
+    let lut_v = lut.as_ref().map(|a| a.as_array());
+    let bufs = if mode == 0 {
+        assemble_variants_mode::<Tok>(
+            v_idxs.as_array(),
+            row_offsets.as_array(),
+            alt_global.as_array(),
+            alt_off_global.as_array(),
+            if want_ref_bytes { rg } else { None },
+            if want_ref_bytes { ro } else { None },
+            want_flank,
+            flank_len,
+            lut_v,
+            v_contigs.as_array(),
+            v_starts.as_array(),
+            ilens.as_array(),
+            reference.as_array(),
+            ref_offsets.as_array(),
+            pad_char,
+        )
+    } else {
+        assemble_windows_mode::<Tok>(
+            v_idxs.as_array(),
+            row_offsets.as_array(),
+            ref_mode,
+            alt_mode,
+            alt_global.as_array(),
+            alt_off_global.as_array(),
+            rg,
+            ro,
+            flank_len,
+            lut_v.expect("windows mode requires a token LUT"),
+            v_contigs.as_array(),
+            v_starts.as_array(),
+            ilens.as_array(),
+            reference.as_array(),
+            ref_offsets.as_array(),
+            pad_char,
+        )
+    };
+    bufs_to_pydict(py, bufs)
+}
+
+/// u8-token assembly (token_dtype == uint8). See `assemble_variant_buffers_impl`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variant_buffers_u8<'py>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<u8>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    assemble_variant_buffers_impl::<u8>(
+        py, mode, v_idxs, row_offsets, alt_global, alt_off_global, ref_global,
+        ref_off_global, want_ref_bytes, want_flank, ref_mode, alt_mode, flank_len,
+        lut, v_contigs, v_starts, ilens, reference, ref_offsets, pad_char,
+    )
+}
+
+/// i32-token assembly (token_dtype == int32). See `assemble_variant_buffers_impl`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variant_buffers_i32<'py>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<i32>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    assemble_variant_buffers_impl::<i32>(
+        py, mode, v_idxs, row_offsets, alt_global, alt_off_global, ref_global,
+        ref_off_global, want_ref_bytes, want_flank, ref_mode, alt_mode, flank_len,
+        lut, v_contigs, v_starts, ilens, reference, ref_offsets, pad_char,
+    )
+}
+```
+
+- [ ] **Step 2: Register both in `src/lib.rs`**
+
+After the line `m.add_function(wrap_pyfunction!(ffi::fill_empty_seq_i32, m)?)?;` (currently `src/lib.rs:35`), add:
+
+```rust
+    m.add_function(wrap_pyfunction!(ffi::assemble_variant_buffers_u8, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::assemble_variant_buffers_i32, m)?)?;
+```
+
+- [ ] **Step 3: Build the extension**
+
+Run: `pixi run -e dev maturin develop --release 2>&1 | rtk err`
+Expected: builds clean (no errors). Warnings about `too_many_arguments` are suppressed by the `allow` attributes.
+
+- [ ] **Step 4: Run the Rust unit tests again (regression)**
+
+Run: `pixi run -e dev cargo-test 2>&1 | rtk err`
+Expected: all `windows::tests::*` plus existing tests PASS.
+
+- [ ] **Step 5: Smoke-test the import**
+
+Run:
+```bash
+pixi run -e dev python -c "from genvarloader.genvarloader import assemble_variant_buffers_u8, assemble_variant_buffers_i32; print('ok')"
+```
+Expected: prints `ok`.
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add src/ffi/mod.rs src/lib.rs
+rtk git commit -m "feat(ffi): assemble_variant_buffers_{u8,i32} pyfunctions
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 6: Python numba oracle + dispatch registration + dict parity harness
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_flat_flanks.py`
+- Modify: `python/genvarloader/_dataset/_flat_variants.py` (imports + register block)
+- Modify: `tests/parity/_harness.py`
+- Test: `tests/parity/test_assemble_variant_buffers_parity.py` (created in Task 8; harness verified here via a tiny inline check)
+
+**Interfaces:**
+- Produces:
+  - `_flat_flanks._assemble_variant_buffers_numba(mode, v_idxs, row_offsets, alt_global, alt_off_global, ref_global, ref_off_global, want_ref_bytes, want_flank, ref_mode, alt_mode, flank_len, lut, v_contigs, v_starts, ilens, reference, ref_offsets, pad_char) -> dict[str, tuple[np.ndarray, np.ndarray]]` — same contract as the Rust pyfunctions, composed from the existing helpers.
+  - `_flat_variants._assemble_variant_buffers_rust(...same args...)` — the dtype-selecting shim.
+  - dispatch key `"assemble_variant_buffers"` (default `"rust"`).
+  - `tests.parity._harness.assert_kernel_parity_dict(name, *inputs)`.
+
+- [ ] **Step 1: Write the numba oracle composing existing helpers**
+
+Add to `python/genvarloader/_dataset/_flat_flanks.py` (after the existing imports and `from ._flat_variants import _FlatWindow`):
+
+```python
+from ._flat_variants import _gather_alleles  # noqa: E402  (numba/rust dispatch gather)
+
+
+def _assemble_variant_buffers_numba(
+    mode,
+    v_idxs,
+    row_offsets,
+    alt_global,
+    alt_off_global,
+    ref_global,
+    ref_off_global,
+    want_ref_bytes,
+    want_flank,
+    ref_mode,
+    alt_mode,
+    flank_len,
+    lut,
+    v_contigs,
+    v_starts,
+    ilens,
+    reference,
+    ref_offsets,
+    pad_char,
+):
+    """Parity oracle: compose the existing numpy/numba assembly helpers into the
+    same ``{name: (data, seq_offsets)}`` dict the Rust mega-call returns.
+
+    ``reference``/``ref_offsets``/``pad_char`` are the raw reference-genome
+    arrays; this oracle wraps them in a lightweight fetch shim so it can reuse
+    ``compute_*`` unchanged."""
+    from numpy.typing import NDArray  # noqa: F401
+
+    out: dict = {}
+    v_idxs = np.ascontiguousarray(v_idxs, np.int32)
+    row_offsets = np.ascontiguousarray(row_offsets, np.int64)
+
+    # per-selected-variant start/ilen (global arrays indexed by v_idxs)
+    starts_v = np.asarray(v_starts, np.int32)[v_idxs]
+    ilens_v = np.asarray(ilens, np.int32)[v_idxs]
+    v_contigs = np.ascontiguousarray(v_contigs, np.int32)
+
+    class _RefShim:
+        """Minimal reference.fetch() over raw arrays, matching Reference.fetch."""
+
+        def fetch(self, contigs, starts, ends):
+            from .._ragged import Ragged
+            from ..genvarloader import get_reference
+
+            lengths = np.asarray(ends) - np.asarray(starts)
+            from .._utils import lengths_to_offsets
+
+            offs = lengths_to_offsets(lengths)
+            regions = np.stack(
+                [
+                    np.asarray(contigs, np.int32),
+                    np.asarray(starts, np.int32),
+                    np.asarray(ends, np.int32),
+                ],
+                axis=1,
+            )
+            seqs = get_reference(
+                regions,
+                offs,
+                np.asarray(reference, np.uint8),
+                np.asarray(ref_offsets, np.int64),
+                int(pad_char),
+                False,
+            )
+            return Ragged.from_offsets(seqs.view("S1"), (len(contigs), None), offs)
+
+    ref_shim = _RefShim()
+    lut_arr = None if lut is None else np.asarray(lut)
+
+    if mode == 0:
+        alt_data, alt_seq_off = _gather_alleles(v_idxs, alt_global, alt_off_global)
+        out["alt"] = (np.ascontiguousarray(alt_data, np.uint8), alt_seq_off)
+        if want_ref_bytes:
+            ref_data, ref_seq_off = _gather_alleles(v_idxs, ref_global, ref_off_global)
+            out["ref"] = (np.ascontiguousarray(ref_data, np.uint8), ref_seq_off)
+        if want_flank:
+            tok, off = compute_flank_tokens(
+                ref_shim, v_contigs, starts_v, ilens_v, flank_len, lut_arr, row_offsets
+            )
+            out["flank_tokens"] = (tok, np.asarray(off, np.int64))
+    else:
+        alt_data, alt_seq_off = _gather_alleles(v_idxs, alt_global, alt_off_global)
+        if ref_mode == 1:
+            rw = compute_ref_window(
+                ref_shim, v_contigs, starts_v, ilens_v, flank_len, lut_arr, row_offsets
+            )
+            out["ref_window"] = (rw.data, rw.seq_offsets)
+        elif ref_mode == 2:
+            ref_data, ref_seq_off = _gather_alleles(v_idxs, ref_global, ref_off_global)
+            rw = tokenize_alleles(ref_data, ref_seq_off, lut_arr, row_offsets)
+            out["ref"] = (rw.data, rw.seq_offsets)
+        if alt_mode == 1:
+            aw = compute_alt_window(
+                ref_shim, v_contigs, starts_v, ilens_v, alt_data, alt_seq_off,
+                flank_len, lut_arr, row_offsets,
+            )
+            out["alt_window"] = (aw.data, aw.seq_offsets)
+        elif alt_mode == 2:
+            aw = tokenize_alleles(alt_data, alt_seq_off, lut_arr, row_offsets)
+            out["alt"] = (aw.data, aw.seq_offsets)
+    return out
+```
+
+> Note: confirm the import paths `from .._ragged import Ragged`, `from .._utils import lengths_to_offsets`, and `from ..genvarloader import get_reference` resolve in this package (grep them: `rtk grep "def lengths_to_offsets" python/genvarloader/_utils.py` and `rtk grep "get_reference" python/genvarloader/__init__.py` / the compiled module). If `get_reference` is not yet exported from the Python package, import it from `..genvarloader` (the compiled extension) — it is already used by `_reference.py:143`, so mirror that exact import.
+
+- [ ] **Step 2: Add the Rust dtype-selecting shim + register the kernel**
+
+In `python/genvarloader/_dataset/_flat_variants.py`, add to the rust imports block (near the other `from ..genvarloader import ... as ..._rust`):
+
+```python
+from ..genvarloader import assemble_variant_buffers_i32 as _assemble_i32_rust
+from ..genvarloader import assemble_variant_buffers_u8 as _assemble_u8_rust
+```
+
+Then add the shim + registration (place it after the existing `register(...)` blocks, e.g. after the `fill_empty_seq` registrations):
+
+```python
+def _assemble_variant_buffers_rust(
+    mode,
+    v_idxs,
+    row_offsets,
+    alt_global,
+    alt_off_global,
+    ref_global,
+    ref_off_global,
+    want_ref_bytes,
+    want_flank,
+    ref_mode,
+    alt_mode,
+    flank_len,
+    lut,
+    v_contigs,
+    v_starts,
+    ilens,
+    reference,
+    ref_offsets,
+    pad_char,
+):
+    """Select the u8/i32 monomorphization by token dtype. ``lut`` is None only
+    when no tokenized output is requested (plain variants, no flank); then the
+    u8 entry is used and ``lut`` stays None."""
+    fn = _assemble_u8_rust
+    if lut is not None and np.asarray(lut).dtype == np.int32:
+        fn = _assemble_i32_rust
+    return fn(
+        int(mode),
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(row_offsets, np.int64),
+        np.ascontiguousarray(alt_global, np.uint8),
+        np.ascontiguousarray(alt_off_global, np.int64),
+        None if ref_global is None else np.ascontiguousarray(ref_global, np.uint8),
+        None if ref_off_global is None else np.ascontiguousarray(ref_off_global, np.int64),
+        bool(want_ref_bytes),
+        bool(want_flank),
+        int(ref_mode),
+        int(alt_mode),
+        int(flank_len),
+        None if lut is None else np.ascontiguousarray(lut),
+        np.ascontiguousarray(v_contigs, np.int32),
+        np.ascontiguousarray(v_starts, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+        np.ascontiguousarray(reference, np.uint8),
+        np.ascontiguousarray(ref_offsets, np.int64),
+        int(pad_char),
+    )
+
+
+def _assemble_variant_buffers_numba_entry(*args):
+    from ._flat_flanks import _assemble_variant_buffers_numba
+
+    return _assemble_variant_buffers_numba(*args)
+
+
+register(
+    "assemble_variant_buffers",
+    numba=_assemble_variant_buffers_numba_entry,
+    rust=_assemble_variant_buffers_rust,
+    default="rust",
+)
+```
+
+> The numba entry is a thin lazy wrapper to avoid a circular import (`_flat_flanks` imports from `_flat_variants`).
+
+- [ ] **Step 3: Add the dict parity assertion to the harness**
+
+Add to `tests/parity/_harness.py`:
+
+```python
+def assert_kernel_parity_dict(name: str, *inputs) -> None:
+    """Parity for kernels that RETURN a dict[str, tuple[ndarray, ...]].
+
+    Asserts identical key sets and byte-identical values per key (dtype, shape,
+    values) between the numba and rust backends.
+    """
+    numba_fn, rust_fn = _dispatch.backends(name)
+    got_numba = numba_fn(*inputs)
+    got_rust = rust_fn(*inputs)
+    assert set(got_numba) == set(got_rust), (
+        f"{name}: keys {sorted(got_numba)} != {sorted(got_rust)}"
+    )
+    for key in got_numba:
+        nt = got_numba[key]
+        rt = got_rust[key]
+        assert len(nt) == len(rt), f"{name}[{key}]: tuple len {len(nt)} != {len(rt)}"
+        for i, (a, b) in enumerate(zip(nt, rt)):
+            a = np.asarray(a)
+            b = np.asarray(b)
+            assert a.dtype == b.dtype, f"{name}[{key}][{i}]: dtype {a.dtype} != {b.dtype}"
+            assert a.shape == b.shape, f"{name}[{key}][{i}]: shape {a.shape} != {b.shape}"
+            np.testing.assert_array_equal(a, b)
+```
+
+- [ ] **Step 4: Build + verify the registration imports cleanly**
+
+Run:
+```bash
+pixi run -e dev maturin develop --release 2>&1 | rtk err
+pixi run -e dev python -c "import genvarloader._dataset._flat_variants as m; from genvarloader._dispatch import backends; print(backends('assemble_variant_buffers'))"
+```
+Expected: prints the `(numba_entry, rust_shim)` callables tuple — confirms the key registered.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_flat_flanks.py python/genvarloader/_dataset/_flat_variants.py tests/parity/_harness.py
+rtk git commit -m "feat(variants): register assemble_variant_buffers (rust default, numba oracle)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 7: Rewrite `get_variants_flat` assembly tail to call the dispatched kernel
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_flat_variants.py:974-1083` (the windows branch + flank ride-along + the alt/ref allele gather in the scalar-field block)
+- Test: covered by Task 8 parity + the existing `tests/parity/test_variants_dataset_parity.py`
+
+**Interfaces:**
+- Consumes: `get("assemble_variant_buffers")(...)` from Task 6 returning `dict[str, (data, seq_off)]`.
+- Produces: unchanged public return types `_FlatVariants` / `_FlatVariantWindows` (callers see no change).
+
+- [ ] **Step 1: Replace the alt/ref allele gather + windows branch + flank ride-along**
+
+In `get_variants_flat`, the current flow gathers `alt` (and optional `ref`) alleles inline (lines ~927-942), then later builds windows (lines ~974-1055) and the flank ride-along (lines ~1057-1077). Replace those three regions so the **ragged** buffers come from one dispatched call, while **scalar** fields stay inline.
+
+Concretely, after the scalar/dosage/custom fields are built into `fields` (keep all of that), compute the shared inputs and call the kernel:
+
+```python
+    from .._haps import _HapsFfiStatic  # noqa: F401  (type only)
+
+    stat = haps.ffi_static
+    # v_contigs: per-selected-variant contig id (only needed when fetching).
+    needs_fetch = (
+        regions is not None
+        and haps.token_lut is not None
+        and (
+            (issubclass(haps.kind, _FlatVariantWindows) and opt is not None)
+            or bool(haps.flank_length)
+        )
+    )
+    if needs_fetch:
+        regions_arr = np.asarray(regions)
+        group_contigs = np.repeat(regions_arr[:, 0], eff_ploidy)
+        v_contigs = np.repeat(group_contigs, np.diff(row_offsets)).astype(np.int32)
+    else:
+        v_contigs = np.zeros(len(v_idxs), np.int32)
+
+    ref_present = "ref" in haps.var_fields and haps.variants.ref is not None
+    ref_global = ref_off_global = None
+    if ref_present or (
+        issubclass(haps.kind, _FlatVariantWindows)
+        and opt is not None
+        and (opt.ref == "allele")
+    ):
+        ref_global = np.asarray(haps.variants.ref.data).view(np.uint8)
+        ref_off_global = np.asarray(haps.variants.ref.offsets, np.int64)
+```
+
+- [ ] **Step 2: Build the windows-mode result from the dict**
+
+Replace the windows branch (`if regions is not None and issubclass(haps.kind, _FlatVariantWindows) and opt is not None:` ... `return win`) with:
+
+```python
+    opt = haps.window_opt
+    if (
+        regions is not None
+        and issubclass(haps.kind, _FlatVariantWindows)
+        and opt is not None
+    ):
+        L = opt.flank_length
+        ref_mode = 1 if opt.ref == "window" else 2
+        alt_mode = 1 if opt.alt == "window" else 2
+        bufs = get("assemble_variant_buffers")(
+            1,  # windows mode
+            v_idxs,
+            row_offsets,
+            stat.alt_alleles,
+            stat.alt_offsets,
+            ref_global,
+            ref_off_global,
+            False,  # want_ref_bytes (windows mode emits tokens, not raw bytes)
+            False,  # want_flank
+            ref_mode,
+            alt_mode,
+            L,
+            haps.token_lut,
+            v_contigs,
+            stat.v_starts,
+            stat.ilens,
+            stat.ref,        # reference genome buffer
+            stat.ref_offsets,  # contig offsets
+            haps.reference.pad_char,
+        )
+        wshape = (b, eff_ploidy, None, None)
+        wfields = {k: v for k, v in fields.items() if k not in ("alt", "ref")}
+        win = _FlatVariantWindows(wfields)
+        for name, (data, seq_off) in bufs.items():
+            fw = _FlatWindow(data, np.asarray(seq_off, np.int64), row_offsets, wshape)
+            setattr(win, name, fw)
+        if haps.dummy_variant is not None:
+            win = win.fill_empty_groups(
+                haps.dummy_variant, unk=haps.unknown_token, flank_length=L
+            )
+        return win
+```
+
+- [ ] **Step 3: Build the plain-variants alt/ref + flank result from the dict**
+
+Replace the inline alt/ref allele gather and the flank ride-along so the plain-variants path also goes through the kernel. Where the code currently does `fields["alt"] = _FlatAlleles(...)` and `fields["ref"] = _FlatAlleles(...)`, and the later `if haps.flank_length and ...: compute_flank_tokens(...)` block, replace with a single call after the scalar fields are assembled:
+
+```python
+    want_flank = bool(
+        haps.flank_length and haps.token_lut is not None and regions is not None
+    )
+    L = haps.flank_length or 0
+    bufs = get("assemble_variant_buffers")(
+        0,  # variants mode
+        v_idxs,
+        row_offsets,
+        stat.alt_alleles,
+        stat.alt_offsets,
+        ref_global,
+        ref_off_global,
+        ref_present,  # want_ref_bytes
+        want_flank,
+        0,  # ref_mode (unused in variants mode)
+        0,  # alt_mode (unused)
+        L,
+        haps.token_lut,
+        v_contigs,
+        stat.v_starts,
+        stat.ilens,
+        stat.ref if stat.ref is not None else np.zeros(0, np.uint8),
+        stat.ref_offsets if stat.ref_offsets is not None else np.zeros(1, np.int64),
+        haps.reference.pad_char if haps.reference is not None else 0,
+    )
+    alt_data, alt_seq_off = bufs["alt"]
+    fields["alt"] = _FlatAlleles(
+        np.asarray(alt_data, np.uint8), np.asarray(alt_seq_off, np.int64), row_offsets, shape
+    )
+    if "ref" in bufs:
+        ref_data, ref_seq_off = bufs["ref"]
+        fields["ref"] = _FlatAlleles(
+            np.asarray(ref_data, np.uint8), np.asarray(ref_seq_off, np.int64), row_offsets, shape
+        )
+    flat = _FlatVariants(fields)
+    if "flank_tokens" in bufs:
+        from .._flat import _Flat
+
+        tok, off = bufs["flank_tokens"]
+        flat.flank_tokens = _Flat.from_offsets(
+            tok, (b, eff_ploidy, None, 2 * L), np.asarray(off, np.int64)
+        )
+
+    if haps.dummy_variant is not None:
+        flat = flat.fill_empty_groups(haps.dummy_variant, unk=haps.unknown_token)
+
+    return flat
+```
+
+> IMPORTANT ordering: the `fields` dict insertion order determines downstream wrapping; today `alt` is inserted before `start`/`ref`/etc. Preserve the existing field order — build `fields["alt"]` placeholder position by keeping the scalar block as-is and only swapping the alt/ref *values* to come from `bufs`. If the original code inserted `alt` first, keep `alt` first (move the `bufs["alt"]` assignment up to where `fields["alt"]` was originally set, not appended at the end). Verify with `RaggedVariants` field order in a parity run (Task 8).
+
+- [ ] **Step 4: Remove the now-dead inline assembly**
+
+Delete the now-unreachable inline `compute_windows`/`compute_ref_window`/`compute_alt_window`/`tokenize_alleles`/`compute_flank_tokens` call sites in `get_variants_flat` (the helper *functions* stay in `_flat_flanks.py` as the oracle). Confirm no other caller depends on them on the hot path: `rtk grep "compute_windows\|compute_ref_window\|compute_alt_window\|compute_flank_tokens\|tokenize_alleles" python/genvarloader/_dataset/_flat_variants.py` should now only show imports used by the oracle, not the hot path.
+
+- [ ] **Step 5: Build + smoke-run one windows query**
+
+Run:
+```bash
+pixi run -e dev maturin develop --release 2>&1 | rtk err
+pixi run -e dev pytest tests/parity/test_variants_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+```
+Expected: existing variants dataset parity PASSES on the default (rust) backend.
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_flat_variants.py
+rtk git commit -m "perf(variants): route windows/variants assembly through one rust call
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 8: Parity fixtures + dataset backstop spy + both-backend gate
+
+**Files:**
+- Create: `tests/parity/test_assemble_variant_buffers_parity.py`
+- Modify: `tests/parity/test_dataset_parity.py` (add a kernel-spy that proves the call runs on the live windows/variants `__getitem__` path)
+
+**Interfaces:**
+- Consumes: `assert_kernel_parity_dict` (Task 6), the registered `assemble_variant_buffers` kernel.
+
+- [ ] **Step 1: Write the kernel-level mode-matrix parity test**
+
+Create `tests/parity/test_assemble_variant_buffers_parity.py`:
+
+```python
+"""Parity: the new assemble_variant_buffers mega-call (rust) must be
+byte-identical to the composed numba oracle for variants + variant-windows,
+across the ref/alt mode matrix, the flank ride-along, and empty selections."""
+
+import numpy as np
+import pytest
+
+import genvarloader._dataset._flat_variants  # noqa: F401  (triggers register())
+from tests.parity._harness import assert_kernel_parity_dict
+
+pytestmark = pytest.mark.parity
+
+
+def _reference():
+    # single contig of 40 bytes, ASCII A/C/G/T cycling.
+    bases = np.frombuffer(b"ACGT", np.uint8)
+    ref = np.tile(bases, 10).astype(np.uint8)
+    ref_offsets = np.array([0, ref.size], np.int64)
+    return ref, ref_offsets
+
+
+def _lut(dtype):
+    # A->0 C->1 G->2 T->3, everything else (incl. N) -> 4 (unknown).
+    lut = np.full(256, 4, dtype)
+    for i, b in enumerate(b"ACGT"):
+        lut[b] = i
+    return lut
+
+
+def _globals():
+    # 3 global variants: alt "A","CG","T"; ref "C","G","AA".
+    alt = np.frombuffer(b"ACGT", np.uint8)  # placeholder; rebuild explicitly below
+    alt_bytes = np.frombuffer(b"ACGT", np.uint8)
+    # alt alleles: v0="A", v1="CG", v2="T"
+    alt_data = np.frombuffer(b"ACGT", np.uint8)
+    alt_data = np.frombuffer(b"A" b"CG" b"T", np.uint8)
+    alt_off = np.array([0, 1, 3, 4], np.int64)
+    ref_data = np.frombuffer(b"C" b"G" b"AA", np.uint8)
+    ref_off = np.array([0, 1, 2, 4], np.int64)
+    v_starts = np.array([5, 12, 20], np.int32)
+    ilens = np.array([0, -1, 1], np.int32)  # SNP, 1bp del, 1bp ins
+    return alt_data, alt_off, ref_data, ref_off, v_starts, ilens
+
+
+@pytest.mark.parametrize("tok_dtype", [np.uint8, np.int32])
+@pytest.mark.parametrize("ref_mode,alt_mode", [(1, 1), (1, 2), (2, 1), (2, 2)])
+def test_windows_mode_matrix(tok_dtype, ref_mode, alt_mode):
+    ref, ref_offsets = _reference()
+    alt_data, alt_off, ref_data, ref_off, v_starts, ilens = _globals()
+    lut = _lut(tok_dtype)
+    # one row selecting all 3 variants
+    v_idxs = np.array([0, 1, 2], np.int32)
+    row_offsets = np.array([0, 3], np.int64)
+    v_contigs = np.zeros(3, np.int32)
+    assert_kernel_parity_dict(
+        "assemble_variant_buffers",
+        1,  # windows
+        v_idxs, row_offsets, alt_data, alt_off, ref_data, ref_off,
+        False, False, ref_mode, alt_mode, 2, lut, v_contigs, v_starts, ilens,
+        ref, ref_offsets, ord("N"),
+    )
+
+
+@pytest.mark.parametrize("tok_dtype", [np.uint8, np.int32])
+@pytest.mark.parametrize("want_ref,want_flank", [(False, False), (True, False), (False, True), (True, True)])
+def test_variants_mode_matrix(tok_dtype, want_ref, want_flank):
+    ref, ref_offsets = _reference()
+    alt_data, alt_off, ref_data, ref_off, v_starts, ilens = _globals()
+    lut = _lut(tok_dtype) if want_flank else None
+    v_idxs = np.array([2, 0, 1], np.int32)
+    row_offsets = np.array([0, 1, 3], np.int64)  # 2 rows
+    v_contigs = np.zeros(3, np.int32)
+    assert_kernel_parity_dict(
+        "assemble_variant_buffers",
+        0,  # variants
+        v_idxs, row_offsets, alt_data, alt_off, ref_data, ref_off,
+        want_ref, want_flank, 0, 0, 2, lut, v_contigs, v_starts, ilens,
+        ref, ref_offsets, ord("N"),
+    )
+
+
+@pytest.mark.parametrize("mode,ref_mode,alt_mode", [(0, 0, 0), (1, 1, 1)])
+def test_empty_selection(mode, ref_mode, alt_mode):
+    """A row that selects zero variants must round-trip identically."""
+    ref, ref_offsets = _reference()
+    alt_data, alt_off, ref_data, ref_off, v_starts, ilens = _globals()
+    lut = _lut(np.uint8)
+    v_idxs = np.array([], np.int32)
+    row_offsets = np.array([0, 0], np.int64)  # 1 empty row
+    v_contigs = np.array([], np.int32)
+    assert_kernel_parity_dict(
+        "assemble_variant_buffers",
+        mode,
+        v_idxs, row_offsets, alt_data, alt_off, ref_data, ref_off,
+        False, (mode == 0), ref_mode, alt_mode, 2, lut, v_contigs, v_starts, ilens,
+        ref, ref_offsets, ord("N"),
+    )
+```
+
+> Clean up the placeholder lines in `_globals` (the first two `alt`/`alt_bytes`/`alt_data` reassignments are scratch — keep only the final explicit `alt_data = np.frombuffer(b"A" b"CG" b"T", np.uint8)`). Verify the test file has no unused locals via `ruff check`.
+
+- [ ] **Step 2: Run the kernel parity on both backends**
+
+Run:
+```bash
+pixi run -e dev pytest tests/parity/test_assemble_variant_buffers_parity.py -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity/test_assemble_variant_buffers_parity.py -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+```
+Expected: all PASS on both backends. (The dict harness compares numba vs rust internally regardless of `GVL_BACKEND`, but running both confirms registration import paths are env-independent.)
+
+- [ ] **Step 3: Add a live-path kernel spy to the dataset backstop**
+
+In `tests/parity/test_dataset_parity.py`, add a test that monkeypatches the registry's rust entry for `assemble_variant_buffers` with a counting wrapper, opens a small variant-windows dataset, indexes one batch, and asserts the wrapper was called (proves the kernel runs on the live `__getitem__`, guarding against a vacuous parity pass). Mirror the existing spy pattern in that file. Skeleton:
+
+```python
+def test_assemble_variant_buffers_runs_on_live_windows_path(tmp_path):
+    """The rust mega-call must actually fire on the windows __getitem__ path."""
+    from genvarloader import _dispatch
+
+    entry = _dispatch._REGISTRY["assemble_variant_buffers"]
+    calls = {"n": 0}
+    real = entry["rust"]
+
+    def spy(*args, **kwargs):
+        calls["n"] += 1
+        return real(*args, **kwargs)
+
+    entry["rust"] = spy
+    try:
+        ds = _open_variant_windows_dataset(tmp_path)  # reuse this file's helper
+        _ = ds[0, 0]
+    finally:
+        entry["rust"] = real
+    assert calls["n"] > 0, "assemble_variant_buffers never ran on the live path"
+```
+
+> Use the existing dataset-construction helper in `test_dataset_parity.py` (grep for how the file builds a windows/variants dataset: `rtk grep "variant.windows\|VarWindowOpt\|with_seqs" tests/parity/test_dataset_parity.py`). If no windows helper exists, build a minimal one with `gvl.write` + `Dataset.open(...).with_seqs("variant-windows", VarWindowOpt(...))`, matching the corpus the other dataset-parity tests use.
+
+- [ ] **Step 4: Run the dataset backstop + the variants/windows dataset parity, both backends**
+
+Run:
+```bash
+pixi run -e dev pytest tests/parity/test_dataset_parity.py tests/parity/test_variants_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity/test_dataset_parity.py tests/parity/test_variants_dataset_parity.py -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+```
+Expected: all PASS on both backends.
+
+- [ ] **Step 5: Full tree, both backends, + lint/format/typecheck**
+
+Run:
+```bash
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err
+pixi run -e dev cargo-test 2>&1 | rtk err
+pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format python/ tests/ && pixi run -e dev typecheck
+```
+Expected: full tree PASSES on both backends (except the pre-existing `test_e2e_variants` xfail, which must xfail identically — confirm it is xfail, not fail). Rust tests pass; lint/format/typecheck clean.
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add tests/parity/test_assemble_variant_buffers_parity.py tests/parity/test_dataset_parity.py
+rtk git commit -m "test(parity): assemble_variant_buffers mode matrix + live-path spy
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 9: Perf re-measure + roadmap update
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (round-2 target 7 entry + re-measurement block + Phase-5 marker/PR link)
+
+**Interfaces:** none (documentation + measurement).
+
+- [ ] **Step 1: Confirm the pre-existing xfail is unchanged at this branch**
+
+Run: `pixi run -e dev pytest tests/benchmarks/test_e2e.py::test_e2e_variants -q --basetemp=$(pwd)/.pytest_tmp 2>&1 | rtk err`
+Expected: `xfailed` (NOT failed, NOT passed). Record that it matches base behavior.
+
+- [ ] **Step 2: Re-measure variant-windows and variants (rust vs numba, min of pedantic)**
+
+Run (build release first if not already):
+```bash
+pixi run -e dev maturin develop --release 2>&1 | rtk err
+pixi run -e dev pytest tests/benchmarks/test_e2e.py -k "variant" --benchmark-only -q --basetemp=$(pwd)/.pytest_tmp
+```
+Also capture the `perf` flat self-time to confirm the GC/eval share dropped:
+```bash
+NUMBA_NUM_THREADS=1 perf record -F 999 -o p.data -- .pixi/envs/dev/bin/python \
+    tests/benchmarks/profiling/profile.py --mode variant-windows --n-batches 12000
+perf report --stdio --no-children -i p.data | head -40
+```
+Expected: GC (`gc_collect_main`/`deduce_unreachable`/`visit_reachable`/`dict_traverse`) self-time share is materially lower than the ~14% baseline; record the new variant-windows and variants min-ms ratios.
+
+- [ ] **Step 3: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`, change target 7's marker from ⬜ to ✅ (or 🚧 with the PR link if not yet merged), append the re-measured variant-windows/variants ratios to the round-2 re-measurement block, and set the PR link. Keep the wording consistent with how targets 1–4 record their results (status marker + branch/PR + before→after numbers).
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): target 7 done — variant-windows rust assembly, re-measured
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+- [ ] **Step 5: Final push gate (per CLAUDE.md)**
+
+Confirm the full tree is green on both backends (Task 8 Step 5) and the branch is ready for PR. Open the PR against `zero-copy-scale-safe-readpath` (the base branch), not `master`.
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Scope = all variants + windows → Tasks 3 (variants mode) + 4 (windows mode), routed in Task 7. ✓
+- Rust owns the fetch → Task 2 `fetch_windows` reusing `reference::get_reference`. ✓
+- One mega-call → single FFI entry per token dtype (Task 5), one dispatch key (Task 6). ✓
+- Front edge = assembly tail only → front-end + scalar gather untouched in Task 7; #231 dtype-polymorphic fields never routed through the typed call. ✓
+- fill_empty stays separate → Task 7 keeps `fill_empty_groups` post-pass. ✓
+- Parity via registry with numba oracle → Task 6 oracle + Task 8 mode-matrix + live-path spy. ✓
+- Perf gate + roadmap → Task 9. ✓
+- Pre-existing xfail handling → Task 9 Step 1 + Task 8 Step 5 note. ✓
+- Scale-guard not regressed → globals sourced from `ffi_static` (sub-linear), no new `ascontiguousarray` on sample-scale memmaps. ✓
+
+**Placeholder scan:** Two intentional verification-and-adjust notes remain (Task 6 Step 1 import-path confirmation; Task 7 Step 3 field-order preservation; Task 8 Step 3 dataset-helper reuse). These are explicit "grep-then-confirm" instructions with the exact command and fallback, not vague TODOs — acceptable because the exact existing symbol/helper must be confirmed against the live tree rather than guessed.
+
+**Type consistency:** `VariantBufs<Tok>` (Task 3) is consumed unchanged in Tasks 4–5. Field names (`alt`, `ref`, `ref_window`, `alt_window`, `flank_tokens`) are identical across the Rust orchestrators (Tasks 3–4), the numba oracle (Task 6), the Python wrapping (Task 7), and the parity test (Task 8). The mega-call argument order is identical across the Rust pyfunctions (Task 5), the rust shim + numba oracle (Task 6), and both call sites (Task 7) and the parity tests (Task 8).
+
+---
+
+## Risks & watch-points (for the implementer)
+
+- **Field insertion order** (`_FlatVariants.fields`) feeds `RaggedVariants` construction order downstream. Task 7 Step 3 must preserve today's order (`alt` first where it was first); the dataset parity in Task 8 Step 4 is the gate that catches a reordering.
+- **`reference is None`** path: variants mode with no reference + no flank must still emit `alt` (and `ref`) bytes. Task 7 passes zero-length reference placeholders in that case; the empty-selection parity (Task 8 `test_empty_selection`) and the no-reference dataset parity cover it.
+- **Token dtype selection**: `_assemble_variant_buffers_rust` picks i32 only when `lut.dtype == int32`; otherwise u8. When `lut is None` (plain variants, no flank), u8 entry with `lut=None` — the orchestrator never touches the LUT on that path.
+- **`unphased_union`**: `row_offsets` is already folded to `eff_ploidy=1` before the kernel call (front-end, unchanged). `v_contigs` is built with `eff_ploidy`, so it stays consistent. Add an `unphased_union=True` windows fixture to the dataset parity if the existing corpus lacks one.
diff --git a/docs/superpowers/plans/2026-06-25-zero-copy-scale-safe-readpath.md b/docs/superpowers/plans/2026-06-25-zero-copy-scale-safe-readpath.md
new file mode 100644
index 00000000..40f2eb87
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-25-zero-copy-scale-safe-readpath.md
@@ -0,0 +1,1588 @@
+# Zero-copy, scale-safe Rust read path (gvl format 2.0) Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Eliminate per-batch materialization of per-sample-scale memmaps at the Python→Rust boundary, cache only the truly-static sub-linear arrays, and skip provably-unnecessary zero-init — all byte-identical to current output — gated behind a `format_version` 1.0.0 → 2.0.0 bump with an explicit `gvl.migrate`.
+
+**Architecture:** One breaking on-disk change converts track-interval storage from array-of-structs (`INTERVAL_DTYPE`, itemsize 12, strided field views) to struct-of-arrays (three contiguous files `starts.npy`/`ends.npy`/`values.npy` sharing the existing `offsets.npy`). Contiguous memmaps then cross the FFI boundary zero-copy, replacing the `np.ascontiguousarray(...)` calls that copied the whole per-sample-scale interval store every batch. A loud boundary guard (`_ffi_array`) replaces silent materialization; sub-linear per-variant arrays are cached once per reconstructor; and fully-overwritten Rust output buffers drop their zero-init.
+
+**Tech Stack:** Python 3.10+, NumPy, Polars, Rust (PyO3/ndarray/bigtools/coitrees), Maturin, pytest + cargo test, pixi.
+
+## Global Constraints
+
+- **Byte-identical parity is the landing gate.** Every change is layout/marshalling only; output bytes are unchanged. Verified across `GVL_BACKEND=rust` and `GVL_BACKEND=numba` via `tests/parity` plus the dataset/unit/integration suites.
+- **Public API delta is exactly:** add `migrate` to `python/genvarloader/__init__.py` `__all__`; bump `DATASET_FORMAT_VERSION` to `2.0.0`. No other public signature changes. Per `CLAUDE.md`, this requires a `skills/genvarloader/SKILL.md` update (Task 7).
+- **No new perf gate.** Throughput is recorded in the roadmap, not gated. The one hard new gate is the **scale-guard** test (Task 4): no memmap-materializing copy on the read path.
+- **Commands run under pixi:** `pixi run -e dev <task>`. After any Rust change, rebuild the extension with `pixi run -e dev maturin develop --release` before running Python tests. Dataset/parity tests need `--basetemp=$(pwd)/.pytest_tmp` (Carter `os.link` Errno 18). Prefix shell commands with `rtk`.
+- **Lint/format/typecheck scope:** `pixi run -e dev ruff check python/ tests/`, `pixi run -e dev ruff format python/ tests/`, `pixi run -e dev typecheck`. Rust: `pixi run -e dev cargo clippy`, `cargo test`.
+- **Merge style:** merge commit, never squash. Work on branch `zero-copy-scale-safe-readpath` (off `rust-migration`, after #245/#246 closed out `phase-3-reconstruction`).
+- **No committed `.gvl` fixtures exist** (verified: `git ls-files` shows only build scripts under `tests/benchmarks/data/`, no on-disk datasets). All test datasets are generated through `gvl.write`, so after Task 1 every freshly-built dataset is born 2.0.0/SoA — the version gate (Task 2) cannot break the committed suite. The migration test (Task 3) synthesizes its own 1.x AoS dataset.
+
+---
+
+## File-Touch Map
+
+| File | Change | Task |
+|---|---|---|
+| `python/genvarloader/_dataset/_write.py` | `DATASET_FORMAT_VERSION` → 2.0.0; SoA writers (`_write_ragged_intervals`, `_write_track_legacy` chunked); `_check_dataset_format_version` helper | 1, 2 |
+| `python/genvarloader/_dataset/_tracks.py` | `_open_intervals` memmaps three contiguous arrays; drop `INTERVAL_DTYPE` import | 1 |
+| `src/bigwig.rs` | `write_track` emits SoA; update oracle byte test | 1 |
+| `src/tables.rs` | `write_track_impl` emits SoA; update oracle byte test | 1 |
+| `python/genvarloader/_dataset/_open.py` | call `_check_dataset_format_version` in `_load_metadata` | 2 |
+| `python/genvarloader/_dataset/_migrate.py` (new) | `migrate()` streaming in-place AoS→SoA | 3 |
+| `python/genvarloader/__init__.py` | export `migrate` in `__all__` | 3 |
+| `python/genvarloader/_dataset/_utils.py` | `_ffi_array(arr, dtype, name)` boundary helper | 4 |
+| `python/genvarloader/_dataset/_reconstruct.py` | drop `ascontiguousarray` on sample-scale args; apply `_ffi_array` | 4 |
+| `python/genvarloader/_dataset/_haps.py` | same for fused haps/annotated/splice calls; cache sub-linear arrays (Task 5) | 4, 5 |
+| `src/ffi/mod.rs` | uninitialized output allocation in the fused kernels | 6 |
+| `tests/integration/conftest.py` (new) | `track_dataset_path` fixture | 1 |
+| `tests/integration/test_format_2_soa.py` (new) | SoA round-trip | 1 |
+| `tests/integration/test_format_version_gate.py` (new) | version gate | 2 |
+| `tests/integration/test_migrate.py` (new) | migration round-trip / idempotency / interruption | 3 |
+| `tests/integration/test_scale_guard.py` (new) | no-memmap-copy gate | 4 |
+| `tests/unit/dataset/test_ffi_array.py` (new) | `_ffi_array` guard | 4 |
+| `tests/unit/dataset/test_haps_ffi_cache.py` (new) | sub-linear cache | 5 |
+| `skills/genvarloader/SKILL.md` | document `migrate` + format 2.0 open behavior | 7 |
+| `docs/roadmaps/rust-migration.md` | mark targets addressed; record throughput | 7 |
+
+---
+
+## Background facts the implementer needs
+
+- **`.npy` files here are headerless raw little-endian bytes.** The writers stream raw `to_le_bytes()` / `np.memmap`; the reader memmaps with an explicit `dtype`. There is no numpy `.npy` magic header. SoA = three raw files of the same length (number of intervals), all 4 bytes per element (`int32`, `int32`, `float32`), sharing one `int64` `offsets.npy`.
+- **`INTERVAL_DTYPE`** (`python/genvarloader/_ragged.py:26`) `= np.dtype([("start", i4), ("end", i4), ("value", f4)], align=True)`, itemsize 12. After Task 1 it is no longer on the read or born-write path; it survives only for the migration reader (Task 3) and any in-memory record construction. (A second, unused copy exists at `python/genvarloader/_types.py:18`; it is not imported anywhere — leave it untouched, out of scope.)
+- **Four interval writers feed the same on-disk layout:** `_write_ragged_intervals` (Python, annotation/table single-chunk), `_write_track_legacy` (Python, chunked sample tracks), `bigwig.rs::write_track` (Rust, BigWig tracks via `_write_track_rust`), `tables.rs::write_track_impl` (Rust, table tracks via `_write_track_table`). **All four** must emit SoA in Task 1, or datasets written by the path you missed will be unreadable by the new reader.
+- **`_as_starts_stops`** (`_genotypes.py:119`) builds a fresh contiguous `(2, n)` array via `np.stack`; its output `.base` is not a memmap, so it never trips the scale-guard. Leave it and the `_geno_offsets_2d` precompute (`_reconstruct.py:198`) unchanged.
+
+---
+
+## Task 1: AoS → SoA interval storage + `format_version` 2.0.0 (Component A)
+
+The single breaking change. Flips all four writers and the one reader together (a partial flip is not independently green) and bumps the format version. Atomic deliverable: a freshly-written dataset stores SoA and reads back byte-identically.
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_write.py` (`DATASET_FORMAT_VERSION` `:44`; `_write_ragged_intervals` `:1085-1108`; `_write_track_legacy` chunked block `:1322-1334`)
+- Modify: `python/genvarloader/_dataset/_tracks.py` (`_open_intervals` `:706-725`; `INTERVAL_DTYPE` import `:18`)
+- Modify: `src/bigwig.rs` (`write_track` `:26-126`; oracle test `:319-335`)
+- Modify: `src/tables.rs` (`write_track_impl` `:161-224`; oracle test `:453-467`)
+- Create: `tests/integration/conftest.py`
+- Create: `tests/integration/test_format_2_soa.py`
+
+**Interfaces:**
+- Produces (on-disk, per track dir under `intervals/<track>/` and `annot_intervals/<track>/`):
+  - `starts.npy` — raw `int32`, contiguous, length = total intervals
+  - `ends.npy` — raw `int32`, contiguous
+  - `values.npy` — raw `float32`, contiguous
+  - `offsets.npy` — raw `int64`, **unchanged** (length n+1)
+- Produces: `DATASET_FORMAT_VERSION == SemanticVersion.parse("2.0.0")`
+- Produces (test): `track_dataset_path` fixture → `Path` to a freshly-written 2.0 dataset with a phased VCF + one BigWig `"cov"` track.
+- Consumes: existing `RaggedIntervals` (`_ragged.py:31`) and `Ragged.from_offsets`.
+
+- [ ] **Step 1: Write the failing round-trip test + fixture**
+
+Create `tests/integration/conftest.py`:
+
+```python
+"""Shared fixtures for tests/integration/."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pyBigWig
+import pytest
+
+import genvarloader as gvl
+
+
+@pytest.fixture
+def track_dataset_path(source_bed, vcf_dir, tmp_path) -> Path:
+    """A freshly-written 2.0 dataset (phased VCF + one BigWig 'cov' track),
+    yielded as a writable path so tests may downgrade/migrate it in place.
+
+    Mirrors tests/dataset/conftest.py::snap_dataset but yields a path (not an
+    opened Dataset) and is function-scoped so each test gets a mutable copy.
+    """
+    from genoray import VCF
+
+    samples = ["s0", "s1", "s2"]
+    contig_sizes = [("chr1", 2_000_000), ("chr2", 2_000_000)]
+    bw_paths: dict[str, str] = {}
+    for i, s in enumerate(samples):
+        p = tmp_path / f"{s}.bw"
+        with pyBigWig.open(str(p), "w") as bw:
+            bw.addHeader(contig_sizes, maxZooms=0)
+            v = float(i + 1)
+            bw.addEntries(
+                ["chr1", "chr1", "chr2", "chr2"],
+                [499_990, 1_010_686, 17_320, 1_234_560],
+                ends=[500_030, 1_010_706, 17_340, 1_234_580],
+                values=[v, v, v, v],
+            )
+        bw_paths[s] = str(p)
+    out = tmp_path / "ds.gvl"
+    gvl.write(
+        path=out,
+        bed=source_bed,
+        variants=VCF(vcf_dir / "filtered_source.vcf.gz"),
+        tracks=gvl.BigWigs("cov", bw_paths),
+        max_jitter=2,
+    )
+    return out
+```
+
+Create `tests/integration/test_format_2_soa.py`:
+
+```python
+"""Format 2.0 stores track intervals as struct-of-arrays (Task 1)."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._dataset._write import DATASET_FORMAT_VERSION
+
+
+def test_dataset_version_is_2(track_dataset_path):
+    assert str(DATASET_FORMAT_VERSION) == "2.0.0"
+    meta = json.loads((track_dataset_path / "metadata.json").read_text())
+    assert meta["format_version"] == "2.0.0"
+
+
+def test_soa_files_present_and_aos_absent(track_dataset_path):
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "starts.npy").exists()
+    assert (track_dir / "ends.npy").exists()
+    assert (track_dir / "values.npy").exists()
+    assert (track_dir / "offsets.npy").exists()
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_soa_files_contiguous_and_typed(track_dataset_path):
+    track_dir = track_dataset_path / "intervals" / "cov"
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="r")
+    ends = np.memmap(track_dir / "ends.npy", dtype=np.int32, mode="r")
+    values = np.memmap(track_dir / "values.npy", dtype=np.float32, mode="r")
+    assert starts.flags["C_CONTIGUOUS"]
+    assert ends.flags["C_CONTIGUOUS"]
+    assert values.flags["C_CONTIGUOUS"]
+    assert len(starts) == len(ends) == len(values)
+
+
+def test_reads_back(track_dataset_path, reference):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")
+    out = ds[0, 0]
+    assert out is not None
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/integration/test_format_2_soa.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `test_dataset_version_is_2` fails (`"1.0.0" != "2.0.0"`) and `test_soa_files_present_and_aos_absent` fails (`intervals.npy` still present, `starts.npy` absent).
+
+- [ ] **Step 3: Bump the format version**
+
+In `python/genvarloader/_dataset/_write.py:44` change:
+
+```python
+DATASET_FORMAT_VERSION = SemanticVersion.parse("1.0.0")
+```
+
+to:
+
+```python
+DATASET_FORMAT_VERSION = SemanticVersion.parse("2.0.0")
+```
+
+- [ ] **Step 4: Convert the Python single-chunk writer to SoA**
+
+In `python/genvarloader/_dataset/_write.py`, replace `_write_ragged_intervals` (`:1085-1108`) body. New version:
+
+```python
+def _write_ragged_intervals(out_dir: Path, itvs: "RaggedIntervals") -> None:
+    """Write a RaggedIntervals (values/starts/ends share offsets) to out_dir as
+    struct-of-arrays: starts/ends/values.npy + offsets.npy. Single-chunk writer
+    used for annotation tracks (format 2.0)."""
+    out_dir.mkdir(parents=True, exist_ok=True)
+    for name, data, dt in (
+        ("starts", itvs.starts.data, np.int32),
+        ("ends", itvs.ends.data, np.int32),
+        ("values", itvs.values.data, np.float32),
+    ):
+        out = np.memmap(out_dir / f"{name}.npy", dtype=dt, mode="w+", shape=data.shape)
+        out[:] = data
+        out.flush()
+
+    offsets = itvs.values.offsets
+    out = np.memmap(
+        out_dir / "offsets.npy",
+        dtype=offsets.dtype,
+        mode="w+",
+        shape=len(offsets),
+    )
+    out[:] = offsets
+    out.flush()
+```
+
+- [ ] **Step 5: Convert the Python chunked writer to SoA**
+
+In `python/genvarloader/_dataset/_write.py`, the chunked sample-track writer (`_write_track_legacy`) currently writes one AoS memmap at `:1322-1334`:
+
+```python
+        pbar.set_description(f"Writing intervals for {part.height} regions on {contig}")
+        out = np.memmap(
+            out_dir / "intervals.npy",
+            dtype=INTERVAL_DTYPE,
+            mode="w+" if interval_offset == 0 else "r+",
+            shape=intervals.values.data.shape,
+            offset=interval_offset,
+        )
+        out["start"] = intervals.starts.data
+        out["end"] = intervals.ends.data
+        out["value"] = intervals.values.data
+        out.flush()
+        interval_offset += out.nbytes
+```
+
+Replace with three SoA memmaps. `interval_offset` becomes an **element** counter (all three dtypes are 4 bytes, so each file's byte offset is `interval_offset * itemsize`):
+
+```python
+        pbar.set_description(f"Writing intervals for {part.height} regions on {contig}")
+        n = intervals.values.data.shape[0]
+        for name, data, dt in (
+            ("starts", intervals.starts.data, np.int32),
+            ("ends", intervals.ends.data, np.int32),
+            ("values", intervals.values.data, np.float32),
+        ):
+            out = np.memmap(
+                out_dir / f"{name}.npy",
+                dtype=dt,
+                mode="w+" if interval_offset == 0 else "r+",
+                shape=n,
+                offset=interval_offset * np.dtype(dt).itemsize,
+            )
+            out[:] = data
+            out.flush()
+        interval_offset += n
+```
+
+(`interval_offset` is initialized to `0` at `:1304`; it previously counted bytes, now counts elements — both start at 0 so the `mode="w+" if interval_offset == 0` guard is unchanged in meaning.) Leave the `INTERVAL_DTYPE` import at `:37` in place — Task 3's migration reader still needs it, and `_write.py` is not on the hot read path.
+
+- [ ] **Step 6: Convert the reader to SoA**
+
+In `python/genvarloader/_dataset/_tracks.py`, replace `_open_intervals` (`:706-725`):
+
+```python
+    @staticmethod
+    def _open_intervals(path: Path, n_regions: int, n_samples: int) -> RaggedIntervals:
+        if n_samples == 0:
+            shape = (n_regions, None)
+        else:
+            shape = (n_regions, n_samples, None)
+        starts_data = np.memmap(path / "starts.npy", dtype=np.int32, mode="r")
+        ends_data = np.memmap(path / "ends.npy", dtype=np.int32, mode="r")
+        values_data = np.memmap(path / "values.npy", dtype=np.float32, mode="r")
+        offsets = np.memmap(path / "offsets.npy", dtype=np.int64, mode="r")
+        starts = Ragged.from_offsets(starts_data, shape, offsets)
+        ends = Ragged.from_offsets(ends_data, shape, offsets)
+        values = Ragged.from_offsets(values_data, shape, offsets)
+        return RaggedIntervals(starts, ends, values)
+```
+
+Then drop `INTERVAL_DTYPE` from the import at `_tracks.py:18`:
+
+```python
+from .._ragged import FlatIntervals, RaggedIntervals, RaggedTracks
+```
+
+(was `from .._ragged import INTERVAL_DTYPE, FlatIntervals, RaggedIntervals, RaggedTracks`).
+
+- [ ] **Step 7: Convert the Rust BigWig writer to SoA**
+
+In `src/bigwig.rs::write_track`, replace the single `itv_writer` with three writers. At `:40`:
+
+```rust
+    let mut itv_writer = BufWriter::new(File::create(out_dir.join("intervals.npy"))?);
+```
+
+becomes:
+
+```rust
+    let mut starts_writer = BufWriter::new(File::create(out_dir.join("starts.npy"))?);
+    let mut ends_writer = BufWriter::new(File::create(out_dir.join("ends.npy"))?);
+    let mut values_writer = BufWriter::new(File::create(out_dir.join("values.npy"))?);
+```
+
+At the write loop (`:106-114`):
+
+```rust
+            for sample_vals in per_sample {
+                for v in sample_vals {
+                    itv_writer.write_all(&(v.start as i32).to_le_bytes())?;
+                    itv_writer.write_all(&(v.end as i32).to_le_bytes())?;
+                    itv_writer.write_all(&v.value.to_le_bytes())?;
+                    acc += 1;
+                }
+                offsets.push(acc);
+            }
+```
+
+becomes:
+
+```rust
+            for sample_vals in per_sample {
+                for v in sample_vals {
+                    starts_writer.write_all(&(v.start as i32).to_le_bytes())?;
+                    ends_writer.write_all(&(v.end as i32).to_le_bytes())?;
+                    values_writer.write_all(&v.value.to_le_bytes())?;
+                    acc += 1;
+                }
+                offsets.push(acc);
+            }
+```
+
+And the flush (`:118`):
+
+```rust
+    itv_writer.flush()?;
+```
+
+becomes:
+
+```rust
+    starts_writer.flush()?;
+    ends_writer.flush()?;
+    values_writer.flush()?;
+```
+
+- [ ] **Step 8: Update the Rust BigWig oracle byte test**
+
+In `src/bigwig.rs`, the oracle test currently builds one interleaved `expected` and reads `intervals.npy` (`:319-327`):
+
+```rust
+        // Expected intervals.npy bytes: [i32 start, i32 end, f32 value] per row.
+        let mut expected = Vec::new();
+        for i in 0..vals.len() {
+            expected.extend_from_slice(&(coords[[i, 0]] as i32).to_le_bytes());
+            expected.extend_from_slice(&(coords[[i, 1]] as i32).to_le_bytes());
+            expected.extend_from_slice(&vals[i].to_le_bytes());
+        }
+        let got = fs::read(tmp.join("intervals.npy")).unwrap();
+        assert_eq!(got, expected, "intervals.npy bytes mismatch");
+```
+
+Replace with three SoA expectations:
+
+```rust
+        // Expected SoA bytes: separate i32 starts, i32 ends, f32 values.
+        let mut exp_starts = Vec::new();
+        let mut exp_ends = Vec::new();
+        let mut exp_values = Vec::new();
+        for i in 0..vals.len() {
+            exp_starts.extend_from_slice(&(coords[[i, 0]] as i32).to_le_bytes());
+            exp_ends.extend_from_slice(&(coords[[i, 1]] as i32).to_le_bytes());
+            exp_values.extend_from_slice(&vals[i].to_le_bytes());
+        }
+        assert_eq!(fs::read(tmp.join("starts.npy")).unwrap(), exp_starts, "starts mismatch");
+        assert_eq!(fs::read(tmp.join("ends.npy")).unwrap(), exp_ends, "ends mismatch");
+        assert_eq!(fs::read(tmp.join("values.npy")).unwrap(), exp_values, "values mismatch");
+```
+
+(The `offsets.npy` assertion below it is unchanged.)
+
+- [ ] **Step 9: Convert the Rust table writer to SoA**
+
+In `src/tables.rs::write_track_impl`, at `:161`:
+
+```rust
+        let mut itv_w = BufWriter::new(File::create(out_dir.join("intervals.npy"))?);
+```
+
+becomes:
+
+```rust
+        let mut starts_w = BufWriter::new(File::create(out_dir.join("starts.npy"))?);
+        let mut ends_w = BufWriter::new(File::create(out_dir.join("ends.npy"))?);
+        let mut values_w = BufWriter::new(File::create(out_dir.join("values.npy"))?);
+```
+
+The row-write loop (`:211-215`):
+
+```rust
+            for (s, e, v) in &region_rows {
+                itv_w.write_all(&s.to_le_bytes())?;
+                itv_w.write_all(&e.to_le_bytes())?;
+                itv_w.write_all(&v.to_le_bytes())?;
+            }
+```
+
+becomes:
+
+```rust
+            for (s, e, v) in &region_rows {
+                starts_w.write_all(&s.to_le_bytes())?;
+                ends_w.write_all(&e.to_le_bytes())?;
+                values_w.write_all(&v.to_le_bytes())?;
+            }
+```
+
+The flush (`:222`):
+
+```rust
+        itv_w.flush()?;
+```
+
+becomes:
+
+```rust
+        starts_w.flush()?;
+        ends_w.flush()?;
+        values_w.flush()?;
+```
+
+- [ ] **Step 10: Update the Rust table oracle byte test**
+
+In `src/tables.rs`, the oracle test (`:453-466`) builds `exp_itv` interleaved and reads `intervals.npy`:
+
+```rust
+            for i in 0..vals.len() {
+                exp_itv.extend_from_slice(&coords[[i, 0]].to_le_bytes());
+                exp_itv.extend_from_slice(&coords[[i, 1]].to_le_bytes());
+                exp_itv.extend_from_slice(&vals[i].to_le_bytes());
+            }
+```
+
+Replace the `exp_itv` declaration and this loop with three vectors. Find the `let mut exp_itv = Vec::new();` declaration near the top of the test and replace it plus the loop and the final read/assert (`:464-467`):
+
+```rust
+        let mut exp_starts: Vec<u8> = Vec::new();
+        let mut exp_ends: Vec<u8> = Vec::new();
+        let mut exp_values: Vec<u8> = Vec::new();
+```
+
+loop body:
+
+```rust
+            for i in 0..vals.len() {
+                exp_starts.extend_from_slice(&coords[[i, 0]].to_le_bytes());
+                exp_ends.extend_from_slice(&coords[[i, 1]].to_le_bytes());
+                exp_values.extend_from_slice(&vals[i].to_le_bytes());
+            }
+```
+
+final assertions (replacing the `intervals.npy` read at `:464,466`):
+
+```rust
+        assert_eq!(std::fs::read(tmp.join("starts.npy")).unwrap(), exp_starts, "starts mismatch");
+        assert_eq!(std::fs::read(tmp.join("ends.npy")).unwrap(), exp_ends, "ends mismatch");
+        assert_eq!(std::fs::read(tmp.join("values.npy")).unwrap(), exp_values, "values mismatch");
+```
+
+(The `got_off`/`exp_off` offsets assertion is unchanged.)
+
+- [ ] **Step 11: Rebuild the extension and run cargo tests**
+
+Run: `pixi run -e dev maturin develop --release`
+Expected: builds clean.
+
+Run: `pixi run -e dev cargo test`
+Expected: PASS, including `bigwig::tests::write_track_matches_count_and_intervals_oracle` and `tables::tests::write_track_matches_oracle_bytes`.
+
+- [ ] **Step 12: Run the Task 1 round-trip test**
+
+Run: `pixi run -e dev pytest tests/integration/test_format_2_soa.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (4 tests).
+
+- [ ] **Step 13: Run the full parity + dataset suites on both backends**
+
+Run: `pixi run -e dev pytest tests/parity tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (byte-identical on the numba backend too).
+
+- [ ] **Step 14: Lint, format, typecheck, commit**
+
+Run: `pixi run -e dev ruff format python/ tests/ && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck && pixi run -e dev cargo clippy`
+Expected: clean.
+
+```bash
+rtk git add python/genvarloader/_dataset/_write.py python/genvarloader/_dataset/_tracks.py src/bigwig.rs src/tables.rs tests/integration/conftest.py tests/integration/test_format_2_soa.py
+rtk git commit -m "feat(format)!: store track intervals as struct-of-arrays (gvl 2.0)
+
+Convert AoS INTERVAL_DTYPE (itemsize 12, strided field views) to three
+contiguous files starts/ends/values.npy sharing offsets.npy, across all
+four writers (Python single-chunk + chunked, Rust bigwig + table) and the
+reader. Bump DATASET_FORMAT_VERSION to 2.0.0. Byte-identical output.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 2: Version gate on open (Component B)
+
+Reject a 1.x (or `None`) dataset at open with a clear `gvl.migrate` hint; reject a future-major dataset with an upgrade error.
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_write.py` (add `_check_dataset_format_version` near `DATASET_FORMAT_VERSION` `:44`)
+- Modify: `python/genvarloader/_dataset/_open.py` (`_load_metadata` `:103-107`)
+- Create: `tests/integration/test_format_version_gate.py`
+
+**Interfaces:**
+- Consumes: `Metadata` (`_write.py:65`, has `format_version: SemanticVersion | None`), `DATASET_FORMAT_VERSION` (now `2.0.0`).
+- Produces: `_check_dataset_format_version(meta: Metadata, path: Path) -> None` — raises `ValueError` on `format_version is None` or `major < 2` (migrate hint) and on `major > 2` (upgrade hint); returns `None` when `major == 2`.
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/integration/test_format_version_gate.py`:
+
+```python
+"""Open-time format_version gate (Task 2)."""
+
+from __future__ import annotations
+
+import json
+import shutil
+
+import pytest
+
+import genvarloader as gvl
+
+
+def _set_version(path, version):
+    meta_path = path / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = version
+    meta_path.write_text(json.dumps(raw))
+
+
+def test_old_major_raises_migrate_hint(track_dataset_path, reference):
+    _set_version(track_dataset_path, "1.0.0")
+    with pytest.raises(ValueError, match="migrate"):
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+
+
+def test_none_version_raises_migrate_hint(track_dataset_path, reference, tmp_path):
+    dst = tmp_path / "noneversion.gvl"
+    shutil.copytree(track_dataset_path, dst)
+    meta_path = dst / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = None
+    meta_path.write_text(json.dumps(raw))
+    with pytest.raises(ValueError, match="migrate"):
+        gvl.Dataset.open(dst, reference=reference)
+
+
+def test_future_major_raises_upgrade_hint(track_dataset_path, reference):
+    _set_version(track_dataset_path, "3.0.0")
+    with pytest.raises(ValueError, match="[Uu]pgrade"):
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+
+
+def test_current_major_opens(track_dataset_path, reference):
+    # written fresh at 2.0.0 by the fixture
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference)
+    assert ds is not None
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/integration/test_format_version_gate.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `test_old_major_raises_migrate_hint` and the others that expect a raise do not raise (no gate yet).
+
+- [ ] **Step 3: Add the gate helper**
+
+In `python/genvarloader/_dataset/_write.py`, immediately after the `DATASET_FORMAT_VERSION` definition (`:44-46`), add:
+
+```python
+def _check_dataset_format_version(meta: "Metadata", path: Path) -> None:
+    """Validate a dataset's on-disk format version against the supported major.
+
+    Pre-versioning datasets (``format_version is None``) and any older major are
+    treated as needing migration. A newer major means the reader is too old.
+    """
+    fv = meta.format_version
+    current = DATASET_FORMAT_VERSION
+    if fv is None or fv.major < current.major:
+        raise ValueError(
+            f"Dataset at {path} uses format version {fv} but this genvarloader "
+            f"expects {current}. Run `genvarloader.migrate({str(path)!r})` to "
+            f"upgrade it in place."
+        )
+    if fv.major > current.major:
+        raise ValueError(
+            f"Dataset at {path} was written by a newer genvarloader (format "
+            f"version {fv} > supported {current}). Upgrade genvarloader."
+        )
+```
+
+(`Metadata` is defined later in the file at `:65`; the forward reference in the annotation string is fine.)
+
+- [ ] **Step 4: Wire the gate into open**
+
+In `python/genvarloader/_dataset/_open.py`, update the import at `:27`:
+
+```python
+from ._write import Metadata, _check_dataset_format_version
+```
+
+and `_load_metadata` (`:103-107`):
+
+```python
+    def _load_metadata(self) -> Metadata:
+        with _py_open(self.path / "metadata.json") as f:
+            metadata = Metadata.model_validate_json(f.read())
+        _check_dataset_format_version(metadata, self.path)
+        validate_dataset(metadata, self.path)
+        return metadata
+```
+
+- [ ] **Step 5: Run the test to verify it passes**
+
+Run: `pixi run -e dev pytest tests/integration/test_format_version_gate.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (4 tests).
+
+- [ ] **Step 6: Confirm no regression in the open path**
+
+Run: `pixi run -e dev pytest tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (all fixtures are born 2.0.0, so the gate is a no-op for them).
+
+- [ ] **Step 7: Lint, format, typecheck, commit**
+
+Run: `pixi run -e dev ruff format python/ tests/ && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck`
+Expected: clean.
+
+```bash
+rtk git add python/genvarloader/_dataset/_write.py python/genvarloader/_dataset/_open.py tests/integration/test_format_version_gate.py
+rtk git commit -m "feat(open): gate dataset open on format_version major
+
+Reject pre-2.0 (or unversioned) datasets with a gvl.migrate hint and
+future-major datasets with an upgrade error.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 3: `gvl.migrate(path)` — streaming in-place AoS → SoA (Component C)
+
+In-place, streaming, idempotent, crash-safe rewrite of a 1.x AoS dataset to 2.0 SoA.
+
+**Files:**
+- Create: `python/genvarloader/_dataset/_migrate.py`
+- Modify: `python/genvarloader/__init__.py` (import + `__all__`)
+- Create: `tests/integration/test_migrate.py`
+
+**Interfaces:**
+- Consumes: `INTERVAL_DTYPE` (`_ragged.py:26`), `DATASET_FORMAT_VERSION` (`_write.py:44`), `SemanticVersion`.
+- Produces: `migrate(path: str | Path) -> None` — exported in `genvarloader.__all__`. Converts every `intervals/<track>/intervals.npy` and `annot_intervals/<track>/intervals.npy` to SoA, bumps `metadata.json` `format_version` to `2.0.0` (durable, after all SoA written), then deletes the AoS files. No-op (with leftover-AoS cleanup) on an already-2.0 dataset.
+- Produces (test helper, local to the test module): `_downgrade_to_aos(path)` — inverse for synthesizing a 1.x fixture from a fresh 2.0 dataset.
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/integration/test_migrate.py`:
+
+```python
+"""gvl.migrate: 1.x AoS -> 2.0 SoA round-trip, idempotency, crash-safety (Task 3)."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._ragged import INTERVAL_DTYPE
+
+
+def _track_dirs(path):
+    for base in ("intervals", "annot_intervals"):
+        d = path / base
+        if d.is_dir():
+            for child in sorted(d.iterdir()):
+                if child.is_dir():
+                    yield child
+
+
+def _downgrade_to_aos(path):
+    """Rewrite a fresh 2.0 SoA dataset back to a 1.x AoS dataset in place."""
+    for d in _track_dirs(path):
+        starts = np.memmap(d / "starts.npy", dtype=np.int32, mode="r")
+        ends = np.memmap(d / "ends.npy", dtype=np.int32, mode="r")
+        values = np.memmap(d / "values.npy", dtype=np.float32, mode="r")
+        rec = np.empty(len(starts), dtype=INTERVAL_DTYPE)
+        rec["start"] = starts
+        rec["end"] = ends
+        rec["value"] = values
+        out = np.memmap(d / "intervals.npy", dtype=INTERVAL_DTYPE, mode="w+", shape=rec.shape)
+        out[:] = rec
+        out.flush()
+        del starts, ends, values, out
+        (d / "starts.npy").unlink()
+        (d / "ends.npy").unlink()
+        (d / "values.npy").unlink()
+    meta_path = path / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = "1.0.0"
+    meta_path.write_text(json.dumps(raw))
+
+
+def test_round_trip_byte_identical(track_dataset_path, reference):
+    before = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")[0, 0]
+    before = np.asarray(before).copy()
+
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)
+
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "starts.npy").exists()
+    assert (track_dir / "ends.npy").exists()
+    assert (track_dir / "values.npy").exists()
+    assert not (track_dir / "intervals.npy").exists()
+    assert json.loads((track_dataset_path / "metadata.json").read_text())["format_version"] == "2.0.0"
+
+    after = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")[0, 0]
+    np.testing.assert_array_equal(np.asarray(after), before)
+
+
+def test_idempotent(track_dataset_path):
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)
+    gvl.migrate(track_dataset_path)  # second run is a no-op, must not raise
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_resumable_after_interrupt_before_metadata_bump(track_dataset_path):
+    """Crash after SoA written but before metadata bump: still 1.x, re-runnable."""
+    _downgrade_to_aos(track_dataset_path)
+    # Simulate partial migration: write SoA, leave AoS + 1.x metadata.
+    from genvarloader._dataset._migrate import _migrate_track
+
+    for d in _track_dirs(track_dataset_path):
+        _migrate_track(d)
+    meta = json.loads((track_dataset_path / "metadata.json").read_text())
+    assert meta["format_version"] == "1.0.0"  # not bumped yet
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "intervals.npy").exists()  # AoS still present
+
+    gvl.migrate(track_dataset_path)  # completes the migration
+    assert json.loads((track_dataset_path / "metadata.json").read_text())["format_version"] == "2.0.0"
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_cleans_leftover_aos_after_interrupt_before_delete(track_dataset_path):
+    """Crash after metadata bump but before AoS delete: re-run removes AoS."""
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)  # full migration -> SoA + 2.0 metadata
+    track_dir = track_dataset_path / "intervals" / "cov"
+    # Re-introduce a leftover AoS file (as if delete was interrupted).
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="r")
+    rec = np.zeros(len(starts), dtype=INTERVAL_DTYPE)
+    out = np.memmap(track_dir / "intervals.npy", dtype=INTERVAL_DTYPE, mode="w+", shape=rec.shape)
+    out[:] = rec
+    out.flush()
+    del starts, out
+
+    gvl.migrate(track_dataset_path)  # idempotent cleanup
+    assert not (track_dir / "intervals.npy").exists()
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/integration/test_migrate.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `ImportError`/`AttributeError`: `genvarloader` has no attribute `migrate`.
+
+- [ ] **Step 3: Implement the migration module**
+
+Create `python/genvarloader/_dataset/_migrate.py`:
+
+```python
+"""In-place, streaming, idempotent migration of a 1.x AoS dataset to 2.0 SoA.
+
+Per track under ``intervals/<track>/`` and ``annot_intervals/<track>/``:
+stream ``intervals.npy`` (INTERVAL_DTYPE) in record chunks into three contiguous
+``starts/ends/values.npy`` files. Only after every track's SoA is durable do we
+bump ``metadata.json`` (last durable write); then delete the AoS files.
+
+Crash-safety by ordering: an interruption before the metadata bump leaves the
+dataset still-1.x (old AoS intact, re-runnable); an interruption after the bump
+but before deletion leaves both layouts, and a re-run completes the cleanup.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Iterator
+from pathlib import Path
+
+import numpy as np
+from loguru import logger
+from pydantic_extra_types.semantic_version import SemanticVersion
+
+from .._ragged import INTERVAL_DTYPE
+from ._write import DATASET_FORMAT_VERSION
+
+_CHUNK = 1_000_000  # records per streamed block
+
+
+def _track_dirs(path: Path) -> Iterator[Path]:
+    for base in ("intervals", "annot_intervals"):
+        d = path / base
+        if d.is_dir():
+            for child in sorted(d.iterdir()):
+                if child.is_dir():
+                    yield child
+
+
+def _migrate_track(track_dir: Path) -> None:
+    """Stream one track's AoS intervals.npy into SoA starts/ends/values.npy.
+
+    No-op if intervals.npy is absent (already migrated or never AoS). Leaves the
+    AoS file in place; the caller deletes it only after metadata is bumped.
+    """
+    aos = track_dir / "intervals.npy"
+    if not aos.exists():
+        return
+    src = np.memmap(aos, dtype=INTERVAL_DTYPE, mode="r")
+    n = int(src.shape[0])
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="w+", shape=n)
+    ends = np.memmap(track_dir / "ends.npy", dtype=np.int32, mode="w+", shape=n)
+    values = np.memmap(track_dir / "values.npy", dtype=np.float32, mode="w+", shape=n)
+    for i in range(0, n, _CHUNK):
+        j = min(i + _CHUNK, n)
+        block = src[i:j]
+        starts[i:j] = block["start"]
+        ends[i:j] = block["end"]
+        values[i:j] = block["value"]
+    for m in (starts, ends, values):
+        m.flush()
+    logger.info(f"Migrated {n} intervals in {track_dir} to SoA.")
+    del src, starts, ends, values
+
+
+def migrate(path: str | Path) -> None:
+    """Migrate a GVL dataset's track intervals from format 1.x (array-of-structs)
+    to format 2.0 (struct-of-arrays), in place.
+
+    Streaming and crash-safe: peak extra disk is one track's interval store.
+    Genotypes, regions, and reference are untouched. Idempotent — a no-op (with
+    leftover-AoS cleanup) on a dataset that is already 2.0.
+
+    Parameters
+    ----------
+    path
+        Path to the GVL dataset directory.
+    """
+    path = Path(path)
+    meta_path = path / "metadata.json"
+    if not meta_path.exists():
+        raise FileNotFoundError(f"No metadata.json at {meta_path}")
+    raw = json.loads(meta_path.read_text())
+    fv = raw.get("format_version")
+    already_v2 = (
+        fv is not None
+        and SemanticVersion.parse(fv).major >= DATASET_FORMAT_VERSION.major
+    )
+    track_dirs = list(_track_dirs(path))
+
+    if already_v2:
+        # Idempotent cleanup: remove leftover AoS from an interrupted delete.
+        for d in track_dirs:
+            aos = d / "intervals.npy"
+            if aos.exists() and (d / "starts.npy").exists():
+                aos.unlink()
+        return
+
+    # 1. Convert every track to SoA (AoS left in place).
+    for d in track_dirs:
+        _migrate_track(d)
+
+    # 2. Durably bump metadata LAST (atomic replace).
+    raw["format_version"] = str(DATASET_FORMAT_VERSION)
+    tmp = meta_path.with_suffix(".json.tmp")
+    tmp.write_text(json.dumps(raw))
+    with open(tmp, "rb") as f:
+        os.fsync(f.fileno())
+    os.replace(tmp, meta_path)
+
+    # 3. Delete AoS files.
+    for d in track_dirs:
+        aos = d / "intervals.npy"
+        if aos.exists():
+            aos.unlink()
+    logger.info(f"Migrated dataset {path} to format {DATASET_FORMAT_VERSION}.")
+```
+
+- [ ] **Step 4: Export `migrate`**
+
+In `python/genvarloader/__init__.py`, add the import (after the `_svar_link` import at `:29`):
+
+```python
+from ._dataset._migrate import migrate
+```
+
+and insert `"migrate"` into `__all__` (alphabetically, between `"get_splice_bed"` and `"migrate_svar_link"`):
+
+```python
+    "get_splice_bed",
+    "migrate",
+    "migrate_svar_link",
+```
+
+- [ ] **Step 5: Run the test to verify it passes**
+
+Run: `pixi run -e dev pytest tests/integration/test_migrate.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (4 tests).
+
+- [ ] **Step 6: Lint, format, typecheck, commit**
+
+Run: `pixi run -e dev ruff format python/ tests/ && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck`
+Expected: clean.
+
+```bash
+rtk git add python/genvarloader/_dataset/_migrate.py python/genvarloader/__init__.py tests/integration/test_migrate.py
+rtk git commit -m "feat(migrate): add gvl.migrate for 1.x AoS -> 2.0 SoA
+
+Streaming, idempotent, crash-safe in-place rewrite of track intervals.
+Metadata is bumped only after all SoA files are durable, then AoS deleted.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 4: Zero-copy FFI contract + loud boundary guard (Component D)
+
+Drop `np.ascontiguousarray(...)` on per-sample-scale memmapped args (now contiguous after Task 1, or already contiguous for genotypes), replacing it with `_ffi_array` — which crosses zero-copy or raises a precise error. The scale-guard test locks the defect closed.
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_utils.py` (add `_ffi_array`)
+- Modify: `python/genvarloader/_dataset/_reconstruct.py` (`:232-250` track-fused args)
+- Modify: `python/genvarloader/_dataset/_haps.py` (`:796`, `:869`, `:958` — `geno_v_idxs` in the three fused calls)
+- Create: `tests/unit/dataset/test_ffi_array.py`
+- Create: `tests/integration/test_scale_guard.py`
+
+**Interfaces:**
+- Produces: `_ffi_array(arr: np.ndarray, dtype, name: str) -> np.ndarray` in `_dataset/_utils.py` — returns `arr` unchanged if C-contiguous and exact dtype; else raises `ValueError` naming `name`.
+- Consumes: SoA interval memmaps (Task 1), `self.haps.genotypes.data` / `self.genotypes.data` (already contiguous `int32` memmaps).
+- **Scope:** the guard applies ONLY to per-sample-scale memmap args. Batch-bounded freshly-constructed arrays (`req.regions`, `req.shifts`, `req.geno_offset_idx`, `req.keep`, `req.keep_offsets`, the `_reconstruct.py` `o_idx`/`out_ofsts_per_t`/etc.) keep `np.ascontiguousarray` (cheap). The sub-linear per-variant args (`v_starts`, `ilens`, `alt`, `ref`, ...) are handled by Task 5 — leave them as `np.ascontiguousarray(...)` in this task.
+
+- [ ] **Step 1: Write the failing FFI-guard unit test**
+
+Create `tests/unit/dataset/test_ffi_array.py`:
+
+```python
+"""_ffi_array boundary guard (Task 4)."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from genvarloader._dataset._utils import _ffi_array
+
+
+def test_passes_contiguous_correct_dtype():
+    arr = np.arange(10, dtype=np.int32)
+    out = _ffi_array(arr, np.int32, "geno_v_idxs")
+    assert out is arr  # zero-copy: same object
+
+
+def test_raises_on_non_contiguous():
+    base = np.zeros((10, 3), dtype=np.int32)
+    strided = base[:, 1]  # non-contiguous column view
+    assert not strided.flags["C_CONTIGUOUS"]
+    with pytest.raises(ValueError, match="geno_v_idxs"):
+        _ffi_array(strided, np.int32, "geno_v_idxs")
+
+
+def test_raises_on_wrong_dtype():
+    arr = np.arange(10, dtype=np.int64)
+    with pytest.raises(ValueError, match="itv_starts"):
+        _ffi_array(arr, np.int32, "itv_starts")
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_ffi_array.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `ImportError: cannot import name '_ffi_array'`.
+
+- [ ] **Step 3: Implement `_ffi_array`**
+
+In `python/genvarloader/_dataset/_utils.py`, add (the file already imports `numpy as np`):
+
+```python
+def _ffi_array(arr: np.ndarray, dtype, name: str) -> np.ndarray:
+    """Assert a per-sample-scale FFI argument crosses zero-copy.
+
+    Returns ``arr`` unchanged iff it is C-contiguous with exactly ``dtype``;
+    otherwise raises a precise ``ValueError`` naming ``name``. This replaces a
+    silent ``np.ascontiguousarray`` that would copy the whole per-sample-scale
+    memmap (GB-scale at the >1M-sample design target). Use it ONLY for
+    sample-scale memmap args; batch-bounded arrays may keep coercing.
+    """
+    dt = np.dtype(dtype)
+    if not arr.flags["C_CONTIGUOUS"]:
+        raise ValueError(
+            f"FFI argument {name!r} must be C-contiguous to cross zero-copy; got "
+            f"a non-contiguous array (coercing would force a sample-scale copy)."
+        )
+    if arr.dtype != dt:
+        raise ValueError(
+            f"FFI argument {name!r} must have dtype {dt}; got {arr.dtype} "
+            f"(coercing would force a sample-scale cast/copy)."
+        )
+    return arr
+```
+
+- [ ] **Step 4: Run the FFI-guard test to verify it passes**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_ffi_array.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (3 tests).
+
+- [ ] **Step 5: Apply the guard in the track-fused path**
+
+In `python/genvarloader/_dataset/_reconstruct.py`, add the import near the top (it already imports from `._utils`; if not, add `from ._utils import _ffi_array`). Then in the `intervals_and_realign_track_fused(...)` call (`:232-250`), replace the sample-scale args:
+
+`geno_v_idxs` (`:232-234`):
+
+```python
+                        geno_v_idxs=_ffi_array(
+                            self.haps.genotypes.data, np.int32, "geno_v_idxs"
+                        ),
+```
+
+`itv_starts` / `itv_ends` / `itv_values` / `itv_offsets` (`:241-250`):
+
+```python
+                        itv_starts=_ffi_array(
+                            intervals.starts.data, np.int32, "itv_starts"
+                        ),
+                        itv_ends=_ffi_array(intervals.ends.data, np.int32, "itv_ends"),
+                        itv_values=_ffi_array(
+                            intervals.values.data, np.float32, "itv_values"
+                        ),
+                        itv_offsets=_ffi_array(
+                            intervals.starts.offsets, np.int64, "itv_offsets"
+                        ),
+```
+
+Leave `v_starts` and `ilens` (`:236-239`) as `np.ascontiguousarray(...)` — Task 5 converts those to the cached arrays. Leave `o_idx`, `out_ofsts_per_t`, `regions`, `shifts`, `geno_idx`, `track_ofsts_per_t`, `params`, `keep`, `keep_offsets` as `np.ascontiguousarray(...)` (batch-bounded).
+
+- [ ] **Step 6: Apply the guard to the fused haps/annotated/splice calls**
+
+In `python/genvarloader/_dataset/_haps.py`, add `from ._utils import _ffi_array` to the imports if not already present. Then replace `geno_v_idxs` in all three fused calls:
+
+`:796` (plain `reconstruct_haplotypes_fused`):
+
+```python
+                    geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+```
+
+`:869` (`reconstruct_haplotypes_spliced_fused`):
+
+```python
+                geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+```
+
+`:958` (`reconstruct_annotated_haplotypes_fused`):
+
+```python
+                        geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+```
+
+Leave the sub-linear args (`v_starts`, `ilens`, `alt_alleles`, `alt_offsets`, `ref_`, `ref_offsets`) as `np.ascontiguousarray(...)` for now — Task 5. Leave `regions`, `shifts`, `geno_offset_idx`, `keep`, `keep_offsets`, `permuted_regions`, `flat_shifts`, `flat_geno_offset_idx`, `out_offsets` as `np.ascontiguousarray(...)` (batch-bounded). Leave `_as_starts_stops(self.genotypes.offsets)` untouched.
+
+- [ ] **Step 7: Write the failing scale-guard test**
+
+Create `tests/integration/test_scale_guard.py`:
+
+```python
+"""Scale-guard: no per-batch copy materializes a memmap on the read path (Task 4).
+
+Mirrors the py-spy diagnostic that found the defect: monkeypatch
+np.ascontiguousarray over one ds[r, s] and assert zero copies whose source
+.base is an np.memmap.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+
+
+@pytest.fixture
+def _no_memmap_copies(monkeypatch):
+    real = np.ascontiguousarray
+    offenders: list[str] = []
+
+    def spy(a, dtype=None, *args, **kwargs):
+        arr = np.asarray(a)
+        base = getattr(arr, "base", None)
+        if isinstance(base, np.memmap) or isinstance(arr, np.memmap):
+            # A copy would be forced iff non-contiguous or dtype-mismatched.
+            would_copy = (not arr.flags["C_CONTIGUOUS"]) or (
+                dtype is not None and arr.dtype != np.dtype(dtype)
+            )
+            if would_copy:
+                offenders.append(f"{getattr(arr, 'shape', None)} {arr.dtype}->{dtype}")
+        return real(a, dtype, *args, **kwargs)
+
+    monkeypatch.setattr(np, "ascontiguousarray", spy)
+    return offenders
+
+
+def test_tracks_only_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_haps_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs("haplotypes")
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_annotated_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs("annotated")
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+```
+
+- [ ] **Step 8: Run the scale-guard test**
+
+Run: `pixi run -e dev pytest tests/integration/test_scale_guard.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS. (After Task 1 the interval memmaps are contiguous and the guard replaced their `ascontiguousarray`; `genotypes.data`/`offsets` and the reference/variant memmaps are contiguous so no copy is forced. If any test fails, the offender list names the shape/dtype — that is a real sample-scale copy to eliminate, not a test to relax.)
+
+- [ ] **Step 9: Run parity on both backends**
+
+Run: `pixi run -e dev pytest tests/parity tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 10: Lint, format, typecheck, commit**
+
+Run: `pixi run -e dev ruff format python/ tests/ && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck`
+Expected: clean.
+
+```bash
+rtk git add python/genvarloader/_dataset/_utils.py python/genvarloader/_dataset/_reconstruct.py python/genvarloader/_dataset/_haps.py tests/unit/dataset/test_ffi_array.py tests/integration/test_scale_guard.py
+rtk git commit -m "feat(ffi): zero-copy boundary guard for sample-scale memmaps
+
+Replace silent np.ascontiguousarray on per-sample-scale interval/genotype
+memmaps with _ffi_array (cross zero-copy or raise). Scale-guard test asserts
+no memmap-materializing copy on the read path.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 5: RAM-cache the sub-linear static arrays (Component E)
+
+Cache, once per `Haps` reconstructor, the typed-contiguous per-variant/reference arrays the kernels consume, dropping their per-batch `np.ascontiguousarray` (chiefly the `int64`→`int32` recast of `v_starts`).
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_haps.py` (add `_HapsFfiStatic` dataclass + `_ffi_static` field + `ffi_static` property on `Haps` `:238-280`; replace sub-linear args at the fused calls `:797-806`, `:870-877`, `:959-970`)
+- Modify: `python/genvarloader/_dataset/_reconstruct.py` (`v_starts`/`ilens` in the track-fused call `:236-239`)
+- Create: `tests/unit/dataset/test_haps_ffi_cache.py`
+
+**Interfaces:**
+- Produces: `Haps.ffi_static -> _HapsFfiStatic` (cached) with fields:
+  - `v_starts: NDArray[np.int32]` (from `variants.start`, int64→int32)
+  - `ilens: NDArray[np.int32]` (from `variants.ilen`)
+  - `alt_alleles: NDArray[np.uint8]` (from `variants.alt.data.view(np.uint8)`)
+  - `alt_offsets: NDArray[np.int64]` (from `variants.alt.offsets`)
+  - `ref: NDArray[np.uint8] | None` (from `reference.reference`; `None` if no reference)
+  - `ref_offsets: NDArray[np.int64] | None` (from `reference.offsets`; `None` if no reference)
+- Consumes: `self.variants` (`_Variants`), `self.reference` (`Reference | None`).
+- **Excluded from caching:** per-sample-scale arrays (genotypes) — those are governed by Task 4.
+
+- [ ] **Step 1: Write the failing cache test**
+
+Create `tests/unit/dataset/test_haps_ffi_cache.py`:
+
+```python
+"""Haps caches FFI-ready sub-linear arrays once (Task 5)."""
+
+from __future__ import annotations
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._dataset._haps import Haps
+
+
+def _haps(track_dataset_path, reference) -> Haps:
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs("haplotypes")
+    seqs = ds._seqs
+    assert isinstance(seqs, Haps)
+    return seqs
+
+
+def test_ffi_static_cached(track_dataset_path, reference):
+    haps = _haps(track_dataset_path, reference)
+    first = haps.ffi_static
+    second = haps.ffi_static
+    assert first is second  # cached, computed once
+
+
+def test_ffi_static_contiguous_and_typed(track_dataset_path, reference):
+    s = _haps(track_dataset_path, reference).ffi_static
+    assert s.v_starts.dtype == np.int32 and s.v_starts.flags["C_CONTIGUOUS"]
+    assert s.ilens.dtype == np.int32 and s.ilens.flags["C_CONTIGUOUS"]
+    assert s.alt_alleles.dtype == np.uint8 and s.alt_alleles.flags["C_CONTIGUOUS"]
+    assert s.alt_offsets.dtype == np.int64 and s.alt_offsets.flags["C_CONTIGUOUS"]
+    assert s.ref is not None and s.ref.dtype == np.uint8 and s.ref.flags["C_CONTIGUOUS"]
+    assert s.ref_offsets is not None and s.ref_offsets.dtype == np.int64
+
+
+def test_ffi_static_v_starts_matches_source(track_dataset_path, reference):
+    haps = _haps(track_dataset_path, reference)
+    np.testing.assert_array_equal(
+        haps.ffi_static.v_starts, np.asarray(haps.variants.start, np.int32)
+    )
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_haps_ffi_cache.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `AttributeError: 'Haps' object has no attribute 'ffi_static'` (and `_HapsFfiStatic` import would fail if referenced).
+
+- [ ] **Step 3: Add the cache dataclass and property**
+
+In `python/genvarloader/_dataset/_haps.py`, add a small dataclass above `class Haps` (near the existing `@dataclass(slots=True)` at `:238`):
+
+```python
+@dataclass(slots=True)
+class _HapsFfiStatic:
+    """FFI-ready, contiguous, correctly-typed sub-linear arrays consumed by the
+    fused kernels. Grows only with the variant/reference count (sub-linear in
+    samples), so it is cached for the lifetime of the Haps reconstructor."""
+
+    v_starts: NDArray[np.int32]
+    ilens: NDArray[np.int32]
+    alt_alleles: NDArray[np.uint8]
+    alt_offsets: NDArray[np.int64]
+    ref: "NDArray[np.uint8] | None"
+    ref_offsets: "NDArray[np.int64] | None"
+```
+
+On the `Haps` dataclass, add a private cache field. Place it among the other `field(init=False)` declarations (e.g. after `available_var_fields: list[str] = field(init=False)` at `:262`):
+
+```python
+    _ffi_static: "_HapsFfiStatic | None" = field(default=None, init=False)
+```
+
+And add the property (anywhere in the `Haps` class body, e.g. after `__post_init__`):
+
+```python
+    @property
+    def ffi_static(self) -> _HapsFfiStatic:
+        """Lazily-computed, cached FFI-ready sub-linear arrays (see _HapsFfiStatic)."""
+        if self._ffi_static is None:
+            ref = self.reference
+            self._ffi_static = _HapsFfiStatic(
+                v_starts=np.ascontiguousarray(self.variants.start, np.int32),
+                ilens=np.ascontiguousarray(self.variants.ilen, np.int32),
+                alt_alleles=np.ascontiguousarray(
+                    self.variants.alt.data.view(np.uint8), np.uint8
+                ),
+                alt_offsets=np.ascontiguousarray(self.variants.alt.offsets, np.int64),
+                ref=None if ref is None else np.ascontiguousarray(ref.reference, np.uint8),
+                ref_offsets=None
+                if ref is None
+                else np.ascontiguousarray(ref.offsets, np.int64),
+            )
+        return self._ffi_static
+```
+
+(`Haps` is `@dataclass(slots=True)` but not frozen, so assigning `self._ffi_static` is allowed; `NDArray` is already imported in `_haps.py`.)
+
+- [ ] **Step 4: Use the cache in the fused haps/annotated/splice calls**
+
+In `python/genvarloader/_dataset/_haps.py`, at the plain fused call (`:797-806`) replace:
+
+```python
+                    v_starts=np.ascontiguousarray(self.variants.start, np.int32),
+                    ilens=np.ascontiguousarray(self.variants.ilen, np.int32),
+                    alt_alleles=np.ascontiguousarray(
+                        self.variants.alt.data.view(np.uint8), np.uint8
+                    ),
+                    alt_offsets=np.ascontiguousarray(
+                        self.variants.alt.offsets, np.int64
+                    ),
+                    ref_=np.ascontiguousarray(self.reference.reference, np.uint8),
+                    ref_offsets=np.ascontiguousarray(self.reference.offsets, np.int64),
+```
+
+with:
+
+```python
+                    v_starts=self.ffi_static.v_starts,
+                    ilens=self.ffi_static.ilens,
+                    alt_alleles=self.ffi_static.alt_alleles,
+                    alt_offsets=self.ffi_static.alt_offsets,
+                    ref_=self.ffi_static.ref,
+                    ref_offsets=self.ffi_static.ref_offsets,
+```
+
+Apply the identical replacement at the spliced fused call (`:870-877`) and the annotated fused call (`:959-970`), matching each call's indentation. (Each of those three sites asserts `self.reference is not None` upstream, so `ffi_static.ref`/`ref_offsets` are non-`None` there.)
+
+- [ ] **Step 5: Use the cache in the track-fused call**
+
+In `python/genvarloader/_dataset/_reconstruct.py`, at the `intervals_and_realign_track_fused(...)` call (`:236-239`) replace:
+
+```python
+                        v_starts=np.ascontiguousarray(
+                            self.haps.variants.start, np.int32
+                        ),
+                        ilens=np.ascontiguousarray(self.haps.variants.ilen, np.int32),
+```
+
+with:
+
+```python
+                        v_starts=self.haps.ffi_static.v_starts,
+                        ilens=self.haps.ffi_static.ilens,
+```
+
+- [ ] **Step 6: Run the cache test**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_haps_ffi_cache.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (3 tests).
+
+- [ ] **Step 7: Run parity + scale-guard on both backends**
+
+Run: `pixi run -e dev pytest tests/parity tests/dataset tests/unit tests/integration -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (scale-guard still green — `v_starts` is no longer recast from a memmap per batch).
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 8: Lint, format, typecheck, commit**
+
+Run: `pixi run -e dev ruff format python/ tests/ && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck`
+Expected: clean.
+
+```bash
+rtk git add python/genvarloader/_dataset/_haps.py python/genvarloader/_dataset/_reconstruct.py tests/unit/dataset/test_haps_ffi_cache.py
+rtk git commit -m "perf(haps): cache FFI-ready sub-linear per-variant arrays
+
+Compute v_starts(int32)/ilens/alt/ref once per reconstructor instead of
+re-coercing every batch (chiefly the int64->int32 v_starts recast).
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 6: Skip zero-initialization where provably full-write (Component F)
+
+Replace `Array1::zeros(total)` with uninitialized allocation in the fused kernels, **only** for buffers the reconstruct/track core overwrites at every position. Isolated in its own commit so it can be reverted independently — this is the one component where parity could regress if the full-write invariant is wrong.
+
+**Files:**
+- Modify: `src/ffi/mod.rs` (add `uninit_output` helper; apply at the data-buffer allocations `:453`, `:530`, `:669`, `:670`, `:671`; conditionally `:867`)
+
+**Interfaces:**
+- Produces: `fn uninit_output<T: Copy>(len: usize) -> Array1<T>` — an uninitialized owned buffer; safe only when every element is written before any read.
+- **Do NOT touch** the `out_offsets_vec` allocations (`:432`, `:648`) — those are read during incremental accumulation.
+
+- [ ] **Step 1: Establish the parity baseline (both backends)**
+
+Run: `pixi run -e dev maturin develop --release && pixi run -e dev cargo test`
+Expected: PASS (clean starting point before the risky change).
+
+Run: `pixi run -e dev pytest tests/parity/test_reconstruct_haplotypes_parity.py tests/parity/test_fused_haps_parity.py tests/parity/test_fused_tracks_parity.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 2: Add the uninitialized-allocation helper**
+
+In `src/ffi/mod.rs`, add near the top of the module (after the imports, before the first `#[pyfunction]`):
+
+```rust
+/// Allocate an output buffer of `len` elements WITHOUT zero-initialization.
+///
+/// SAFETY/INVARIANT: every element is fully overwritten by the reconstruct/track
+/// core before it is read. For in-contract inputs the core writes every output
+/// position; out-of-contract inputs (e.g. a deletion driving `ref_idx` past the
+/// contig end) are already undefined and excluded from the parity oracle by the
+/// overshoot/double-init guards in
+/// tests/parity/test_reconstruct_haplotypes_parity.py, so skipping the zero-init
+/// adds no new observable exposure. `T` is a plain numeric type (u8/i32/f32) with
+/// no invalid bit patterns.
+#[allow(clippy::uninit_vec)]
+fn uninit_output<T: Copy>(len: usize) -> Array1<T> {
+    let mut v: Vec<T> = Vec::with_capacity(len);
+    // SAFETY: see function-level invariant — every element is written before read.
+    unsafe {
+        v.set_len(len);
+    }
+    Array1::from_vec(v)
+}
+```
+
+- [ ] **Step 3: Apply to the plain fused haplotype buffer**
+
+In `src/ffi/mod.rs:453` replace:
+
+```rust
+    let mut out_data: Array1<u8> = Array1::zeros(total);
+```
+
+with:
+
+```rust
+    let mut out_data: Array1<u8> = uninit_output(total);
+```
+
+- [ ] **Step 4: Apply to the spliced fused haplotype buffer**
+
+In `src/ffi/mod.rs:530` replace the same `Array1::zeros(total)` for `out_data` with `uninit_output(total)`.
+
+- [ ] **Step 5: Apply to the annotated fused buffers**
+
+In `src/ffi/mod.rs:669-671` replace:
+
+```rust
+    let mut out_data: Array1<u8> = Array1::zeros(total);
+    let mut annot_v: Array1<i32> = Array1::zeros(total);
+    let mut annot_pos: Array1<i32> = Array1::zeros(total);
+```
+
+with:
+
+```rust
+    let mut out_data: Array1<u8> = uninit_output(total);
+    let mut annot_v: Array1<i32> = uninit_output(total);
+    let mut annot_pos: Array1<i32> = uninit_output(total);
+```
+
+- [ ] **Step 6: Verify the tracks scratch buffer is full-write before converting**
+
+The tracks-fused scratch (`src/ffi/mod.rs:867`, `Array1::<f32>::zeros(scratch_len)`) is filled by `intervals::intervals_to_tracks` and then read by `shift_and_realign_tracks_sparse`. Read `intervals_to_tracks` (in `src/intervals.rs` or wherever the core lives — find with `grep -rn "fn intervals_to_tracks" src/`) and confirm it writes **every** position of the scratch slice for in-contract inputs. If any scratch position can be left unwritten (a gap defaulting to 0 that the downstream read relies on), **leave `:867` as `Array1::zeros`** and add a one-line comment explaining why it must stay zero-initialized. If it is provably full-write, replace `:867`:
+
+```rust
+    let mut scratch = uninit_output::<f32>(scratch_len);
+```
+
+Record your determination in the commit message.
+
+- [ ] **Step 7: Rebuild and run cargo tests + clippy**
+
+Run: `pixi run -e dev maturin develop --release && pixi run -e dev cargo test && pixi run -e dev cargo clippy`
+Expected: PASS, clippy clean (the `#[allow(clippy::uninit_vec)]` is scoped to the helper).
+
+- [ ] **Step 8: Run the reconstruct/track parity suites on both backends**
+
+Run: `pixi run -e dev pytest tests/parity/test_reconstruct_haplotypes_parity.py tests/parity/test_fused_haps_parity.py tests/parity/test_fused_tracks_parity.py tests/parity/test_spliced_haplotypes_parity.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS. (If any parity test now fails, the full-write invariant was wrong for that buffer — revert the offending `uninit_output` line back to `Array1::zeros` and re-run.)
+
+- [ ] **Step 9: Full suite + commit**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+```bash
+rtk git add src/ffi/mod.rs
+rtk git commit -m "perf(ffi): skip zero-init of fully-overwritten fused output buffers
+
+Allocate out_data/annot_v/annot_pos (and scratch where verified full-write)
+uninitialized; the reconstruct/track core writes every in-contract position.
+Out-of-contract inputs are already excluded from the parity oracle. Isolated
+for independent revert.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 7: Documentation — SKILL.md + roadmap
+
+Per `CLAUDE.md`, the new public symbol (`migrate`) and the on-disk format bump require a `skills/genvarloader/SKILL.md` update; the roadmap is the source of truth for the migration targets.
+
+**Files:**
+- Modify: `skills/genvarloader/SKILL.md`
+- Modify: `docs/roadmaps/rust-migration.md`
+
+**Interfaces:** none (docs only).
+
+- [ ] **Step 1: Read the current skill and roadmap sections**
+
+Run: `rtk read skills/genvarloader/SKILL.md`
+Read the "open a dataset" workflow section and the "Common gotchas" / "Where to look next" pointer table.
+
+Run: `rtk read docs/roadmaps/rust-migration.md`
+Find the Phase 3 optimization targets (targets 1–2 and the zero-init part of target 3) referenced by the spec.
+
+- [ ] **Step 2: Update SKILL.md**
+
+In `skills/genvarloader/SKILL.md`:
+- In the open-a-dataset workflow, add a note that datasets written by genvarloader < 2.0 must be upgraded once with `genvarloader.migrate(path)` (in place, streaming, idempotent, crash-safe), and that opening a pre-2.0 dataset raises a `ValueError` with that hint.
+- Add `migrate(path)` to the public-API surface listing (it is now in `__all__`).
+- Note that format 2.0 stores track intervals as struct-of-arrays (`starts/ends/values.npy`) rather than the 1.x `intervals.npy` record array — relevant to anyone inspecting a dataset directory on disk.
+- Re-check the "Common gotchas" and "Where to look next" pointer table for accuracy against this change.
+
+- [ ] **Step 3: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`:
+- Tick the optimization targets addressed: the track-interval AoS→SoA copy (target 1), the genotype `ascontiguousarray` footgun + sub-linear caching (target 2), and the zero-init skip portion of target 3.
+- Record throughput: re-run `pixi run -e dev pytest tests/benchmarks/test_e2e.py -q --basetemp=$(pwd)/.pytest_tmp` on both `GVL_BACKEND=rust` and `GVL_BACKEND=numba` and note the rust tracks/annotated numbers (expected to close further on numba now the per-batch interval copy is gone). Recorded, not gated.
+- Set the relevant phase status marker (⬜/🚧/✅) and link this PR.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add skills/genvarloader/SKILL.md docs/roadmaps/rust-migration.md
+rtk git commit -m "docs: document gvl.migrate + format 2.0 SoA; record throughput
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+- [ ] **Step 5: Final full-tree verification before integration**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (whole tree, both dataset and unit).
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+Run: `pixi run -e dev cargo test && pixi run -e dev cargo clippy && pixi run -e dev ruff check python/ tests/ && pixi run -e dev typecheck`
+Expected: all clean.
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Component A (AoS→SoA + version bump) → Task 1, incl. the **two Rust writers** (`bigwig.rs`, `tables.rs`) the spec's "no Rust change" note missed, plus their oracle byte tests, and all four Python/Rust writers + the reader.
+- Component B (version gate) → Task 2.
+- Component C (`gvl.migrate`) → Task 3.
+- Component D (zero-copy FFI + `_ffi_array` guard) → Task 4, incl. the scale-guard gate.
+- Component E (cache sub-linear arrays) → Task 5.
+- Component F (skip zero-init) → Task 6, with the scratch-buffer full-write verification the spec flagged as the one parity-risk site.
+- Testing & parity (round-trip, version gate, scale-guard, FFI-guard) → Tasks 1–5 tests; both-backend parity runs in every task.
+- SKILL.md + roadmap → Task 7.
+
+**Placeholder scan:** every code step shows complete code; every run step shows the exact command and expected result. The one deliberately conditional step (Task 6 Step 6, scratch buffer) gives an explicit decision rule and both outcomes, because correctness there depends on a fact (`intervals_to_tracks` full-write) that must be verified in-repo, not assumed.
+
+**Type/name consistency:** `_ffi_array(arr, dtype, name)` (Task 4) is consumed unchanged in Task 4 call sites. `_HapsFfiStatic` field names (`v_starts`, `ilens`, `alt_alleles`, `alt_offsets`, `ref`, `ref_offsets`) (Task 5) match the kernel kwargs (`v_starts`, `ilens`, `alt_alleles`, `alt_offsets`, `ref_`, `ref_offsets`) — note the kernel kwarg is `ref_` but the cache field is `ref`; the call sites map `ref_=self.ffi_static.ref`. `track_dataset_path` fixture (Task 1) is reused by Tasks 2–5. `DATASET_FORMAT_VERSION` and `_check_dataset_format_version` (Tasks 1–2) are imported consistently. `uninit_output<T>` (Task 6) is applied only to data buffers, never to `out_offsets_vec`.
+
+**Notes carried forward for the implementer:**
+- The second, unused `INTERVAL_DTYPE` at `_types.py:18` is intentionally left untouched (not on any path).
+- `_as_starts_stops` / `_geno_offsets_2d` are intentionally unchanged (output base is not a memmap → never trips the scale-guard).
+- After Rust edits, always `maturin develop --release` before Python tests.
diff --git a/docs/superpowers/plans/2026-06-26-phase-4-measurements.md b/docs/superpowers/plans/2026-06-26-phase-4-measurements.md
new file mode 100644
index 00000000..ba91c1ed
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-phase-4-measurements.md
@@ -0,0 +1,88 @@
+# Phase 4 Close-Out: Perf + RSS Measurements
+
+**Date:** 2026-06-26
+**Machine:** Carter HPC (AMD EPYC 7543, linux-64)
+**Corpus:** chr22_geuv (5 samples, 165 e-gene regions)
+**Measured-at code HEAD:** 32132c9 (test(bench): realistic chr22_geuv write/update perf driver)
+**Build:** `maturin develop --release` (abi3, CPython 3.10)
+**NUMBA_NUM_THREADS=1** (single-threaded control)
+
+---
+
+## write() — wall-clock (median of 3)
+
+| Run | wall |
+|-----|------|
+| 1   | 1.959s |
+| 2   | 1.911s |
+| 3   | 1.934s |
+
+**Median: 1.934s**
+
+## write() — peak RSS (memray)
+
+Peak memory usage: **3.520 GB**
+
+---
+
+## update() — wall-clock (median of 3)
+
+| Run | wall |
+|-----|------|
+| 1   | 0.091s |
+| 2   | 0.081s |
+| 3   | 0.081s |
+
+**Median: 0.081s** (track=read-depth-2, samples=5)
+
+## update() — peak RSS (memray)
+
+Peak memory usage: **3.519 GB**
+
+> **Caveat:** run_update() writes the base dataset (untimed gvl.write) and then runs the timed gvl.update in the SAME process. This memray process-peak is therefore dominated by the base-dataset write (≈ the write() peak above), NOT the marginal cost of update(). The update WALL (0.081s) IS correctly isolated to the gvl.update call; update's peak RSS in isolation is not measured by this single-process driver.
+
+---
+
+## Full-tree parity gate
+
+### Rust backend (default)
+```
+984 passed, 21 skipped, 4 xfailed, 1 warning in 277.23s (0:04:37)
+```
+Result: **PASS** (0 failures)
+
+### Numba backend (GVL_BACKEND=numba)
+```
+984 passed, 21 skipped, 4 xfailed, 1 warning in 254.08s (0:04:14)
+```
+Result: **PASS** (0 failures). @slow tests run by default in this repo (no -m "not slow" addopts, no --runslow skip hook). The pre-existing flaky test tests/unit/test_double_buffered_loader.py::test_shm_cleanup_after_close (intermittent /dev/shm gvl- segment leak on the numba backend; rust always passes) did NOT fail this run — not a regression.
+
+---
+
+## Write-path parity (tests/parity)
+
+```
+77 passed, 1 skipped in 79.77s (0:01:19)
+```
+Result: **PASS**
+
+---
+
+## cargo-test + lint + typecheck
+
+| Check | Result |
+|-------|--------|
+| `cargo test --release` | PASS (107 + 4 + 0 = 111 tests; pre-existing `unused variable: n_contigs` warning noted, not a regression) |
+| `ruff check python/ tests/` | PASS (all checks passed) |
+| `ruff format --check python/ tests/` | PASS (after auto-format of _write.py) |
+| `pyrefly check` | PASS (0 errors, 37 suppressed, 392 warnings) |
+
+---
+
+## Notes
+
+- Test infrastructure: added `__init__.py` to `tests/unit/`, `tests/unit/dataset/`,
+  `tests/integration/`, `tests/integration/dataset/` to fix collection collision between
+  two same-named `test_write.py` files (committed separately as fix commit f92e386).
+- `maturin develop --release` produced abi3 wheel `genvarloader-0.35.0-cp310-abi3-linux_x86_64.whl`.
+- memray output files written to worktree root (w.bin, u.bin) to avoid cross-device EXDEV.
diff --git a/docs/superpowers/plans/2026-06-26-rc-alleles-instruction-tuning.md b/docs/superpowers/plans/2026-06-26-rc-alleles-instruction-tuning.md
new file mode 100644
index 00000000..cd2ca1fe
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rc-alleles-instruction-tuning.md
@@ -0,0 +1,292 @@
+# rc_alleles_inplace Instruction-Level Tuning Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Reduce the instruction count of `variants::rc_alleles_inplace` (the only compute kernel from PR #251, never covered by the round-3 #252 pass) by fusing its row→allele mask expansion and delegation into a single pass, byte-identical to today.
+
+**Architecture:** Extract the per-row reverse+complement body (already round-3-vectorized inside `rc_flat_rows_inplace`) into a shared `#[inline]` helper `reverse::rc_row`, then rewrite `rc_alleles_inplace` to walk masked rows → alleles and call `rc_row` directly — deleting a per-call `Vec<bool>` heap alloc+memset, an `Array1` wrap, and a redundant full-allele rescan.
+
+**Tech Stack:** Rust (ndarray, PyO3), `cargo-show-asm` (`cargo asm`), `maturin`, `pixi` (`-e dev`), `pytest` + `hypothesis` (parity), `cargo test`.
+
+**Spec:** `docs/superpowers/specs/2026-06-26-rc-alleles-instruction-tuning-design.md`
+
+## Global Constraints
+
+Every task implicitly includes these. Values copied verbatim from the spec.
+
+- **Parity is sacrosanct:** `rc_alleles_inplace` output must stay **byte-identical** to the seqpro reference on both backends. The migration contract; a change only lands when parity holds.
+- **Gate = parity + instruction-count drop + no throughput regression** (NOT round-3's strict "improve throughput or revert"). This path (`rc_alleles` fires only on negative-strand variants / `RaggedVariants` reads) is wall-clock noise-dominated per the roadmap. Keep iff: parity byte-identical both backends; `cargo asm` instruction count drops; `profile.py --mode variants` rust÷numba **holds** (same session, both backends); and `rc_flat_rows_inplace` asm stays equivalent after the extract.
+- **Risk control on the shared kernel:** `rc_flat_rows_inplace` is on the round-3-tuned haplotype hot path. The `#[inline]` extract must leave its codegen equivalent. If extraction perturbs it, fall back to duplicating the ~6-line complement locally in `rc_alleles_inplace` and leave `rc_flat_rows_inplace` byte-for-byte untouched.
+- **No scope creep:** no on-disk format change, no public API change, no new kernels, no rayon/batch parallelism (Phase 5), no numba/seqpro-reference deletion (Phase 5). No change to `flank_tokens` or `_FlatVariantWindows` (never RC'd).
+- **Always rebuild `--release` before any `cargo asm` / throughput measurement.** `cargo asm` reads the last build's artifact; a stale build gives misleading asm.
+- **Measurement env:** corpus `tests/benchmarks/data/chr22_geuv.gvl`, `NUMBA_NUM_THREADS=1`, `maturin develop --release`, Carter HPC. Report the **rust ÷ numba ratio** measured in the *same session* (shared-node load drifts across sessions).
+- **HPC note:** dataset/parity tests need `--basetemp=$(pwd)/.pytest_tmp` (avoids `os.link` cross-device Errno 18).
+- **Worktrees:** never symlink `.pixi` into the worktree — `maturin develop` repoints the shared env's `.pth`/`.so` and corrupts the parent. Each worktree gets its own fresh pixi env.
+- **Roadmap contract:** this lands under Phase 3, Target-6 / round-3 area of `docs/roadmaps/rust-migration.md`; the roadmap must be updated as part of the work.
+- **Commit trailer:** end every commit message with `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`.
+
+---
+
+### Task 1: Worktree + fresh pixi env + baseline asm capture
+
+**Files:**
+- Create: new git worktree directory (outside the repo tree), branch `opt/rc-alleles-instruction-tuning` off `rust-migration`.
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: an isolated worktree with its own pixi env, a working `--release` build, and the recorded `asm_*_before.txt` baselines all later tasks compare against.
+
+- [ ] **Step 1: Create the worktree via the using-git-worktrees skill**
+
+Use the `superpowers:using-git-worktrees` skill to create a worktree for branch `opt/rc-alleles-instruction-tuning` based on `rust-migration`. Do **not** symlink `.pixi` into it (per Global Constraints).
+
+- [ ] **Step 2: Install a fresh dev pixi env in the worktree**
+
+Run (from the worktree root): `pixi install -e dev`
+Expected: a populated `.pixi/envs/dev` local to the worktree.
+
+- [ ] **Step 3: Release build + variants-mode smoke**
+
+Run: `pixi run -e dev maturin develop --release`
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 20`
+Expected: a `done wall=... throughput=... batch/s` line, no exception. (If the corpus is missing, build it: `pixi run -e dev python tests/benchmarks/data/build_realistic.py`.)
+
+- [ ] **Step 4: Record the asm baselines (evidence)**
+
+Run: `cargo asm --rust genvarloader::variants::rc_alleles_inplace > asm_rc_alleles_before.txt 2>&1`
+Run: `cargo asm --rust genvarloader::reverse::rc_flat_rows_inplace > asm_rc_flat_before.txt 2>&1`
+Expected: each prints x86-64 assembly for the function. Note the total instruction count of each (used as the before-numbers in Task 2 and Task 3). If `cargo asm` lists candidates instead of a body, copy the exact mangled path it offers and use that verbatim in later tasks.
+
+- [ ] **Step 5: Record the throughput baseline (gate reference)**
+
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000`
+Run: `GVL_BACKEND=numba pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000`
+Record both ms/batch and the rust ÷ numba ratio. This is the number the final change must hold (not regress).
+
+No code change yet; nothing to commit.
+
+---
+
+### Task 2: Extract the shared `reverse::rc_row` helper
+
+**Files:**
+- Modify: `src/reverse.rs` (add `rc_row`; rewrite `rc_flat_rows_inplace`'s masked branch to call it)
+- Test: `src/reverse.rs` `#[cfg(test)] mod tests` (existing reverse/rc tests are the regression lock)
+
+**Interfaces:**
+- Consumes: nothing new.
+- Produces: `pub(crate) fn rc_row(row: &mut [u8])` — reverses `row` then applies the branchless-vectorized ACGT↔TGCA complement (identity for other bytes), byte-identical to the prior inline body. `rc_flat_rows_inplace` keeps its exact signature `(data: &mut [u8], offsets: ArrayView1<i64>, to_rc: ArrayView1<bool>)` and behavior.
+
+- [ ] **Step 1: Confirm the existing reverse tests pass (regression baseline)**
+
+Run: `pixi run -e dev cargo test --lib reverse 2>&1 | tail -5`
+Expected: `test result: ok` (covers `rc_reverses_and_complements_masked_rows_only`, `rc_handles_odd_length_and_n`, `empty_row_and_all_false_are_noops`, `arith_complement_matches_comp_for_all_256_bytes`, the f32/i32 reverse tests). These are the byte-identity lock for the extract.
+
+- [ ] **Step 2: Add `rc_row` and call it from `rc_flat_rows_inplace`**
+
+In `src/reverse.rs`, add `rc_row` (the body is lifted verbatim from the current `rc_flat_rows_inplace` masked branch):
+
+```rust
+/// Reverse a single row of bytes then DNA-complement it in place via the
+/// branchless ACGT↔TGCA arithmetic (identity for every other byte; A/T = XOR
+/// 0x15, C/G = XOR 0x04). `#[inline]` so callers (rc_flat_rows_inplace,
+/// rc_alleles_inplace) inline it back to the prior codegen.
+#[inline]
+pub(crate) fn rc_row(row: &mut [u8]) {
+    row.reverse();
+    for b in row.iter_mut() {
+        let v = *b;
+        let at = (((v == b'A') | (v == b'T')) as u8).wrapping_neg(); // 0xFF if A/T
+        let cg = (((v == b'C') | (v == b'G')) as u8).wrapping_neg(); // 0xFF if C/G
+        *b = v ^ (at & 21) ^ (cg & 4);
+    }
+}
+```
+
+Replace the body of `rc_flat_rows_inplace` with the helper call:
+
+```rust
+/// Reverse AND complement bytes within each masked row via `rc_row`.
+pub fn rc_flat_rows_inplace(
+    data: &mut [u8],
+    offsets: ArrayView1<i64>,
+    to_rc: ArrayView1<bool>,
+) {
+    for i in 0..to_rc.len() {
+        if !to_rc[i] {
+            continue;
+        }
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        rc_row(&mut data[s..e]);
+    }
+}
+```
+
+- [ ] **Step 3: Rebuild and run the reverse tests — must still pass**
+
+Run: `pixi run -e dev maturin develop --release`
+Run: `pixi run -e dev cargo test --lib reverse 2>&1 | tail -5`
+Expected: `test result: ok` (unchanged from Step 1 — proves the extract is byte-identical).
+
+- [ ] **Step 4: Confirm `rc_flat_rows_inplace` asm is equivalent (risk gate)**
+
+Run: `cargo asm --rust genvarloader::reverse::rc_flat_rows_inplace > asm_rc_flat_after.txt 2>&1`
+Run: `diff asm_rc_flat_before.txt asm_rc_flat_after.txt; echo "exit=$?"`
+Expected: identical or trivially-equivalent asm (same instruction count; only label/address churn). If the instruction count rose or the loop changed shape, the `#[inline]` extract perturbed the tuned kernel — **revert `rc_flat_rows_inplace` to its original inline body** (leave it byte-for-byte untouched) and instead duplicate the `rc_row` body locally inside `rc_alleles_inplace` in Task 3. Record which path was taken.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/reverse.rs
+git commit -m "refactor(rust): extract reverse::rc_row shared helper
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: Fuse `rc_alleles_inplace`
+
+**Files:**
+- Modify: `src/variants/mod.rs` (rewrite `rc_alleles_inplace`, ~lines 88-118)
+- Test: `src/variants/mod.rs` `#[cfg(test)] mod tests` (existing `rc_alleles_*` tests are the regression lock); `tests/parity/test_rc_alleles_parity.py`
+
+**Interfaces:**
+- Consumes: `crate::reverse::rc_row` (Task 2).
+- Produces: `rc_alleles_inplace` keeps its exact signature `(byte_data: &mut [u8], seq_offsets: ArrayView1<i64>, var_offsets: ArrayView1<i64>, to_rc_row: ArrayView1<bool>)` and byte-identical output; no longer allocates a `Vec<bool>` / `Array1` or rescans all alleles.
+
+- [ ] **Step 1: Confirm the existing rc_alleles cargo tests pass (regression baseline)**
+
+Run: `pixi run -e dev cargo test --lib rc_alleles 2>&1 | tail -5`
+Expected: `test result: ok` (`rc_alleles_rcs_only_masked_rows`, `rc_alleles_all_false_is_noop`, `rc_alleles_handles_empty_allele_and_n`). These pin byte-identity through the rewrite.
+
+- [ ] **Step 2: Rewrite `rc_alleles_inplace` as a single fused pass**
+
+In `src/variants/mod.rs`, replace the body of `rc_alleles_inplace` (keep the doc comment; update its last paragraph) with:
+
+```rust
+pub fn rc_alleles_inplace(
+    byte_data: &mut [u8],
+    seq_offsets: ndarray::ArrayView1<i64>,
+    var_offsets: ndarray::ArrayView1<i64>,
+    to_rc_row: ndarray::ArrayView1<bool>,
+) {
+    // Single fused pass: for each masked (b*p) row, reverse-complement each of
+    // its alleles directly via `reverse::rc_row`. `var_offsets` partition the
+    // alleles by row (contiguous, disjoint), so this RCs exactly the alleles the
+    // old per-allele-mask delegation did, in the same order — byte-identical —
+    // without the intermediate `Vec<bool>` alloc or the second full-allele scan.
+    for g in 0..to_rc_row.len() {
+        if !to_rc_row[g] {
+            continue;
+        }
+        let a0 = var_offsets[g] as usize;
+        let a1 = var_offsets[g + 1] as usize;
+        for a in a0..a1 {
+            let s = seq_offsets[a] as usize;
+            let e = seq_offsets[a + 1] as usize;
+            crate::reverse::rc_row(&mut byte_data[s..e]);
+        }
+    }
+}
+```
+
+> If Task 2 Step 4 took the fallback path (kept `rc_flat_rows_inplace` untouched, no shared helper), inline the `rc_row` body here instead of calling `crate::reverse::rc_row` — i.e. `let row = &mut byte_data[s..e]; row.reverse(); for b in row.iter_mut() { ... }` with the same A/T XOR 21, C/G XOR 4 arithmetic.
+
+- [ ] **Step 3: Rebuild and run the rc_alleles cargo tests — must still pass**
+
+Run: `pixi run -e dev maturin develop --release`
+Run: `pixi run -e dev cargo test --lib rc_alleles 2>&1 | tail -5`
+Expected: `test result: ok` (unchanged from Step 1 — proves the fuse is byte-identical).
+
+- [ ] **Step 4: Run the Python parity suite (byte-identical, both backends)**
+
+Run: `pixi run -e dev pytest tests/parity/test_rc_alleles_parity.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (the hypothesis parity test + the `_FlatAlleles.reverse_masked` spy test). This compares the rust kernel against the seqpro reference across the allele-batch matrix.
+
+- [ ] **Step 5: Record the asm delta (evidence)**
+
+Run: `cargo asm --rust genvarloader::variants::rc_alleles_inplace > asm_rc_alleles_after.txt 2>&1`
+Run: `diff asm_rc_alleles_before.txt asm_rc_alleles_after.txt; echo "exit=$?"`
+Expected: lower total instruction count than `asm_rc_alleles_before.txt` (the `Vec<bool>` alloc, memset, `Array1::from_vec`, and second scan are gone). Record `<before>→<after>` instruction count.
+
+- [ ] **Step 6: Confirm no throughput regression (gate)**
+
+Run: `pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000`
+Run: `GVL_BACKEND=numba pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000`
+Expected: rust ÷ numba ratio **holds** vs the Task 1 Step 5 baseline (no regression; improvement is a bonus, not required). Record the ratio.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add src/variants/mod.rs
+git commit -m "perf(rust): fuse rc_alleles_inplace — <before>→<after> instrs, drop Vec<bool> alloc + rescan
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 4: Full-tree gate + roadmap update + finish
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (Target-6 / round-3 area)
+
+**Interfaces:**
+- Consumes: the kept commits from Tasks 2-3 + their recorded asm/ratio deltas.
+- Produces: a landed, fully-verified pass with the roadmap updated per the migration contract.
+
+- [ ] **Step 1: Full pytest tree on BOTH backends**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: both green with the same passed/xfailed profile (byte-identical parity proven on both backends). Read the output; investigate any new failure before proceeding — do NOT claim success without it.
+
+- [ ] **Step 2: cargo tests + lint + format + typecheck + wheel build**
+
+Run: `pixi run -e dev cargo test 2>&1 | tail -5` → `test result: ok`
+Run: `pixi run -e dev ruff check python/ tests/` → clean
+Run: `pixi run -e dev ruff format --check python/ tests/` → clean
+Run: `pixi run -e dev typecheck` → clean
+Run: `pixi run -e dev maturin build 2>&1 | tail -3` → abi3 wheel builds
+
+- [ ] **Step 3: Update the roadmap**
+
+In `docs/roadmaps/rust-migration.md`, under the Target-6 "**✅ Variant-allele RC folded**" block (~lines 491-499), append a dated follow-up note recording the tuning:
+
+```markdown
+   **✅ rc_alleles_inplace instruction-tuned (follow-up, 2026-06-26).** The #251
+   `variants::rc_alleles_inplace` kernel was not in the round-3 (#252) target list;
+   this pass fused its row→allele mask expansion and `rc_flat_rows_inplace` delegation
+   into a single pass via the shared `reverse::rc_row` helper, dropping a per-call
+   `Vec<bool>` alloc+memset, an `Array1` wrap, and a redundant full-allele rescan.
+   Instr <before>→<after> (`cargo asm`); variants-path rust÷numba held (noise-dominated
+   path — gated on parity + instr drop + no regression, not throughput improvement);
+   `rc_flat_rows_inplace` asm unchanged after the extract. Byte-identical parity on both
+   backends. Spec/plan: `docs/superpowers/{specs/2026-06-26-rc-alleles-instruction-tuning-design,plans/2026-06-26-rc-alleles-instruction-tuning}.md`.
+```
+
+Fill `<before>→<after>` with the real numbers recorded in Task 3 Step 5.
+
+- [ ] **Step 4: Commit the roadmap**
+
+```bash
+git add docs/roadmaps/rust-migration.md
+git commit -m "docs(roadmap): record rc_alleles_inplace instruction tuning (Target 6 follow-up)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+- [ ] **Step 5: Finish the branch**
+
+Use the `superpowers:finishing-a-development-branch` skill to integrate `opt/rc-alleles-instruction-tuning` into `rust-migration`. Follow the roadmap precedent of per-target PRs into `rust-migration` (e.g. #248/#249/#250); **no squash merge** (per the `no-squash-merges` note — preserve the real commit history).
+
+---
+
+## Notes for the implementer
+
+- **Why no pre-written asm diffs:** the recorded instruction counts are discovered at execution by running `cargo asm` on this build — fabricating them here would be a placeholder. The transformation itself (fuse + shared helper) is fully specified above; the counts are evidence captured during Tasks 2-3.
+- **One logical change per commit** (Task 2 extract, Task 3 fuse) so either is a clean isolated revert if its asm/throughput gate fails.
+- **Ratios over absolutes:** the Carter node is shared; always re-measure numba in the same session as rust and report the ratio.
+- **The reference IS the oracle:** there is no numba `rc_alleles` kernel; the seqpro path is the byte-identical reference. Parity tests compare rust vs that reference.
diff --git a/docs/superpowers/plans/2026-06-26-rust-migration-phase-4-close-out.md b/docs/superpowers/plans/2026-06-26-rust-migration-phase-4-close-out.md
new file mode 100644
index 00000000..ccf92b56
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rust-migration-phase-4-close-out.md
@@ -0,0 +1,488 @@
+# Rust Migration Phase 4 Close-out Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Close out Rust-migration Phase 4 — delete the last dead write-path numba kernel, capture canonical Carter write/update perf + RSS numbers, confirm write-path parity, and reconcile the roadmap to reality (Phase 4 ✅).
+
+**Architecture:** No new Rust kernel. The default `gvl.write()` / `gvl.update()` path is already Rust-backed (bigWig streaming writer + COITrees table engine; variant IO via genoray). The only remaining write-path numba (`splits_sum_le_value`) is reachable solely through `_write_track_legacy`, the dispatch fall-through for custom `IntervalTrack` types — of which there are zero concrete public implementations. We delete it as dead, replace the fall-through with a hard `TypeError`, then measure and document.
+
+**Tech Stack:** Python (pytest, polars, numpy), Rust (PyO3, abi3), pixi (`-e dev`), memray, numba (read-path references only).
+
+## Global Constraints
+
+- Run all dev tasks through `pixi run -e dev <task>` (this worktree has its own fresh pixi env; no symlinked `.pixi`).
+- Dataset tests need pytest's tmp on the same filesystem as `tests/data`: pass `--basetemp=$(pwd)/.pytest_tmp` (HPC `os.link` cross-device Errno 18).
+- Parity must hold byte-identical across **both** backends (`GVL_BACKEND=rust` default and `GVL_BACKEND=numba`).
+- Measurements: `NUMBA_NUM_THREADS=1`, release build (`maturin develop --release` / `pixi run -e dev` release task), Carter HPC (AMD EPYC 7543, linux-64). Report wall-clock + peak RSS (memray).
+- Conventional-commit messages; end commit messages with `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`.
+- Do not touch read-path numba kernels (`padded_slice`, `_assemble_alt_windows`, `apply_site_only_variants`, `_tracks.py` realign) — they are retained Phase-5-deletion references.
+
+---
+
+### Task 1: Delete the dead legacy track path + `splits_sum_le_value`
+
+**Files:**
+- Modify: `python/genvarloader/_dataset/_write.py` (delete `_write_track_legacy` lines 1254-1386; change fall-through at line 1467; drop `splits_sum_le_value` from the import at line 41)
+- Modify: `python/genvarloader/_dataset/_utils.py` (delete `splits_sum_le_value`, lines 165-196)
+- Modify: `tests/unit/test_utils.py` (drop `splits_sum_le_value` from import line 4; delete `test_splits_sum_le_value`, line 63)
+- Modify: `tests/unit/dataset/test_dataset_utils.py` (drop `splits_sum_le_value` from import line 13; delete `test_splits_sum_le_value_docstring_example`, lines 81-82)
+- Modify: `src/lib.rs:54` (stale docstring — bigWig writer emits SoA `starts/ends/values.npy`, not `intervals.npy`)
+- Test: `tests/unit/dataset/test_write.py` (add the new TypeError test; create the file if absent)
+
+**Interfaces:**
+- Consumes: `genvarloader._dataset._write._write_track(out_dir, bed, track, samples, max_mem)` — dispatches `BigWigs`→Rust, `Table`→Rust, else now raises.
+- Produces: `_write_track` raises `TypeError` for any track that is not `BigWigs`/`Table`. No public symbol changes.
+
+- [ ] **Step 1: Write the failing test**
+
+In `tests/unit/dataset/test_write.py` (create if needed):
+
+```python
+from pathlib import Path
+
+import polars as pl
+import pytest
+
+from genvarloader._dataset._write import _write_track
+
+
+def test_write_track_rejects_unsupported_type():
+    """Custom IntervalTrack types are unsupported now that the legacy path is gone."""
+    with pytest.raises(TypeError, match="BigWigs.*Table"):
+        _write_track(Path("/tmp/unused"), pl.DataFrame(), object(), None, 1)
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_write.py::test_write_track_rejects_unsupported_type -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — currently the fall-through calls `_write_track_legacy`, which tries to treat `object()` as a track (AttributeError / different error), not `TypeError`.
+
+- [ ] **Step 3: Replace the fall-through and delete `_write_track_legacy`**
+
+In `python/genvarloader/_dataset/_write.py`, change the last line of `_write_track` (line 1467) from:
+
+```python
+    return _write_track_legacy(out_dir, bed, track, samples, max_mem)
+```
+
+to:
+
+```python
+    raise TypeError(
+        f"Unsupported track type {type(track).__name__!r}; "
+        "tracks must be a genvarloader.BigWigs or genvarloader.Table."
+    )
+```
+
+Then delete the entire `_write_track_legacy` function (lines 1254-1386, from `def _write_track_legacy(` up to but not including `def _write_track_rust(`).
+
+- [ ] **Step 4: Delete `splits_sum_le_value` and its import**
+
+In `python/genvarloader/_dataset/_write.py` line 41, change:
+
+```python
+from ._utils import bed_to_regions, regions_to_bed, splits_sum_le_value
+```
+
+to:
+
+```python
+from ._utils import bed_to_regions, regions_to_bed
+```
+
+In `python/genvarloader/_dataset/_utils.py`, delete the `splits_sum_le_value` function (the `@nb.njit(...)` decorator at line 165 through the end of the function body at line 196). Leave `padded_slice` (lines 37-72) untouched.
+
+- [ ] **Step 5: Delete the two `splits_sum_le_value` unit tests**
+
+In `tests/unit/test_utils.py` line 4, change:
+
+```python
+from genvarloader._dataset._utils import bed_to_regions, splits_sum_le_value
+```
+
+to:
+
+```python
+from genvarloader._dataset._utils import bed_to_regions
+```
+
+and delete the `test_splits_sum_le_value` function (starting line 63).
+
+In `tests/unit/dataset/test_dataset_utils.py`, remove `splits_sum_le_value` from the import block (line 13) and delete `test_splits_sum_le_value_docstring_example` (lines 81-82 and its body).
+
+- [ ] **Step 6: Fix the stale Rust docstring**
+
+In `src/lib.rs:54`, change the comment:
+
+```rust
+/// Write intervals.npy + offsets.npy for a bigWig track directly to `out_dir`.
+```
+
+to:
+
+```rust
+/// Write SoA starts/ends/values.npy + offsets.npy for a bigWig track directly to `out_dir`.
+```
+
+- [ ] **Step 7: Run the new test + the utils tests to verify they pass**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_write.py::test_write_track_rejects_unsupported_type tests/unit/test_utils.py tests/unit/dataset/test_dataset_utils.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (new TypeError test green; no remaining references to `splits_sum_le_value`).
+
+- [ ] **Step 8: Grep to confirm no dangling references**
+
+Run: `grep -rn "splits_sum_le_value\|_write_track_legacy" python/genvarloader/ tests/ --include="*.py"`
+Expected: no matches.
+
+- [ ] **Step 9: Rebuild Rust + run the write-path test slice on both backends**
+
+Run: `pixi run -e dev pytest tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp`
+Then: `GVL_BACKEND=numba pixi run -e dev pytest tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: both green (pre-existing xfails unchanged: `test_e2e_variants`, `test_haps_property` ×2, `test_parse_idx[missing]`, `test_getitem[no_regions]`).
+
+- [ ] **Step 10: Commit**
+
+```bash
+git add python/genvarloader/_dataset/_write.py python/genvarloader/_dataset/_utils.py \
+        tests/unit/test_utils.py tests/unit/dataset/test_dataset_utils.py \
+        tests/unit/dataset/test_write.py src/lib.rs
+git commit -m "refactor(write): delete dead legacy track path + splits_sum_le_value
+
+_write_track_legacy was reachable only via custom IntervalTrack types (none
+exist; IntervalTrack is unexported). Replace the dispatch fall-through with a
+TypeError and drop the last write-path numba kernel (splits_sum_le_value) and
+its tests. Write path is now numba-free. Fix stale SoA docstring in lib.rs.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 2: Realistic write/update measurement driver
+
+**Files:**
+- Create: `tests/benchmarks/profiling/profile_write_realistic.py`
+
+**Interfaces:**
+- Consumes: helpers + constants from `tests/benchmarks/data/build_realistic.py` — `choose_samples()`, `copy_regions()`, `slice_pgen(samples, bed_path)`, `drop_unsupported_variants(pgen)`, and module constants `SAMPLE_MAP`, `BW_CHR22_DIR`. Also `genvarloader.write`/`genvarloader.update`, `genvarloader.BigWigs`, `genoray.PGEN`.
+- Produces: a CLI `python tests/benchmarks/profiling/profile_write_realistic.py --op {write,update}` printing `op=... corpus=chr22_geuv wall=<s>s (...)`. Times only the `gvl.write` / `gvl.update` call (prep runs untimed). Runnable under `memray run` for peak RSS.
+
+This driver exercises the **full Rust write path** (genoray sparse genotypes + the Rust bigWig streaming writer) on the realistic chr22 corpus, and a real per-sample `BigWigs` track add for `update` (replacing the 60-row synthetic annot smoke).
+
+- [ ] **Step 1: Write the driver**
+
+Create `tests/benchmarks/profiling/profile_write_realistic.py`:
+
+```python
+"""Time gvl.write() and a real per-sample BigWigs gvl.update() on the chr22_geuv corpus.
+
+Exercises the full Rust write path (genoray sparse genotypes + Rust bigWig
+streaming writer). Prep (sample choice, plink2 slice) runs untimed; only the
+gvl.write / gvl.update call is measured.
+
+Usage (needs /carter sources or GVL_BENCH_SOURCE bundle):
+    pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op write
+    pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op update
+
+Peak RSS:
+    NUMBA_NUM_THREADS=1 .pixi/envs/dev/bin/memray run -o w.bin \\
+        tests/benchmarks/profiling/profile_write_realistic.py --op write
+    .pixi/envs/dev/bin/memray stats w.bin
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+import polars as pl
+
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from tests.benchmarks.data import build_realistic as br  # noqa: E402
+
+CORPUS_TAG = "chr22_geuv"
+
+
+def _resolve_bigwig_paths(samples: list[str]) -> dict[str, str]:
+    """Resolve per-sample chr22 bigWig paths exactly as build_realistic.build_dataset."""
+    smap = pl.read_csv(br.SAMPLE_MAP)
+    paths: dict[str, str] = {}
+    for sample, full_path in smap.select("sample", "path").iter_rows():
+        if sample not in samples:
+            continue
+        bw = br.BW_CHR22_DIR / Path(full_path).name
+        if not bw.exists():
+            raise SystemExit(f"Missing chr22 bigwig for {sample}: {bw}")
+        paths[sample] = str(bw)
+    assert set(paths) == set(samples), set(samples) - set(paths)
+    return paths
+
+
+def _prep() -> tuple[list[str], Path, Path, dict[str, str]]:
+    """Untimed prep: choose samples, build regions BED, slice + filter PGEN, resolve bigwigs."""
+    samples = br.choose_samples()
+    bed_path = br.copy_regions()
+    pgen = br.slice_pgen(samples, bed_path)
+    pgen = br.drop_unsupported_variants(pgen)
+    paths = _resolve_bigwig_paths(samples)
+    return samples, pgen, bed_path, paths
+
+
+def run_write(out: Path) -> float:
+    import genvarloader as gvl
+    from genoray import PGEN
+
+    samples, pgen, bed_path, paths = _prep()
+    tracks = gvl.BigWigs("read-depth", paths)
+    t0 = time.perf_counter()
+    gvl.write(
+        path=out,
+        bed=bed_path,
+        variants=PGEN(pgen),
+        tracks=tracks,
+        samples=samples,
+        overwrite=True,
+        extend_to_length=False,
+    )
+    return time.perf_counter() - t0
+
+
+def run_update(out: Path) -> tuple[float, str]:
+    import genvarloader as gvl
+    from genoray import PGEN
+
+    samples, pgen, bed_path, paths = _prep()
+    # Build a base dataset (untimed) to update.
+    gvl.write(
+        path=out,
+        bed=bed_path,
+        variants=PGEN(pgen),
+        tracks=gvl.BigWigs("read-depth", paths),
+        samples=samples,
+        overwrite=True,
+        extend_to_length=False,
+    )
+    # Timed: add a SECOND per-sample BigWigs track via update (Rust bigWig writer).
+    add = gvl.BigWigs("read-depth-2", paths)
+    t0 = time.perf_counter()
+    gvl.update(out, tracks=add, max_mem="4g")
+    wall = time.perf_counter() - t0
+    return wall, f"track=read-depth-2 samples={len(samples)}"
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--op", choices=["write", "update"], required=True)
+    args = p.parse_args()
+
+    with tempfile.TemporaryDirectory() as tmp:
+        out = Path(tmp) / "chr22_geuv_bench.gvl"
+        if args.op == "write":
+            wall = run_write(out)
+            print(f"op=write corpus={CORPUS_TAG} wall={wall:.3f}s")
+        else:
+            wall, info = run_update(out)
+            print(f"op=update corpus={CORPUS_TAG} wall={wall:.3f}s ({info})")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+- [ ] **Step 2: Smoke-run the driver (write) to verify it executes**
+
+Run: `NUMBA_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op write`
+Expected: prints `op=write corpus=chr22_geuv wall=<s>s`. If it raises `SystemExit` about missing `/carter` sources, set `GVL_BENCH_SOURCE` to the extracted source bundle and retry; if no source bundle is reachable at all, record that and fall back to the 1kg driver in Task 3 (note the fallback in the roadmap).
+
+- [ ] **Step 3: Smoke-run the driver (update)**
+
+Run: `NUMBA_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op update`
+Expected: prints `op=update corpus=chr22_geuv wall=<s>s (track=read-depth-2 samples=5)`.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add tests/benchmarks/profiling/profile_write_realistic.py
+git commit -m "test(bench): realistic chr22_geuv write/update perf driver
+
+Times gvl.write (PGEN variants + per-sample BigWigs track) and a real
+per-sample BigWigs gvl.update on the chr22_geuv corpus, exercising the full
+Rust write path. Replaces the 60-row synthetic annot smoke for the update gate.
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: Capture the gate — perf + RSS + full-tree parity
+
+**Files:** none (measurement + verification only; outputs feed Task 4).
+
+**Interfaces:**
+- Consumes: `profile_write_realistic.py` (Task 2), `memray`, the dual-backend test tree.
+- Produces: recorded numbers — `write()` wall + peak RSS, `update()` wall + peak RSS (corpus `chr22_geuv`, Carter) — and confirmation that the full tree is green on both backends. These numbers are pasted into the roadmap in Task 4.
+
+- [ ] **Step 1: Ensure a release build**
+
+Run: `pixi run -e dev maturin develop --release`
+Expected: builds clean (abi3).
+
+- [ ] **Step 2: Measure `write()` wall-clock (median of 3)**
+
+Run 3×: `NUMBA_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op write`
+Record the median `wall=` value.
+
+- [ ] **Step 3: Measure `write()` peak RSS under memray**
+
+Run: `NUMBA_NUM_THREADS=1 .pixi/envs/dev/bin/memray run -f -o /tmp/w.bin tests/benchmarks/profiling/profile_write_realistic.py --op write && .pixi/envs/dev/bin/memray stats /tmp/w.bin | grep -i "peak memory"`
+Record peak RSS.
+
+- [ ] **Step 4: Measure `update()` wall-clock (median of 3) + peak RSS**
+
+Run 3×: `NUMBA_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op update` (record median wall).
+Then: `NUMBA_NUM_THREADS=1 .pixi/envs/dev/bin/memray run -f -o /tmp/u.bin tests/benchmarks/profiling/profile_write_realistic.py --op update && .pixi/envs/dev/bin/memray stats /tmp/u.bin | grep -i "peak memory"`
+Record peak RSS.
+
+- [ ] **Step 5: Confirm write-path parity (already-landed differential tests)**
+
+Run: `pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp` and the table/bigwig write tests: `pixi run -e dev pytest -q -k "table or bigwig or write" tests --basetemp=$(pwd)/.pytest_tmp`
+Expected: green (bigWig byte-identical writer test; Table COITrees numpy-oracle + property tests).
+
+- [ ] **Step 6: Full tree, both backends**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Then: `GVL_BACKEND=numba pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: both green except the known pre-existing xfails.
+
+- [ ] **Step 7: cargo + lint/format/typecheck + abi3**
+
+Run:
+```bash
+pixi run -e dev cargo-test
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev ruff format --check python/ tests/
+pixi run -e dev typecheck
+```
+Expected: all clean/green.
+
+- [ ] **Step 8: Record the captured numbers in a scratch note**
+
+Write the four numbers + machine/corpus/HEAD into `docs/superpowers/plans/2026-06-26-phase-4-measurements.md` (a short scratch file) so Task 4 can transcribe them into the roadmap. Commit:
+
+```bash
+git add docs/superpowers/plans/2026-06-26-phase-4-measurements.md
+git commit -m "docs(bench): record Phase 4 Carter write/update perf + RSS
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 4: Reconcile the roadmap + mark Phase 4 ✅
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (Phase 4 section ~lines 600-610; baseline table ~lines 103-108; notes/decisions log)
+- Verify only: `skills/genvarloader/SKILL.md` (expect no change)
+
+**Interfaces:**
+- Consumes: the four measured numbers from Task 3.
+- Produces: Phase 4 marked ✅ with PR link; baseline table updated; a dated decisions-log entry. No code.
+
+- [ ] **Step 1: Rewrite the Phase 4 section**
+
+In `docs/roadmaps/rust-migration.md`, replace the Phase 4 block (`### Phase 4 — Write / update pipeline 🚧` … through its `**Gate:**` line) with a ✅ version that:
+  - marks the phase ✅ and sets `_PR: <link>_` (fill the PR URL when opened);
+  - states that variant normalization is a **user precondition** (`bcftools norm` / `plink2 --normalize`), not GVL work, and strikes it from scope;
+  - states genotype storage / variant IO (genoray `dense2sparse`) is **deferred to Phase 6 (absorb genoray)**;
+  - keeps the two ✅ slices (bigWig streaming writer; Table COITrees);
+  - records that the dead `_write_track_legacy` + `splits_sum_le_value` path was deleted (write path now numba-free; custom `IntervalTrack` types raise `TypeError`);
+  - records the gate result with the Task-3 numbers.
+
+Example replacement text (fill in the measured numbers):
+
+```markdown
+### Phase 4 — Write / update pipeline ✅
+_PR: <PR-URL>_
+
+The default `gvl.write()` / `gvl.update()` path is fully Rust-backed; the write path is numba-free.
+
+- [x] bigWig interval extraction — single-pass streaming Rust writer (SoA `starts/ends/values.npy`).
+- [x] Table + annot overlap — COITrees Rust engine.
+- [x] Deleted the dead `_write_track_legacy` + `splits_sum_le_value` (the last write-path numba),
+      reachable only via custom `IntervalTrack` types (none exist; `IntervalTrack` is unexported).
+      Unsupported track types now raise `TypeError`.
+- **Variant normalization (left-align, bi-allelic, atomize) is NOT GVL work** — it is a user
+  precondition (`bcftools norm` / `plink2 --normalize`); the write path only validates/rejects
+  non-conforming records. Struck from Phase 4 scope.
+- **Genotype storage / variant IO (genoray `dense2sparse`) deferred to Phase 6 (absorb genoray).**
+
+**Gate (parity — MET):** write-path parity = the landed differential tests (bigWig byte-identical;
+Table COITrees numpy-oracle + property). Full tree green on both backends.
+
+**Gate (throughput/RSS — Carter re-baseline, chr22_geuv):**
+
+| Op | corpus | wall-clock | peak RSS |
+|---|---|---|---|
+| `gvl.write()` (PGEN variants + BigWigs track) | chr22_geuv (5 samples × <N> regions, chr22) | <W> s | <R> GB |
+| `gvl.update()` (add per-sample BigWigs track) | chr22_geuv | <W> s | <R> GB |
+
+> Carter HPC (AMD EPYC 7543, linux-64), `NUMBA_NUM_THREADS=1`, release build, HEAD `<hash>`. The
+> write path is already Rust-only (Python/numba orchestration deleted at landing), so there is no
+> live numba A/B; these are the canonical Phase 4 numbers. The old 1.143 s / 3.593 GB write figure
+> was macOS / 1kg-VCF and is **not comparable**.
+```
+
+- [ ] **Step 2: Annotate the old baseline table row**
+
+In the Baseline metrics table (~line 107), update the `gvl.update()` row: replace the "smoke only" TBD note with a pointer to the Phase 4 chr22_geuv update number, and mark the macOS `gvl.write()` row (line 105) as superseded-for-comparison by the Carter chr22_geuv re-baseline.
+
+- [ ] **Step 3: Add a decisions-log entry**
+
+Prepend to the "Notes & decisions log" section:
+
+```markdown
+- 2026-06-26 (Phase 4 close-out; branch `phase-4-close-out`, PR <URL>): Investigation found the
+  default write/update path already fully Rust-backed (bigWig streaming writer + COITrees table;
+  variant IO via genoray). The roadmap's "variant normalization" bullet was a mischaracterization —
+  GVL never normalizes (it is a bcftools/plink2 user precondition); genotype storage is genoray
+  (→ Phase 6). Deleted the only remaining write-path numba (`splits_sum_le_value` + the dead
+  `_write_track_legacy`; unsupported `IntervalTrack` types now `TypeError`). Captured canonical
+  Carter chr22_geuv write/update wall-clock + peak RSS (no live numba A/B — orchestration was
+  deleted at landing). Full tree green both backends; cargo + lint/format/typecheck clean; abi3
+  builds. Phase 4 ✅.
+```
+
+- [ ] **Step 4: Verify the skill needs no update**
+
+Run: `grep -n "write\|update\|IntervalTrack\|BigWigs\|Table" skills/genvarloader/SKILL.md | head`
+Confirm: no public-API claim changed (no exported symbol, signature, or default changed; `IntervalTrack` is unexported). If the skill documents a "custom IntervalTrack" capability, add a one-line note that only `BigWigs`/`Table` are supported. Otherwise no change.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add docs/roadmaps/rust-migration.md skills/genvarloader/SKILL.md
+git commit -m "docs(roadmap): Phase 4 close-out — write path numba-free, gate captured, scope reconciled
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Spec A (delete dead legacy path) → Task 1. ✅
+- Spec B (Carter re-baseline write + real update) → Tasks 2–3. ✅
+- Spec C (parity via landed differential tests) → Task 3 steps 5–6. ✅
+- Spec D (roadmap reconciliation, Phase 4 ✅, genoray→Phase 6, SKILL check) → Task 4. ✅
+- Out-of-scope items (genoray, read-path numba, rayon) are not given tasks. ✅
+
+**Placeholder scan:** Measured numbers (`<W>`, `<R>`, `<hash>`, `<PR-URL>`) are intentional fill-at-runtime values produced by Task 3 / at PR time, not vague instructions — every code step has concrete code. No "TBD/add error handling" placeholders.
+
+**Type consistency:** `_write_track(out_dir, bed, track, samples, max_mem)` signature is used consistently (Task 1 test + dispatch). `profile_write_realistic.py` reuses `build_realistic` helper names verified against the source (`choose_samples`, `copy_regions`, `slice_pgen`, `drop_unsupported_variants`, `SAMPLE_MAP`, `BW_CHR22_DIR`). `gvl.BigWigs(name, paths)` and `gvl.update(path, tracks=...)` match the codebase.
diff --git a/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w2.md b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w2.md
new file mode 100644
index 00000000..bdd33a1c
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w2.md
@@ -0,0 +1,67 @@
+# Rust Migration Phase 5 — PR2 (W2): close out #242 with max_jitter>0 dataset-parity coverage
+
+> **For agentic workers:** executed via superpowers:subagent-driven-development. Steps use `- [ ]`.
+
+**Goal:** The #242 `intervals_to_tracks` store-vs-query divergence was already root-caused and FIXED end-to-end (kernel left-clip `s = max(itv.start - query_start, 0); e = min(end, length)` in both backends, merged via PR #244, ancestor of `rust-migration`; issue #242 CLOSED). The investigation (`.superpowers/sdd/w2-investigation.md`) showed the clip is functionally CORRECT, not merely masking. The ONLY residue is that the dataset-level parity suite still pins `max_jitter=0` with **stale** "PanicException landmine" comments, so numba-vs-rust byte-identity is not gated end-to-end over the jittered-track domain. This PR adds that coverage with a hand-computed oracle and de-stales the comments. **No kernel/write-path changes** (user decision: skip the unnecessary upstream coordinate rewrite).
+
+**Branch:** `phase-5-w2`, stacked on `phase-5-w1` (so roadmap edits don't conflict with the open W1 PR #256).
+
+## Global Constraints
+
+- Byte-identical numba/rust parity is the gate. Test work only — do NOT touch `_intervals.py`, `src/intervals.rs`, the write path, or any kernel.
+- The new dataset-parity case MUST be deterministic across backends: write with `max_jitter > 0` but READ at the default `jitter = 0` (a freshly opened dataset has `jitter=0`, `Deterministic: True`, even when `max_jitter>0`). Random read-jitter would desync the two backend reads — do not enable it.
+- The case MUST genuinely exercise the #242 condition: assert that a stored interval start is strictly LESS than its query start (i.e. `regions.npy` expanded start `< input_regions.arrow` original chromStart) for the fixture, so the test is non-vacuous.
+- Backend switching follows the established pattern in `tests/parity/test_dataset_parity.py`: `monkeypatch.setenv("GVL_BACKEND", "rust"|"numba")` then re-read.
+- pytest commands MUST include `--basetemp=$(pwd)/.pytest_tmp` (os.link Errno 18 otherwise). Rust changes need `maturin develop --release` first — but this PR has NO rust changes.
+- Conventional commits; co-author trailer `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`.
+
+## Empirically verified facts (from the W2 investigation probe)
+- For region chromStart=100, max_jitter=4: `regions.npy[:, :3] = [[0, 96, 114]]`; `input_regions.arrow` chromStart = 100; default `ds.jitter = 0`.
+- Track-only dataset, constant-5.0 BigWig over chr1:[0,1000), region chr1:100-110, max_jitter=4, jitter=0 read → both backends return `[5.]*10` byte-identically; deterministic across re-reads. Stored start 96 < query 100 (condition hit).
+
+---
+
+## Task 1: Add track-only max_jitter>0 dataset-parity + oracle test
+
+**Files:**
+- Modify: `tests/parity/_fixtures.py` — add a `build_track_dataset_jittered(work_dir, max_jitter)` builder: a track-only dataset with a CONTROLLED BigWig (deterministic, hand-computable signal) and `max_jitter > 0`. Reuse the existing `build_track_dataset` pattern but (a) take `max_jitter` and (b) use a BigWig whose signal over each region is exactly known (e.g. a constant value per contig, or a known piecewise-constant pattern) so the expected painted track is hand-computable.
+- Modify: `tests/parity/test_dataset_parity.py` — add `test_tracks_max_jitter_intervals_parity_and_oracle`.
+
+**Test requirements (the new test):**
+- [ ] Build the jittered track-only dataset with `max_jitter = 4` (or similar > 0).
+- [ ] **Non-vacuity / condition guard:** load `regions.npy` and `input_regions.arrow`; assert at least one stored region start (`regions.npy[:,1]`) is strictly `<` the corresponding original `chromStart` (proves the #242 sub-query condition is exercised). Assert `ds.jitter == 0` after open (deterministic read).
+- [ ] Open `Dataset.open(ds_dir).with_tracks("signal")`. Read `ds[:, :]` under `GVL_BACKEND=rust`, then under `GVL_BACKEND=numba`.
+- [ ] **Byte-identity:** `assert_array_equal` on both track `.data` (float32) and `.offsets` (int64) across backends.
+- [ ] **Hand-computed oracle:** for each (region, sample), the expected track is the known BigWig signal over the ORIGINAL region window `[chromStart, chromEnd)` (jitter=0). Assert the rust output equals this oracle exactly. Keep the BigWig signal simple enough to compute in the test (e.g. constant per contig, or a single known interval covering each region).
+- [ ] **Non-triviality:** assert some output value is non-zero (not a vacuous all-zero match).
+
+- [ ] **Step 1 (TDD-ish):** Write the test. It PASSES on the current (fixed) tree — this is regression coverage for a previously-untested domain, not red→green. The non-vacuity guard (stored start < query start + correct nonzero oracle) is the evidence it would have caught the pre-fix bug (which over-padded/wrapped on exactly this condition).
+- [ ] **Step 2:** Run: `pixi run -e dev pytest tests/parity/test_dataset_parity.py::test_tracks_max_jitter_intervals_parity_and_oracle -v --basetemp=$(pwd)/.pytest_tmp`. Expected PASS, both backends compared, oracle matched.
+- [ ] **Step 3:** Commit.
+  ```
+  test(parity): cover max_jitter>0 intervals_to_tracks end-to-end (numba==rust + oracle, #242)
+  ```
+
+## Task 2: De-stale the landmine comments + roadmap + full verification
+
+**Files:**
+- Modify: `tests/parity/_fixtures.py` — fix the stale "PanicException landmine" docstrings on `build_haps_tracks_dataset` and `build_strand_mixed_dataset`. The `max_jitter=0` there is now retained ONLY because those fixtures compare `ds[:,:]` across backends and want the SIMPLEST deterministic geometry — NOT because of any panic (the kernel left-clip fixed #242, PR #244). Rewrite the comment to state the accurate reason and point to the new `test_tracks_max_jitter_intervals_parity_and_oracle` for the max_jitter>0 coverage. Do NOT change `max_jitter=0` in those builders (lifting them would desync nothing since jitter defaults to 0, but it would change output-length geometry and is out of scope — leave the values, fix only the comments).
+- Modify: `tests/parity/test_dataset_parity.py` — fix the identical stale landmine comment block in `test_tracks_realign_getitem_identical_across_backends` (lines ~150-156).
+- Modify: `docs/roadmaps/rust-migration.md` — add a dated Phase 5 W2 entry: #242 was already fixed (clip, PR #244) and is now end-to-end parity-covered at max_jitter>0 (new test); the stale landmine comments were corrected; #242 stays CLOSED; the upstream coordinate rewrite was intentionally skipped (clip is functionally correct per the W2 investigation). Phase 5 stays 🚧 (W3–W9 remain). Reference `.superpowers/sdd/w2-investigation.md`.
+
+- [ ] **Step 1:** Rewrite the three stale comment blocks accurately (no "PanicException"/"landmine"/"violates the contract" language implying a live bug).
+- [ ] **Step 2:** Add the roadmap W2 entry.
+- [ ] **Step 3:** Full parity suite, both backends:
+  - `pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+  - `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+  Expected: green, matching profiles.
+- [ ] **Step 4:** Lint + typecheck: `pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format --check python/ tests/ && pixi run -e dev typecheck`. (No rust → cargo not required, but harmless.)
+- [ ] **Step 5:** Commit.
+  ```
+  docs(parity,roadmap): correct stale #242 landmine comments; record W2 closure
+  ```
+
+---
+
+## Finish (controller, after final review + user confirm)
+- Open PR `phase-5-w2` → base `phase-5-w1` (stacked) OR `rust-migration` if W1 has merged by then. No squash. Reference #242 (keep closed) + the W2 investigation.
diff --git a/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w3.md b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w3.md
new file mode 100644
index 00000000..ce763c21
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w3.md
@@ -0,0 +1,496 @@
+# Rust Migration Phase 5 — PR3 (W3): Fuse the deferred annotated+spliced reconstruction path
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Collapse the last un-fused FFI seam in haplotype reconstruction by adding a fused Rust kernel `reconstruct_annotated_haplotypes_spliced_fused` for the annotated **and** spliced path, wiring it into `_haps.py`, and parity-gating it byte-identically against the composed numba oracle.
+
+**Architecture:** Three of the four annotated×spliced combinations are already fused into single-FFI-crossing Rust kernels (`reconstruct_haplotypes_fused`, `reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused`). The fourth — annotated **and** spliced — was deferred to Phase 5: on the rust backend it currently runs the un-fused dispatched `reconstruct_haplotypes_from_sparse` core and then folds reverse-complement (RC) in a Python post-pass (`_FlatAnnotatedHaps.reverse_masked`). This PR adds the missing fused kernel — a faithful **merge** of the two existing kernels: the spliced scaffolding (precomputed `out_offsets`, permuted ploidy-1 inputs, no `get_diffs_sparse`) from `reconstruct_haplotypes_spliced_fused`, plus the annotation buffers and the in-kernel RC triple from `reconstruct_annotated_haplotypes_fused`. Every primitive it composes (`reconstruct::reconstruct_haplotypes_from_sparse` with `Some` annotation views, `rc_flat_rows_inplace`, `reverse_flat_rows_inplace`) is already cargo-tested and parity-proven, so correctness reduces to wiring + a dataset-level parity gate.
+
+**Tech Stack:** Rust (PyO3/maturin, `ndarray`), Python (NumPy, Polars), pytest parity suite, numba as the differential oracle.
+
+## Global Constraints
+
+- **Byte-identical numba/rust parity is the landing gate.** numba is the oracle and is NOT deleted in this PR (deletion is W5/W6). Every code path must remain comparable across `GVL_BACKEND=numba|rust`.
+- **RC accounting (the parity-critical invariant):** for the spliced path, RC is applied per **permuted element**. On the **numba** backend RC is applied *externally* in `_query.py::_getitem_spliced` (the `if _active_backend() == "numba"` branch). On the **rust** backend the reconstructor must return output that is **already RC'd**, so `_getitem_spliced` treats rust as a no-op. The new fused kernel therefore folds RC *in-kernel*: `rc_flat_rows_inplace` on the sequence bytes (reverse + complement) and `reverse_flat_rows_inplace` on **both** annotation arrays (reverse only, **no** complement). This is byte-identical to `_FlatAnnotatedHaps.reverse_masked(mask, _COMP)` in `python/genvarloader/_flat.py:170-176`.
+- The `to_rc` mask reaching the reconstructor is already in permuted per-element order (`to_rc_per_elem = to_rc_flat[plan.permutation]` from `_getitem_spliced`); pass it straight through. Its length must equal `out_offsets.len() - 1`.
+- **maturin rebuild gotcha:** `pixi run -e dev pytest` does NOT rebuild the Rust extension. After ANY edit under `src/`, run `pixi run -e dev maturin develop --release` before pytest, or pytest imports the stale binary. `cargo test` compiles from source and is unaffected.
+- **All pytest commands MUST include** `--basetemp=$(pwd)/.pytest_tmp` (os.link cross-device Errno 18 on this HPC otherwise).
+- Conventional commits; co-author trailer `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`. No squash on merge; topic branch `phase-5-w3` (off `rust-migration`) → PR into `rust-migration`.
+
+## Reference: the two existing kernels this one merges
+
+- `src/ffi/mod.rs:689-762` `reconstruct_haplotypes_spliced_fused` — takes precomputed `out_offsets`, permuted inputs, ploidy-1 `flat_shifts`/`flat_geno_offset_idx`; allocates only `out_data`; calls the core with `None, None` for the annotation views; RCs sequence bytes in place via `rc_flat_rows_inplace`; returns `out_data` only (caller holds offsets).
+- `src/ffi/mod.rs:789-920` `reconstruct_annotated_haplotypes_fused` — allocates `out_data` + `annot_v` (i32) + `annot_pos` (i32); calls the core with `Some(annot_v.view_mut()), Some(annot_pos.view_mut())`; on RC does `rc_flat_rows_inplace(out_data)` + `reverse_flat_rows_inplace(annot_v)` + `reverse_flat_rows_inplace(annot_pos)`. (It *computes* its own offsets via `get_diffs_sparse`; the spliced kernel does NOT — it receives them.)
+- Python caller to mirror: the non-annotated spliced **rust branch** at `python/genvarloader/_dataset/_haps.py:910-942` shows the exact input prep (`np.ascontiguousarray(...)`, `_as_starts_stops`, `_ffi_array`, `self.ffi_static.*`, `reshape(-1, 1)`, `to_rc` passthrough).
+- Exemplar parity tests: `tests/parity/test_spliced_haplotypes_parity.py` (spy + byte-identity pattern) and `tests/parity/test_haplotypes_dataset_parity.py::test_annotated_haplotypes_mode_dataset_parity` (annotated 3-array comparison via `.haps`/`.var_idxs`/`.ref_coords`).
+
+---
+
+## Task 1: Add the fused `reconstruct_annotated_haplotypes_spliced_fused` kernel, wire it into `_haps.py`, and parity-gate it
+
+**Files:**
+- Modify: `src/ffi/mod.rs` — add `reconstruct_annotated_haplotypes_spliced_fused` (insert after `reconstruct_haplotypes_spliced_fused`, i.e. after line 762).
+- Modify: `src/lib.rs` — register the new pyfunction (after line 44).
+- Modify: `python/genvarloader/_dataset/_haps.py` — add the module-level import (after line 42); rewrite the splice branch of `_reconstruct_annotated_haplotypes` (current lines 1100-1157) to call the fused kernel on the rust backend and drop the Python RC post-pass.
+- Create: `tests/parity/test_annotated_spliced_haplotypes_parity.py` — the parity gate.
+
+**Interfaces:**
+- Produces (Rust → Python FFI): `reconstruct_annotated_haplotypes_spliced_fused(permuted_regions: i32[n,3], flat_shifts: i32[n,1], flat_geno_offset_idx: i64[n,1], out_offsets: i64[n+1], geno_offsets: i64[2,m], geno_v_idxs: i32[], v_starts: i32[], ilens: i32[], alt_alleles: u8[], alt_offsets: i64[], ref_: u8[], ref_offsets: i64[], pad_char: u8, keep: Optional[bool[]], keep_offsets: Optional[i64[]], to_rc: Optional[bool[n]]) -> (out_data: u8[], annot_v: i32[], annot_pos: i32[])`. Note: `out_offsets` is an INPUT (the caller holds the splice plan's `permuted_out_offsets`) and is NOT returned — matching `reconstruct_haplotypes_spliced_fused`.
+
+- [ ] **Step 1: Write the failing parity test**
+
+Create `tests/parity/test_annotated_spliced_haplotypes_parity.py`:
+
+```python
+"""Annotated+spliced haplotypes dataset parity backstop (fused rust entry, Phase 5 W3).
+
+Proves the fused Rust entry ``reconstruct_annotated_haplotypes_spliced_fused`` produces
+byte-identical (haps, var_idxs, ref_coords) output to the composed numba oracle for the
+annotated AND spliced path — including a negative-strand transcript, which exercises the
+in-kernel RC triple (reverse-complement of the sequence bytes + reverse of the two
+annotation arrays, no complement).
+
+Asserts:
+  1. The fused entry actually fires on the rust path and NOT on the numba path (spy).
+  2. All three arrays are byte-identical across backends (haps + var_idxs + ref_coords + offsets).
+  3. RC actually changes the output (rc_neg=True vs rc_neg=False differ) — proves the
+     negative-strand transcript exercises the in-kernel RC path (non-vacuous RC coverage).
+  4. Output is non-trivial (contains non-N bases).
+"""
+
+from __future__ import annotations
+
+from dataclasses import replace
+
+import numpy as np
+import polars as pl
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._haps as _haps_mod
+from genvarloader._ragged import RaggedAnnotatedHaps
+from seqpro.rag import Ragged
+
+pytestmark = pytest.mark.parity
+
+
+def _compare_ragged(numba_out: Ragged, rust_out: Ragged, name: str) -> None:
+    n_data = np.asarray(numba_out.data)
+    r_data = np.asarray(rust_out.data)
+    assert n_data.dtype == r_data.dtype, (
+        f"dtype mismatch for {name}: numba={n_data.dtype}, rust={r_data.dtype}"
+    )
+    np.testing.assert_array_equal(
+        n_data, r_data, err_msg=f"data differs across backends for '{name}'"
+    )
+    np.testing.assert_array_equal(
+        np.asarray(numba_out.offsets, np.int64),
+        np.asarray(rust_out.offsets, np.int64),
+        err_msg=f"offsets differ across backends for '{name}'",
+    )
+
+
+def test_annotated_spliced_haplotypes_parity(phased_svar_gvl, reference, monkeypatch):
+    # --- open in annotated mode, build a spliced dataset with mixed strands inline ---
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("annotated").with_tracks(False)
+
+    n = 4
+    # Group regions 0+1 -> T1 (+ strand), 2+3 -> T2 (- strand). The '-' transcript
+    # exercises the in-kernel RC triple (rc bytes + reverse var_idxs/ref_coords).
+    sub_bed = ds._full_bed[:n].with_columns(
+        pl.Series("transcript_id", ["T1", "T1", "T2", "T2"]),
+        pl.Series("strand", ["+", "+", "-", "-"]),
+    )
+    assert (sub_bed["strand"] == "-").any(), "need a '-' transcript to cover RC"
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced, "Dataset should be in spliced mode"
+
+    # --- spy on the fused annotated-spliced entry ---
+    orig = getattr(_haps_mod, "reconstruct_annotated_haplotypes_spliced_fused", None)
+    assert orig is not None, (
+        "reconstruct_annotated_haplotypes_spliced_fused not found on _haps_mod — "
+        "ensure it is imported at module level in _haps.py"
+    )
+    calls = {"n": 0}
+
+    def _spy(*a, **k):
+        calls["n"] += 1
+        return orig(*a, **k)
+
+    monkeypatch.setattr(
+        _haps_mod, "reconstruct_annotated_haplotypes_spliced_fused", _spy
+    )
+
+    # --- rust read (fused path) ---
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = ds[:, :]
+    rust_calls = calls["n"]
+
+    # --- numba read (composed oracle; spy must NOT fire) ---
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = ds[:, :]
+
+    assert calls["n"] == rust_calls, (
+        "fused annotated-spliced spy fired during the numba read — "
+        "the fused entry is being called on the numba path."
+    )
+    assert rust_calls > 0, (
+        "reconstruct_annotated_haplotypes_spliced_fused was NEVER invoked on the rust "
+        "read — the backstop is vacuous. Ensure _haps._reconstruct_annotated_haplotypes "
+        "calls it on the splice path when GVL_BACKEND=rust."
+    )
+
+    assert isinstance(out_rust, RaggedAnnotatedHaps), type(out_rust)
+    assert isinstance(out_numba, RaggedAnnotatedHaps), type(out_numba)
+
+    # --- non-trivial output ---
+    data_u8 = np.asarray(out_rust.haps.data).view(np.uint8)
+    assert data_u8.size > 0 and np.any(data_u8 != np.uint8(ord("N"))), (
+        "annotated-spliced output is empty or all-N padding — comparison is vacuous."
+    )
+
+    # --- RC non-vacuity: rc_neg flips the '-' transcript output (rust backend) ---
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_norc = ds.with_settings(rc_neg=False)[:, :]
+    assert not np.array_equal(
+        np.asarray(out_rust.haps.data), np.asarray(out_norc.haps.data)
+    ), (
+        "RC made no difference — the negative-strand transcript is not exercising the "
+        "in-kernel RC path (check strand propagation / rc_neg default)."
+    )
+
+    # --- byte-identity across backends on all three arrays ---
+    _compare_ragged(out_numba.haps, out_rust.haps, "annotated-spliced.haps")
+    _compare_ragged(out_numba.var_idxs, out_rust.var_idxs, "annotated-spliced.var_idxs")
+    _compare_ragged(
+        out_numba.ref_coords, out_rust.ref_coords, "annotated-spliced.ref_coords"
+    )
+```
+
+If any attribute used above (`_full_bed`, `is_spliced`, `with_seqs("annotated")`, `with_settings(rc_neg=...)`, `RaggedAnnotatedHaps`, `.haps`/`.var_idxs`/`.ref_coords`) does not exist with these exact names, reconcile against the two exemplar tests in the "Reference" section above — do NOT invent names. (`ds._full_bed` and `ds.is_spliced` are used verbatim in `test_spliced_haplotypes_parity.py:87,92`.)
+
+- [ ] **Step 2: Run the test to verify it fails for the right reason**
+
+Run: `pixi run -e dev pytest tests/parity/test_annotated_spliced_haplotypes_parity.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL at the `orig is not None` assertion (the symbol `reconstruct_annotated_haplotypes_spliced_fused` is not yet imported on `_haps_mod`). This confirms the gate targets the new kernel.
+
+- [ ] **Step 3: Add the fused Rust kernel**
+
+In `src/ffi/mod.rs`, insert immediately after `reconstruct_haplotypes_spliced_fused` (after line 762):
+
+```rust
+/// Fused annotated spliced-haplotype reconstruction: the annotated counterpart of
+/// `reconstruct_haplotypes_spliced_fused`. Reconstructs in one FFI crossing using
+/// precomputed splice output offsets AND fills the two per-nucleotide annotation
+/// arrays (variant index, reference coordinate).
+///
+/// Like the non-annotated splice entry, the Python splice plan already computes the
+/// permutation and `out_offsets` (`splice_plan.permuted_out_offsets`), so this kernel
+/// takes `out_offsets` directly and skips `get_diffs_sparse` / the offset loop.
+///
+/// On `to_rc`, each masked permuted element is reverse-complemented in place
+/// (`rc_flat_rows_inplace` on the sequence bytes) and its annotation rows are reversed
+/// in place (`reverse_flat_rows_inplace`, no complement) — byte-identical to
+/// `_FlatAnnotatedHaps.reverse_masked(mask, _COMP)`.
+///
+/// Returns `(out_data, annot_v, annot_pos)`. `out_offsets` is held by the caller and
+/// not returned (matches `reconstruct_haplotypes_spliced_fused`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_annotated_haplotypes_spliced_fused<'py>(
+    py: Python<'py>,
+    permuted_regions: PyReadonlyArray2<i32>,
+    flat_shifts: PyReadonlyArray2<i32>,
+    flat_geno_offset_idx: PyReadonlyArray2<i64>,
+    out_offsets: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+) -> (
+    Bound<'py, PyArray1<u8>>,
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i32>>,
+) {
+    use crate::reconstruct;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    // out_offsets are precomputed by the Python splice plan — use them directly.
+    let out_offsets_a = out_offsets.as_array();
+    let total = out_offsets_a[out_offsets_a.len() - 1] as usize;
+
+    // Allocate the sequence + annotation buffers.
+    let mut out_data: Array1<u8> = uninit_output(total);
+    let mut annot_v: Array1<i32> = uninit_output(total);
+    let mut annot_pos: Array1<i32> = uninit_output(total);
+
+    // Reconstruct all haplotypes + annotations into the owned buffers (reuses batch core).
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out_data.view_mut(),
+        out_offsets_a,
+        permuted_regions.as_array(),
+        flat_shifts.as_array(),
+        flat_geno_offset_idx.as_array(),
+        go_starts,
+        go_stops,
+        geno_v_idxs.as_array(),
+        v_starts.as_array(),
+        ilens.as_array(),
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        Some(annot_v.view_mut()),   // annot_v_idxs — variant index per nucleotide
+        Some(annot_pos.view_mut()), // annot_ref_pos — reference coordinate per nucleotide
+    );
+
+    // Optional in-place RC per permuted element. Sequence bytes are reverse-complemented;
+    // annotation rows are reversed only (no complement) — matching
+    // _FlatAnnotatedHaps.reverse_masked. out_offsets_a is the permuted per-element
+    // offsets array, so each masked element is transformed in its own byte range.
+    if let Some(to_rc) = to_rc.as_ref() {
+        let m = to_rc.as_array();
+        debug_assert_eq!(
+            m.len(),
+            out_offsets_a.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(out_data.as_slice_mut().unwrap(), out_offsets_a, m);
+        crate::reverse::reverse_flat_rows_inplace(annot_v.as_slice_mut().unwrap(), out_offsets_a, m);
+        crate::reverse::reverse_flat_rows_inplace(annot_pos.as_slice_mut().unwrap(), out_offsets_a, m);
+    }
+
+    (
+        out_data.into_pyarray(py),
+        annot_v.into_pyarray(py),
+        annot_pos.into_pyarray(py),
+    )
+}
+```
+
+Verify against the source: confirm `uninit_output`, `crate::reverse::rc_flat_rows_inplace`, and `crate::reverse::reverse_flat_rows_inplace` are the same symbols used by `reconstruct_annotated_haplotypes_fused` (`src/ffi/mod.rs:875-911`) and that `reconstruct::reconstruct_haplotypes_from_sparse`'s parameter order matches the call in `reconstruct_haplotypes_spliced_fused` (`src/ffi/mod.rs:722-742`). If a helper name differs in your tree, use the name the two reference kernels actually use.
+
+- [ ] **Step 4: Register the pyfunction**
+
+In `src/lib.rs`, after line 44 (`reconstruct_haplotypes_spliced_fused`), add:
+
+```rust
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_annotated_haplotypes_spliced_fused, m)?)?;
+```
+
+- [ ] **Step 5: Import the symbol in `_haps.py`**
+
+In `python/genvarloader/_dataset/_haps.py`, in the extension-import block (after line 42, `reconstruct_haplotypes_spliced_fused as reconstruct_haplotypes_spliced_fused,`), add:
+
+```python
+    reconstruct_annotated_haplotypes_spliced_fused as reconstruct_annotated_haplotypes_spliced_fused,
+```
+
+(Match the existing `import X as X` re-export style used by its siblings in that block.)
+
+- [ ] **Step 6: Rewrite the splice branch of `_reconstruct_annotated_haplotypes`**
+
+Replace the current splice-plan block (`python/genvarloader/_dataset/_haps.py:1100-1157`, from the `# ---- splice plan path ----` comment through the final `return haps_rag, annot_v_rag, annot_pos_rag`) with:
+
+```python
+        # ---- splice plan path ----
+        flat_geno_idx, flat_shifts, permuted_regions, keep_perm, keep_offsets_perm = (
+            self._permute_request_for_splice(req)
+        )
+        splice_plan = req.splice_plan
+        per_elem_shape = (splice_plan.permuted_lengths.shape[0], None)
+        off = splice_plan.permuted_out_offsets
+
+        _backend = os.environ.get("GVL_BACKEND", "rust")
+        if _backend == "rust":
+            # Fused path: one FFI crossing. RC is folded in-kernel (sequence bytes
+            # reverse-complemented, annotation rows reversed), so there is NO Python
+            # reverse_masked post-pass. to_rc is already in permuted per-element order
+            # (from _getitem_spliced), and _getitem_spliced treats the rust output as
+            # already-RC'd (its post-pass is numba-only).
+            _to_rc_spliced = (
+                None if to_rc is None else np.ascontiguousarray(to_rc, np.bool_)
+            )
+            out_buf, annot_v_buf, annot_pos_buf = (
+                reconstruct_annotated_haplotypes_spliced_fused(
+                    permuted_regions=np.ascontiguousarray(permuted_regions, np.int32),
+                    flat_shifts=np.ascontiguousarray(
+                        flat_shifts.reshape(-1, 1), np.int32
+                    ),
+                    flat_geno_offset_idx=np.ascontiguousarray(
+                        flat_geno_idx.reshape(-1, 1), np.int64
+                    ),
+                    out_offsets=np.ascontiguousarray(off, np.int64),
+                    geno_offsets=_as_starts_stops(self.genotypes.offsets),
+                    geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+                    v_starts=self.ffi_static.v_starts,
+                    ilens=self.ffi_static.ilens,
+                    alt_alleles=self.ffi_static.alt_alleles,
+                    alt_offsets=self.ffi_static.alt_offsets,
+                    ref_=self.ffi_static.ref,
+                    ref_offsets=self.ffi_static.ref_offsets,
+                    pad_char=np.uint8(self.reference.pad_char),
+                    keep=None
+                    if keep_perm is None
+                    else np.ascontiguousarray(keep_perm, np.bool_),
+                    keep_offsets=None
+                    if keep_offsets_perm is None
+                    else np.ascontiguousarray(keep_offsets_perm, np.int64),
+                    to_rc=_to_rc_spliced,
+                )
+            )
+        else:
+            # Numba composed oracle path. RC is applied externally in
+            # _getitem_spliced (numba branch), so no to_rc / RC is applied here.
+            total = int(off[-1])
+            out_buf = np.empty(total, np.uint8)
+            annot_v_buf = np.empty(total, V_IDX_TYPE)
+            annot_pos_buf = np.empty(total, np.int32)
+            reconstruct_haplotypes_from_sparse(
+                geno_offset_idx=flat_geno_idx.reshape(-1, 1),
+                out=out_buf,
+                out_offsets=off,
+                regions=permuted_regions,
+                shifts=flat_shifts.reshape(-1, 1),
+                geno_offsets=self.genotypes.offsets,
+                geno_v_idxs=self.genotypes.data,
+                v_starts=self.variants.start,
+                ilens=self.variants.ilen,
+                alt_alleles=self.variants.alt.data.view(np.uint8),
+                alt_offsets=self.variants.alt.offsets,
+                ref=self.reference.reference,
+                ref_offsets=self.reference.offsets,
+                pad_char=self.reference.pad_char,
+                keep=keep_perm,
+                keep_offsets=keep_offsets_perm,
+                annot_v_idxs=annot_v_buf,
+                annot_ref_pos=annot_pos_buf,
+            )
+
+        haps_rag = cast(
+            "Ragged[np.bytes_]",
+            _Flat.from_offsets(out_buf, per_elem_shape, off).view("S1"),
+        )
+        annot_v_rag = cast(
+            "Ragged[V_IDX_TYPE]",
+            _Flat.from_offsets(annot_v_buf, per_elem_shape, off),
+        )
+        annot_pos_rag = cast(
+            "Ragged[np.int32]",
+            _Flat.from_offsets(annot_pos_buf, per_elem_shape, off),
+        )
+        return haps_rag, annot_v_rag, annot_pos_rag
+```
+
+This deletes the old unconditional `reconstruct_haplotypes_from_sparse` call (it now lives only in the numba `else` branch) and the `if ... == "rust" and to_rc is not None: ... reverse_masked(...)` post-pass block (RC is now in-kernel on rust). If removing that block leaves `_FlatAnnotatedHaps` and/or the local `from .._ragged import _COMP` unused in the file, the lint step in Task 2 will catch it — remove the now-dead import(s). Do NOT change `_query.py::_getitem_spliced`: its `if _active_backend() == "numba"` RC guard remains correct (rust output is already RC'd, numba is post-passed there).
+
+- [ ] **Step 7: Rebuild the Rust extension**
+
+Run: `pixi run -e dev maturin develop --release`
+Expected: builds cleanly (the new kernel + registration compile).
+
+- [ ] **Step 8: Run the parity test under both backends**
+
+```bash
+pixi run -e dev pytest tests/parity/test_annotated_spliced_haplotypes_parity.py -v --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS — the spy fires on rust only, RC non-vacuity holds, and all three arrays are byte-identical to numba.
+
+- [ ] **Step 9: Run the broader haplotype parity + reconstruct suites to confirm no regression**
+
+```bash
+pixi run -e dev cargo test --release reconstruct
+pixi run -e dev pytest tests/parity/test_spliced_haplotypes_parity.py tests/parity/test_haplotypes_dataset_parity.py tests/parity/test_annotated_spliced_haplotypes_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity/test_spliced_haplotypes_parity.py tests/parity/test_haplotypes_dataset_parity.py tests/parity/test_annotated_spliced_haplotypes_parity.py -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: all green on both backends; cargo reconstruct tests pass.
+
+- [ ] **Step 10: Commit**
+
+```bash
+rtk git add src/ffi/mod.rs src/lib.rs python/genvarloader/_dataset/_haps.py tests/parity/test_annotated_spliced_haplotypes_parity.py
+rtk git commit -m "feat(rust): fuse annotated+spliced haplotype reconstruction into one FFI crossing (Phase 5 W3)
+
+Add reconstruct_annotated_haplotypes_spliced_fused — the annotated counterpart of
+reconstruct_haplotypes_spliced_fused. Folds RC in-kernel (bytes RC'd, annotation rows
+reversed) so the Python _FlatAnnotatedHaps.reverse_masked post-pass is dropped on the
+rust backend. Byte-identical to the composed numba oracle (new parity backstop).
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Task 2: Resolve the roadmap deferral note + full-tree both-backend verification
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` — update the deferral note (around line 285) and add a dated Phase 5 W3 entry.
+
+- [ ] **Step 1: Update the roadmap**
+
+Find the note (near `docs/roadmaps/rust-migration.md:285`) that reads, in part: "*(The annotated+spliced intersection remains on the unfused dispatched rust core — still parity-gated and rust-by-default — with fusion deferred to Phase 5.)*". Rewrite it to state the intersection is now fused via `reconstruct_annotated_haplotypes_spliced_fused` (one FFI crossing, RC folded in-kernel), byte-identical to the composed numba oracle, covered by `tests/parity/test_annotated_spliced_haplotypes_parity.py`. Then add a dated Phase 5 W3 entry to the Notes & decisions log recording: the fourth (and final) annotated×spliced combination is now fused; all four reconstruction combinations cross the FFI boundary exactly once on the rust backend; numba remains the oracle (deletion is W5/W6); Phase 5 stays 🚧 (W4–W9 remain). Reference the new test and the PR. Do NOT mark Phase 5 ✅.
+
+- [ ] **Step 2: Full parity suite, both backends**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: green on both backends, matching pass/skip profiles.
+
+- [ ] **Step 3: Full tree (catch stale references in tests/unit and tests/dataset), both backends not required but rust must be green**
+
+```bash
+pixi run -e dev pytest tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: green (no stale references to the deleted post-pass / changed branch).
+
+- [ ] **Step 4: Lint, format, typecheck, cargo**
+
+```bash
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev ruff format --check python/ tests/
+pixi run -e dev typecheck
+pixi run -e dev cargo clippy
+```
+Expected: clean. (If Task 1 left `_FlatAnnotatedHaps`/`_COMP` unused, ruff flags it here — remove the dead import and re-run.)
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): record annotated+spliced fusion; all 4 reconstruction combos now single-FFI (Phase 5 W3)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Finish (controller, after final whole-branch review + user confirm)
+
+- Re-verify the load-bearing gate against a fresh `pixi run -e dev maturin develop --release` build (the parity test + full parity suite, both backends) before the final review.
+- Confirm co-author trailers on every commit.
+- File a GVL issue if any follow-up surfaces (e.g. a Minor deferred); otherwise none required.
+- Push `phase-5-w3`; open PR into `rust-migration` (no squash). Reference the W3 plan and the new parity test.
+
+## Self-Review
+
+- **Spec coverage:** PR3's three spec clauses are all covered — "add a fused rust kernel collapsing its remaining FFI crossings (pattern `reconstruct_*_fused`)" → Task 1 Steps 3-6; "parity-gate against the composed numba oracle while numba still exists" → Task 1 Steps 1, 8, 9 (numba branch retained as `else`); "extend the parity suite to cover it" → new `tests/parity/test_annotated_spliced_haplotypes_parity.py`. The deferral note (roadmap) is resolved in Task 2.
+- **Placeholder scan:** every code step contains complete code (the Rust kernel, the Python branch rewrite, the full test). The only deliberately non-transcribed item is the roadmap prose (Task 2 Step 1), which is a documentation edit with the exact target line and required content enumerated.
+- **Type consistency:** the kernel returns `(u8[], i32[], i32[])` with `out_offsets` as input-only — matching `reconstruct_haplotypes_spliced_fused` (offsets in, not returned) and `reconstruct_annotated_haplotypes_fused` (annotation buffers, RC triple). The Python caller wraps the three buffers with the shared `off`/`per_elem_shape`, identical to the deleted code's wrapping. `V_IDX_TYPE` (Python) ↔ `i32` (Rust `annot_v`) match the existing annotated kernels.
diff --git a/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md
new file mode 100644
index 00000000..eaa47a37
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5-w5.md
@@ -0,0 +1,923 @@
+# Phase 5 W5 — Consolidation: golden-snapshot parity, delete numba, add rayon
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Freeze the numba-oracle parity suites to on-disk golden fixtures, delete the entire numba backend (registry, kernels, `GVL_BACKEND`), and add `rayon` batch parallelism to the rust read-path kernels — gated byte-identical throughout.
+
+**Architecture:** Three strictly-ordered stages in one PR (`phase-5-w5` → `rust-migration`), with clean commit boundaries. **Stage A (snapshot)** must run while numba still exists: it captures rust output to committed `.npz` goldens, cross-checked against the numba oracle at generation time, and rewrites every parity test to assert `rust == golden` (importing rust callables *directly*, never via `_dispatch`). **Stage B (delete)** removes all numba now that the parity suite no longer needs it. **Stage C (rayon)** parallelizes the kernels, gated `serial == parallel` byte-identical against the frozen goldens.
+
+**Tech Stack:** Rust (ndarray, PyO3, rayon), Python (numpy, hypothesis for *generation only*), maturin, pytest.
+
+## Global Constraints
+
+- **Branch:** `phase-5-w5`, already cut off `rust-migration @ efb87ea` (W2/W3/W4 merged). Working dir is the main repo (not a worktree).
+- **Byte-identical parity is the landing gate.** Stage A's goldens are the frozen oracle; every later change must keep `rust == golden`.
+- **Generate goldens from rust, cross-checked against numba.** At generation time (numba present), golden := rust output, and the generator asserts `numba == rust` before saving. This makes the frozen point provably equal to the oracle.
+- **Committed parity tests must NOT import `_dispatch`.** Replay imports rust callables directly from the extension/production wrappers, so Stage B's dispatch deletion does not touch the test suite.
+- **maturin rebuild before pytest:** after ANY `src/` edit run `pixi run -e dev maturin develop --release` before pytest, or the stale `.so` is imported. (`cargo test` compiles from source and is exempt.)
+- **All pytest invocations need** `--basetemp=$(pwd)/.pytest_tmp` (os.link Errno 18 on Carter).
+- **Conventional commits** with trailer `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`. Use `rtk` prefix on git commands. No squash.
+- **Rayon gating:** each parallelized kernel takes a `parallel: bool` (computed Python-side via `should_parallelize(...)`); the `else` serial branch stays as the byte-identity reference; thread count comes from rayon's global pool via `RAYON_NUM_THREADS`. Follow the existing `get_reference` idiom in `src/reference/mod.rs:56-120` exactly — `split_at_mut` chain → `Vec<&mut [_]>` → `into_par_iter()`. **Do NOT** put raw `*mut` pointers into a rayon closure (not `Send`; won't compile / unsound to force).
+- **Three commit boundaries** inside the one PR: `snapshot…`, `delete numba…`, `rayon…` (each stage's tasks roll up into its boundary; intermediate task commits are fine).
+
+---
+
+## File Structure
+
+**Stage A — new files:**
+- `tests/parity/_golden.py` — snapshot/replay infrastructure: deterministic example collection, object-array `.npz` save/load, `RUST_KERNELS` name→callable table, replay-assert helpers mirroring the 4 `_harness.py` shapes.
+- `tests/parity/generate_goldens.py` — regeneration driver (run manually while numba present; commits `.npz`). A per-kernel registry table drives it.
+- `tests/parity/golden/*.npz` — committed frozen fixtures (one per kernel/test).
+- `tests/parity/test_import_no_numba.py` — (added Stage B) import-guard.
+
+**Stage A — modified:** every `tests/parity/test_*_parity.py` (convert from cross-backend to golden replay); `tests/parity/_harness.py` (helpers gain golden-replay variants or are superseded by `_golden.py`).
+
+**Stage B — modified:** `python/genvarloader/_dispatch.py` (deleted); the 6 production modules with `get(name)(...)` call sites and `register()` blocks (`_reference.py`, `_intervals.py`, `_genotypes.py`, `_flat_variants.py`, `_rag_variants.py`, `_reconstruct.py`); the backend-conditional branch sites (`_query.py`, `_haps.py`, `_reconstruct.py`, `_tracks.py`, `_reference.py`); the 11 `import numba` files; `_threads.py`, `_ragged.py`, `__init__.py`; `pyproject.toml`, `pixi.toml`.
+
+**Stage C — modified:** `src/reconstruct/mod.rs`, `src/tracks/mod.rs`, `src/genotypes/mod.rs`, `src/intervals.rs`, plus the FFI wrappers in `src/ffi/mod.rs` that gain a `parallel` arg, and the Python callers that pass it; `python/genvarloader/_threads.py` (RAYON_NUM_THREADS); `docs/roadmaps/rust-migration.md`.
+
+---
+
+# STAGE A — Golden snapshot (numba still present)
+
+### Task A1: Golden infrastructure (`_golden.py`)
+
+**Files:**
+- Create: `tests/parity/_golden.py`
+- Create: `tests/parity/golden/.gitkeep`
+- Test: `tests/parity/test_golden_infra.py`
+
+**Interfaces:**
+- Produces:
+  - `GOLDEN_DIR: Path` — `Path(__file__).parent / "golden"`.
+  - `collect_examples(strategy, n: int) -> list` — deterministic draw of `n` examples from a hypothesis strategy (no DB, derandomized).
+  - `save_golden(name: str, cases: list) -> None` — write `GOLDEN_DIR/{name}.npz` as a single object array `cases` (allow_pickle).
+  - `load_golden(name: str) -> list` — read it back.
+  - `RUST_KERNELS: dict[str, Callable]` — kernel-name → rust callable, imported directly (verified against each `register(..., rust=…)` in production).
+  - `replay_return(name, cases)`, `replay_tuple(name, cases)`, `replay_inplace(name, cases, out_factory, out_index)`, `replay_dict(name, cases)` — load-free replay helpers taking pre-loaded `cases`, each asserting `rust(*inputs)` byte-identical to the stored golden (dtype + shape + values), mirroring the 4 `_harness.py` shapes.
+
+- [ ] **Step 1: Write the failing test**
+
+```python
+# tests/parity/test_golden_infra.py
+"""Self-tests for the golden snapshot/replay infrastructure."""
+from __future__ import annotations
+
+import numpy as np
+from hypothesis import strategies as st
+
+from tests.parity import _golden
+
+
+def test_collect_examples_deterministic():
+    s = st.integers(0, 1_000_000)
+    a = _golden.collect_examples(s, 20)
+    b = _golden.collect_examples(s, 20)
+    assert a == b
+    assert len(a) == 20
+
+
+def test_save_load_roundtrip_mixed(tmp_path, monkeypatch):
+    monkeypatch.setattr(_golden, "GOLDEN_DIR", tmp_path)
+    cases = [
+        ((np.arange(3, dtype=np.int32), None, 5), np.arange(3, dtype=np.int32) * 2),
+        ((np.zeros(0, np.uint8),), np.zeros(0, np.uint8)),
+    ]
+    _golden.save_golden("demo", cases)
+    back = _golden.load_golden("demo")
+    assert len(back) == 2
+    np.testing.assert_array_equal(back[0][0][0], cases[0][0][0])
+    assert back[0][0][1] is None
+    assert back[0][0][2] == 5
+
+
+def test_rust_kernels_table_callable():
+    # Every registered name resolves to a real callable imported directly.
+    assert _golden.RUST_KERNELS, "RUST_KERNELS is empty"
+    for name, fn in _golden.RUST_KERNELS.items():
+        assert callable(fn), f"{name} -> {fn!r} not callable"
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `pixi run -e dev pytest tests/parity/test_golden_infra.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — `ModuleNotFoundError: tests.parity._golden`.
+
+- [ ] **Step 3: Write `_golden.py`**
+
+```python
+# tests/parity/_golden.py
+"""Frozen-golden snapshot + replay for the parity suite.
+
+Goldens are generated from the RUST implementation and cross-checked against
+the numba oracle at generation time (see generate_goldens.py). Replay imports
+rust callables DIRECTLY — never via _dispatch — so these tests survive the
+numba/dispatch deletion in Stage B.
+"""
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+
+import numpy as np
+from hypothesis import HealthCheck, Phase, given, settings
+
+GOLDEN_DIR = Path(__file__).parent / "golden"
+
+
+def collect_examples(strategy, n: int) -> list:
+    """Deterministically draw ``n`` examples from a hypothesis strategy.
+
+    Derandomized + no database + generate-only phase ⇒ stable across runs for a
+    fixed hypothesis version. Inputs are frozen INTO the golden, so the replay
+    test never re-runs hypothesis.
+    """
+    out: list = []
+
+    @settings(
+        max_examples=n,
+        derandomize=True,
+        database=None,
+        phases=[Phase.generate],
+        suppress_health_check=list(HealthCheck),
+        deadline=None,
+    )
+    @given(strategy)
+    def _collect(ex):
+        if len(out) < n:
+            out.append(ex)
+
+    _collect()
+    return out
+
+
+def save_golden(name: str, cases: list) -> None:
+    GOLDEN_DIR.mkdir(parents=True, exist_ok=True)
+    np.savez_compressed(GOLDEN_DIR / f"{name}.npz", cases=np.array(cases, dtype=object))
+
+
+def load_golden(name: str) -> list:
+    data = np.load(GOLDEN_DIR / f"{name}.npz", allow_pickle=True)
+    return list(data["cases"])
+
+
+# --- direct rust-callable table -------------------------------------------------
+# Each entry MUST equal the `rust=` argument of the matching register(...) call in
+# production. Verify each against the dispatch map before trusting it.
+def _build_rust_kernels() -> dict[str, Callable]:
+    from genvarloader import genvarloader as _ext  # compiled extension
+
+    table: dict[str, Callable] = {
+        "intervals_to_tracks": _ext.intervals_to_tracks,
+        "tracks_to_intervals": _ext.tracks_to_intervals,
+        "get_diffs_sparse": _ext.get_diffs_sparse,
+        "choose_exonic_variants": _ext.choose_exonic_variants,
+        "gather_alleles": _ext.gather_alleles,
+        "gather_rows_i32": _ext.gather_rows_i32,
+        "gather_rows_f32": _ext.gather_rows_f32,
+        "compact_keep_i32": _ext.compact_keep_i32,
+        "compact_keep_f32": _ext.compact_keep_f32,
+        "fill_empty_scalar_i32": _ext.fill_empty_scalar_i32,
+        "fill_empty_scalar_f32": _ext.fill_empty_scalar_f32,
+        "fill_empty_fixed_i32": _ext.fill_empty_fixed_i32,
+        "fill_empty_fixed_f32": _ext.fill_empty_fixed_f32,
+        "fill_empty_seq_u8": _ext.fill_empty_seq_u8,
+        "fill_empty_seq_i32": _ext.fill_empty_seq_i32,
+        "get_reference": _ext.get_reference,
+        "reconstruct_haplotypes_from_sparse": _ext.reconstruct_haplotypes_from_sparse,
+        "shift_and_realign_tracks_sparse": _ext.shift_and_realign_tracks_sparse,
+        "rc_alleles": _ext.rc_alleles,
+    }
+    # NOTE: kernels whose `rust=` is a PYTHON WRAPPER (not a bare extension fn) —
+    # e.g. assemble_variant_buffers (u8/i32 dtype dispatch). Add those by importing
+    # the SAME wrapper the registration used; ground-truth against the register() call.
+    return table
+
+
+RUST_KERNELS: dict[str, Callable] = _build_rust_kernels()
+
+
+def _eq(name: str, i: int, got, exp) -> None:
+    got = np.asarray(got)
+    exp = np.asarray(exp)
+    assert got.dtype == exp.dtype, f"{name}[{i}]: dtype {got.dtype} != {exp.dtype}"
+    assert got.shape == exp.shape, f"{name}[{i}]: shape {got.shape} != {exp.shape}"
+    np.testing.assert_array_equal(got, exp, err_msg=f"{name}[{i}] value mismatch")
+
+
+def replay_return(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        _eq(f"{name}#{ci}", 0, fn(*inputs), golden)
+
+
+def replay_tuple(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        got = fn(*inputs)
+        got = got if isinstance(got, tuple) else (got,)
+        gold = golden if isinstance(golden, tuple) else (golden,)
+        assert len(got) == len(gold), f"{name}#{ci}: tuple len {len(got)} != {len(gold)}"
+        for j, (a, b) in enumerate(zip(got, gold)):
+            _eq(f"{name}#{ci}", j, a, b)
+
+
+def replay_inplace(name: str, cases: list, out_factory: Callable, out_index: int) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        out = out_factory(inputs)
+        args = list(inputs)
+        args.insert(out_index, out)
+        fn(*args)
+        _eq(f"{name}#{ci}", 0, out, golden)
+
+
+def replay_dict(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        got = fn(*inputs)
+        assert set(got) == set(golden), f"{name}#{ci}: keys {set(got)} != {set(golden)}"
+        for k in sorted(golden):
+            _eq(f"{name}#{ci}:{k}.data", 0, np.asarray(got[k][0]), np.asarray(golden[k][0]))
+            _eq(f"{name}#{ci}:{k}.off", 1,
+                np.asarray(got[k][1], np.int64), np.asarray(golden[k][1], np.int64))
+```
+
+Note: `replay_inplace`'s `out_factory` takes `inputs` (so it can size the out buffer from `total_out` carried in the frozen case — the in-place strategies return `(total_out, inputs)`).
+
+- [ ] **Step 4: Run the self-test**
+
+Run: `pixi run -e dev pytest tests/parity/test_golden_infra.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS (3 tests). If `RUST_KERNELS` raises on a missing extension symbol, ground-truth that symbol's name against `src/lib.rs` and the matching `register()` call.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add tests/parity/_golden.py tests/parity/test_golden_infra.py tests/parity/golden/.gitkeep
+rtk git commit -m "test(parity): golden snapshot/replay infrastructure (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task A2: Golden generator + freeze kernel-level goldens
+
+**Files:**
+- Create: `tests/parity/generate_goldens.py`
+- Create: `tests/parity/golden/<kernel>.npz` (committed artifacts)
+- Test: regeneration is the test (the generator asserts numba==rust per case).
+
+**Interfaces:**
+- Consumes: `_golden.{collect_examples,save_golden,RUST_KERNELS}`, `strategies.*`, `genvarloader._dispatch.backends` (numba oracle — generation-time only).
+- Produces: one `.npz` per kernel-level test, plus an `output_adapter` per kernel that normalizes `(numba_out, rust_out)` to comparable form and produces the stored golden.
+
+**Kernel registry table (drives the generator).** Each row: kernel name, strategy factory, output shape (`return`/`tuple`/`inplace`/`dict`), N examples. Ground-truth the strategy names against `tests/parity/strategies.py` and each kernel's argument count against its existing `test_*_parity.py`.
+
+| Golden name | Strategy | Shape | N |
+|---|---|---|---|
+| `intervals_to_tracks` | `intervals_to_tracks_inputs()` | inplace (out_index per existing test) | 200 |
+| `get_diffs_sparse` | `get_diffs_sparse_inputs()` | tuple | 200 |
+| `choose_exonic_variants` | `choose_exonic_variants_inputs()` | tuple | 200 |
+| `gather_rows_i32` | `gather_rows_inputs(np.int32)` | tuple | 100 |
+| `gather_rows_f32` | `gather_rows_inputs(np.float32)` | tuple | 100 |
+| `gather_alleles` | `gather_alleles_inputs()` | tuple | 100 |
+| `compact_keep_i32` | `compact_keep_inputs(np.int32)` | tuple | 100 |
+| `compact_keep_f32` | `compact_keep_inputs(np.float32)` | tuple | 100 |
+| `fill_empty_scalar_i32` | `fill_empty_scalar_inputs(np.int32)` | tuple | 100 |
+| `fill_empty_scalar_f32` | `fill_empty_scalar_inputs(np.float32)` | tuple | 100 |
+| `fill_empty_fixed_i32` | `fill_empty_fixed_inputs(np.int32)` | tuple | 100 |
+| `fill_empty_fixed_f32` | `fill_empty_fixed_inputs(np.float32)` | tuple | 100 |
+| `fill_empty_seq_u8` | `fill_empty_seq_inputs(np.uint8)` | tuple | 100 |
+| `fill_empty_seq_i32` | `fill_empty_seq_inputs(np.int32)` | tuple | 100 |
+| `tracks_to_intervals` | `tracks_to_intervals_inputs()` | tuple | 200 |
+| `get_reference` | `get_reference_inputs()` | return | 200 |
+| `shift_and_realign_tracks_sparse` | `shift_and_realign_tracks_inputs()` | inplace (out_index 0; case carries `total_out`) | 200 |
+| `reconstruct_haplotypes_from_sparse` | `reconstruct_haplotypes_inputs()` | inplace (out_index 0; case carries `total_out`) | 200 |
+
+(`rc_alleles`, `assemble_variant_buffers`, and the PRNG functions are handled in A4/A5 — non-standard shapes/fixtures.)
+
+- [ ] **Step 1: Write `generate_goldens.py`**
+
+```python
+# tests/parity/generate_goldens.py
+"""Regenerate frozen golden fixtures for the parity suite.
+
+RUN MANUALLY while numba is still installed (Stage A):
+    pixi run -e dev python -m tests.parity.generate_goldens
+
+For each kernel: draw N deterministic examples, compute the golden from RUST,
+and assert the numba oracle agrees BEFORE saving. After numba deletion this
+script still regenerates from rust (the numba cross-check is skipped if the
+backend is gone).
+"""
+from __future__ import annotations
+
+import numpy as np
+
+from genvarloader import _dispatch
+from tests.parity import _golden, strategies
+
+# (name, strategy, shape, n, extra) — see plan table. `inplace` carries an
+# out_factory/out_index; the strategy returns (total_out, inputs) for those.
+RETURN, TUPLE, INPLACE = "return", "tuple", "inplace"
+
+SPEC = [
+    ("get_diffs_sparse", strategies.get_diffs_sparse_inputs(), TUPLE, 200, None),
+    ("get_reference", strategies.get_reference_inputs(), RETURN, 200, None),
+    # ... fill in remaining rows from the plan table ...
+]
+
+# in-place kernels: strategy yields (total_out, inputs); out inserted at index 0.
+INPLACE_SPEC = [
+    ("intervals_to_tracks", strategies.intervals_to_tracks_inputs(), 200,
+     lambda inp: np.zeros(int(inp[-1][-1]), np.float32), 7),  # out_index per existing test
+    ("shift_and_realign_tracks_sparse", strategies.shift_and_realign_tracks_inputs(), 200,
+     lambda total_out: np.zeros(total_out, np.float32), 0),
+    ("reconstruct_haplotypes_from_sparse", strategies.reconstruct_haplotypes_inputs(), 200,
+     lambda total_out: np.zeros(total_out, np.uint8), 0),
+]
+
+
+def _normalize(out):
+    if isinstance(out, tuple):
+        return tuple(np.asarray(x) for x in out)
+    if isinstance(out, dict):
+        return {k: (np.asarray(v[0]), np.asarray(v[1])) for k, v in out.items()}
+    return np.asarray(out)
+
+
+def _assert_oracle(name, a, b):
+    # numba (a) vs rust (b) — both already normalized
+    if isinstance(a, tuple):
+        assert len(a) == len(b)
+        for x, y in zip(a, b):
+            np.testing.assert_array_equal(x, y, err_msg=f"{name} oracle mismatch")
+    elif isinstance(a, dict):
+        assert set(a) == set(b)
+        for k in a:
+            np.testing.assert_array_equal(a[k][0], b[k][0])
+            np.testing.assert_array_equal(np.asarray(a[k][1], np.int64),
+                                          np.asarray(b[k][1], np.int64))
+    else:
+        np.testing.assert_array_equal(a, b, err_msg=f"{name} oracle mismatch")
+
+
+def _have_numba(name):
+    try:
+        _dispatch.backends(name)
+        return True
+    except Exception:
+        return False
+
+
+def gen_value_kernels():
+    for name, strat, shape, n, _ in SPEC:
+        examples = _golden.collect_examples(strat, n)
+        rust = _golden.RUST_KERNELS[name]
+        nb = _dispatch.backends(name)[0] if _have_numba(name) else None
+        cases = []
+        for inp in examples:
+            r = _normalize(rust(*inp))
+            if nb is not None:
+                _assert_oracle(name, _normalize(nb(*inp)), r)
+            cases.append((inp, r))
+        _golden.save_golden(name, cases)
+        print(f"  {name}: {len(cases)} cases")
+
+
+def gen_inplace_kernels():
+    for name, strat, n, out_factory, out_index in INPLACE_SPEC:
+        examples = _golden.collect_examples(strat, n)
+        rust = _golden.RUST_KERNELS[name]
+        nb = _dispatch.backends(name)[0] if _have_numba(name) else None
+        cases = []
+        for ex in examples:
+            # strategy returns (total_out, inputs) for shift/reconstruct;
+            # intervals_to_tracks returns the inputs tuple directly.
+            if isinstance(ex, tuple) and len(ex) == 2 and np.isscalar(ex[0]):
+                total_out, inputs = ex
+                of = lambda _inp, t=total_out: out_factory(t)
+            else:
+                inputs = ex
+                of = out_factory
+            out_r = of(inputs)
+            args = list(inputs); args.insert(out_index, out_r); rust(*args)
+            if nb is not None:
+                out_n = of(inputs)
+                an = list(inputs); an.insert(out_index, out_n); nb(*an)
+                np.testing.assert_array_equal(out_n, out_r, err_msg=f"{name} oracle")
+            cases.append((inputs, np.asarray(out_r)))
+        _golden.save_golden(name, cases)
+        print(f"  {name}: {len(cases)} cases")
+
+
+if __name__ == "__main__":
+    print("Generating value-kernel goldens...")
+    gen_value_kernels()
+    print("Generating in-place-kernel goldens...")
+    gen_inplace_kernels()
+    print("Done.")
+```
+
+Fill in the full `SPEC` list from the plan table. Ground-truth `intervals_to_tracks`'s `out_index` and out dtype/shape against its existing `test_intervals_to_tracks_parity.py` (it uses `assert_inplace_kernel_parity`).
+
+- [ ] **Step 2: Generate the goldens**
+
+Run: `pixi run -e dev python -m tests.parity.generate_goldens`
+Expected: prints each kernel's case count; **no oracle-mismatch assertion**. If a mismatch fires, that is a real numba/rust divergence on a generated input — STOP and investigate per the numba-oracle-bug policy (check whether numba is the buggy one) before freezing.
+
+- [ ] **Step 3: Verify the goldens are non-trivial**
+
+Run: `pixi run -e dev python -c "from tests.parity import _golden; import numpy as np; c=_golden.load_golden('get_reference'); print(len(c), np.asarray(c[0][1]).shape)"`
+Expected: 200 and a non-empty shape.
+
+- [ ] **Step 4: Commit (goldens + generator)**
+
+```bash
+rtk git add tests/parity/generate_goldens.py tests/parity/golden/*.npz
+rtk git commit -m "test(parity): freeze kernel-level golden fixtures (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task A3: Convert kernel-level parity tests to golden replay
+
+**Files:**
+- Modify: all kernel-level `tests/parity/test_*_parity.py` (the ~14 using `_dispatch.backends` via `_harness`).
+- Test: the converted tests themselves.
+
+**Interfaces:**
+- Consumes: `_golden.{load_golden, replay_return, replay_tuple, replay_inplace, replay_dict}`.
+
+**Conversion pattern (apply to every kernel-level test).** Replace the `@given(strategy)` + `assert_kernel_parity*` body with a one-shot golden replay. Example — `test_get_diffs_sparse_parity.py`:
+
+- [ ] **Step 1: Rewrite one test as the reference conversion**
+
+```python
+# tests/parity/test_get_diffs_sparse_parity.py
+"""get_diffs_sparse: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_get_diffs_sparse_golden():
+    cases = _golden.load_golden("get_diffs_sparse")
+    assert cases, "empty golden"
+    _golden.replay_tuple("get_diffs_sparse", cases)
+```
+
+- [ ] **Step 2: Run it (rust backend)**
+
+Run: `pixi run -e dev pytest tests/parity/test_get_diffs_sparse_parity.py -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 3: Convert the remaining kernel-level tests** following the same pattern, choosing the matching replay helper:
+  - `replay_tuple`: get_diffs_sparse, choose_exonic_variants, gather_rows (i32/f32), gather_alleles, compact_keep (i32/f32), fill_empty_scalar/fixed/seq (all dtype variants), tracks_to_intervals.
+  - `replay_return`: get_reference.
+  - `replay_inplace`: intervals_to_tracks (out_index/out_factory from its old test), shift_and_realign_tracks_sparse, reconstruct_haplotypes_from_sparse.
+  - For multi-dtype files (e.g. `test_flat_variants_parity.py` covering many fill/gather kernels), one `test_<kernel>_golden()` per golden name.
+  - Delete the now-unused `@given`, `strategies` imports, and `_harness`/`_dispatch` imports from each converted file.
+
+- [ ] **Step 4: Run all converted kernel-level tests (rust)**
+
+Run: `pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp -k "golden"`
+Expected: all PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add tests/parity/
+rtk git commit -m "test(parity): replay kernel-level parity against frozen goldens (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task A4: Snapshot + convert dataset-level (`GVL_BACKEND`-flip) tests
+
+**Files:**
+- Modify: `generate_goldens.py` (add dataset-golden generation), `_golden.py` (add `save/load` for Ragged-shaped outputs if needed).
+- Modify: `test_dataset_parity.py`, `test_haplotypes_dataset_parity.py`, `test_spliced_haplotypes_parity.py`, `test_annotated_spliced_haplotypes_parity.py`, `test_fused_haps_parity.py`, `test_fused_tracks_parity.py`, `test_reference_dataset_parity.py`, `test_reference_fetch_parity.py`, `test_variants_dataset_parity.py` (all `GVL_BACKEND`-flip tests).
+- Create: `tests/parity/golden/ds_*.npz`.
+
+**Conversion pattern.** Each test currently: builds a deterministic dataset (session fixtures `phased_svar_gvl`, `build_*` seeded) → reads `ds[r,s]` under numba and rust → compares. Convert to: snapshot the agreed output's constituent arrays to `.npz` (generated while numba present, cross-checked) → test reads `ds[r,s]` under rust only → compares against golden. **Keep the spy guards** (they prove the rust kernel fires; still valid). **Delete** the `monkeypatch.setenv("GVL_BACKEND", ...)` flips and the numba read.
+
+- [ ] **Step 1: Add a dataset-output serializer to `_golden.py`**
+
+```python
+def flatten_output(out):
+    """Serialize a dataset __getitem__ result to a dict of arrays for golden storage.
+
+    Handles Ragged (.data/.offsets), RaggedAnnotatedHaps (.haps/.var_idxs/.ref_coords),
+    plain ndarray, and tuples thereof. Returns a JSON-able structure of np arrays.
+    """
+    import numpy as np
+    from seqpro.rag import Ragged
+    from genvarloader._ragged import RaggedAnnotatedHaps
+
+    if isinstance(out, RaggedAnnotatedHaps):
+        return {"kind": "annot",
+                "haps": (np.asarray(out.haps.data), np.asarray(out.haps.offsets, np.int64)),
+                "var_idxs": (np.asarray(out.var_idxs.data), np.asarray(out.var_idxs.offsets, np.int64)),
+                "ref_coords": (np.asarray(out.ref_coords.data), np.asarray(out.ref_coords.offsets, np.int64))}
+    if isinstance(out, Ragged):
+        return {"kind": "ragged",
+                "data": np.asarray(out.data), "offsets": np.asarray(out.offsets, np.int64)}
+    if isinstance(out, tuple):
+        return {"kind": "tuple", "items": [flatten_output(o) for o in out]}
+    return {"kind": "array", "data": np.asarray(out)}
+
+
+def assert_output_matches_golden(out, golden) -> None:
+    """Assert a fresh dataset output equals a flattened golden (byte-identical)."""
+    got = flatten_output(out)
+    assert got["kind"] == golden["kind"], f"kind {got['kind']} != {golden['kind']}"
+    # ... recursively compare arrays via _eq ... (mirror flatten_output structure)
+```
+
+(Implement the recursive comparison in `assert_output_matches_golden` mirroring `flatten_output`'s branches.)
+
+- [ ] **Step 2: Add dataset-golden generation to `generate_goldens.py`**
+
+For each dataset test, build the same fixture/dataset the test uses, read `ds[r,s]` under **numba** and **rust** (env flip — generation time only), assert equal, then `save_golden("ds_<testname>", flatten_output(rust_out))`. Use a `gen_dataset_goldens()` function driven by a small table of `(golden_name, build_fn, index)`.
+
+- [ ] **Step 3: Convert one dataset test as the reference** — `test_haplotypes_dataset_parity.py`:
+
+```python
+def test_haplotypes_mode_dataset_golden(phased_svar_gvl, reference, monkeypatch):
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference).with_seqs("haplotypes")
+    # spy guard stays — proves the fused rust kernel fires
+    orig = _haps_mod.reconstruct_haplotypes_fused
+    calls = {"n": 0}
+    def _spy(*a, **k):
+        calls["n"] += 1
+        return orig(*a, **k)
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_fused", _spy)
+
+    out_rust = ds[:, :]
+    assert calls["n"] > 0, "fused rust kernel never fired — vacuous"
+    # non-triviality + golden compare
+    _golden.assert_output_matches_golden(out_rust, _golden.load_flat_golden("ds_haplotypes_mode"))
+```
+
+(`load_flat_golden` = `load_golden` returning the single flattened dict; add a thin variant or store as a 1-element `cases` list.)
+
+- [ ] **Step 4: Regenerate dataset goldens + run**
+
+```bash
+pixi run -e dev python -m tests.parity.generate_goldens
+pixi run -e dev maturin develop --release   # only if src changed (it didn't here)
+pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: all PASS on rust.
+
+- [ ] **Step 5: Convert remaining dataset tests + commit** (same pattern; keep each spy guard; drop the env flips).
+
+```bash
+rtk git add tests/parity/
+rtk git commit -m "test(parity): replay dataset-level parity against frozen goldens (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task A5: Snapshot + convert PRNG direct-import tests; Stage-A gate
+
+**Files:**
+- Modify: `test_prng_parity.py`, `test_rc_alleles_parity.py`, `test_assemble_variant_buffers_parity.py`.
+- Create: `tests/parity/golden/prng_*.npz`, `rc_alleles.npz`, `assemble_variant_buffers.npz`.
+
+- [ ] **Step 1: Freeze PRNG tables.** In `generate_goldens.py`, add a `gen_prng()` that builds a table of `(input → numba _xorshift64/_hash4 output)` over a deterministic input list, asserts the rust `_debug_*` equals it, and saves. Convert `test_prng_parity.py` to load the table and assert rust `_debug_xorshift64`/`_hash4` == frozen output (no numba import).
+
+- [ ] **Step 2: Freeze `rc_alleles` + `assemble_variant_buffers`.** These use bespoke strategies/fixed arrays (see their existing tests). Add generation entries (rust golden + numba cross-check) and convert the tests to replay. For `assemble_variant_buffers` (dict-returning, dtype-dispatched wrapper), add its rust wrapper to `RUST_KERNELS` and use `replay_dict`.
+
+- [ ] **Step 3: Regenerate everything + full parity suite gate**
+
+```bash
+pixi run -e dev python -m tests.parity.generate_goldens
+pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: entire `tests/parity` green on the default rust backend.
+
+- [ ] **Step 4: Prove no committed parity test imports `_dispatch`**
+
+Run: `rtk grep -rn "_dispatch\|GVL_BACKEND\|_harness" tests/parity/test_*.py`
+Expected: **no matches** in committed test files (allowed only in `generate_goldens.py`). Fix any stragglers.
+
+- [ ] **Step 5: Cross-check goldens still equal numba one final time** (the generator already asserts this; re-run to confirm clean), then commit the snapshot stage boundary.
+
+```bash
+rtk git add tests/parity/
+rtk git commit -m "test(parity): freeze PRNG/rc_alleles/assemble goldens; Stage-A snapshot complete (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+# STAGE B — Delete numba
+
+> Goldens now guard rust independently of numba. Safe to delete.
+
+### Task B1: Replace dispatched call sites with direct rust; delete the registry
+
+**Files:**
+- Delete: `python/genvarloader/_dispatch.py`
+- Modify: `_reference.py`, `_intervals.py`, `_genotypes.py`, `_flat_variants.py`, `_rag_variants.py`, `_reconstruct.py` (22 `get(name)(...)` call sites + 20 `register()` blocks).
+
+**Interfaces:**
+- Consumes: the dispatch map (kernel name → rust symbol) from the W5 investigation. Each `get("name")(args)` becomes a direct call to the rust callable that `register(name, rust=…)` named.
+
+- [ ] **Step 1:** For each of the 22 call sites, replace `get("kernel")(args)` with the direct rust callable (already imported at module scope as `_<kernel>_rust` or `from ..genvarloader import <kernel>`). Delete the paired `register(...)` block. Use the dispatch investigation's "replace-with-rust-symbol" column as the authority; verify each rust symbol is already imported in that module (it is — both backends were imported for registration).
+- [ ] **Step 2:** Delete `python/genvarloader/_dispatch.py` and every `from .._dispatch import ...` / `import genvarloader._dispatch` line (including the `# noqa: F401 — triggers register(...)` import lines in any remaining non-parity modules). ALSO delete the now-dead test infra that depended on `_dispatch`: `tests/parity/_harness.py` (the old cross-backend assert helpers — fully superseded by `_golden.py`) and `tests/parity/test_harness_tuple.py` (its meta-test, the only remaining `_harness` consumer). Confirm no other file imports `_harness` before deleting.
+- [ ] **Step 2b (test-infra spy rewrite — REQUIRED, else dataset goldens go vacuous):** `tests/parity/_golden.py::make_kernel_spy` currently spies by MUTATING the dispatch registry (`_disp.register(name, rust=spy, …)`). Once Step 1 makes call sites direct, registry mutation intercepts nothing — the spy never fires and the dataset tests' `assert calls["n"] > 0` guards fail. Rewrite `make_kernel_spy` to monkeypatch the DIRECT rust symbol at its production call site (the module-level name the converted call site now uses — e.g. `_genotypes.reconstruct_haplotypes_from_sparse`, `_tracks.shift_and_realign_tracks_sparse`, etc.), mirroring how the fused-path spies already monkeypatch `_haps_mod.reconstruct_*_fused`. It must remain a counting wrapper returning a `restore()`. Remove the function-local `from genvarloader import _dispatch` import. Verify each converted dataset test's spy still fires (`calls["n"] > 0`) after the rewrite.
+- [ ] **Step 3: Rebuild + run the read-path tests**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests/parity tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (goldens + dataset/unit). A `KeyError: no kernel registered` or `ModuleNotFoundError: _dispatch` means a missed call site — fix it.
+- [ ] **Step 4: Commit.**
+
+---
+
+### Task B2: Collapse backend-conditional branches; delete `GVL_BACKEND`
+
+**Files:**
+- Modify: `_query.py` (delete `_active_backend()` + the two `if _active_backend()=="numba"` RC post-pass branches — keep the rust in-kernel-RC behavior), `_haps.py` (4 `if _backend=="rust"` fused-vs-composed forks → keep fused), `_reconstruct.py` (2 forks → keep fused), `_reference.py` (3 backend branches → keep rust: always call `get_reference` with the 7-arg rust signature incl. `to_rc`; drop the numba post-pass), `_tracks.py` (2 `if ...=="rust"` RC post-pass branches → now unconditional).
+
+**Critical:** the RC accounting must stay byte-identical. On rust, RC is folded in-kernel; the deleted numba branches were the *external* post-pass. Removing the `=="numba"` branch and keeping the rust path is correct **only if** the rust path already RC's in-kernel — which the W3/earlier work established. The goldens enforce this.
+
+- [ ] **Step 1:** Delete `_active_backend()` and every `os.environ.get("GVL_BACKEND")` / `== "numba"` / `== "rust"` branch, keeping the rust arm inline. For `_reference.py:get_reference()`, drop the 6-vs-7-arg conditional — always pass `to_rc`.
+- [ ] **Step 2: Rebuild + run the full read path + the strand/RC-heavy goldens**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests/parity tests/dataset tests/unit -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS — especially the spliced/annotated/strand-mixed dataset goldens (the RC-sensitive ones).
+- [ ] **Step 3: Commit.**
+
+---
+
+### Task B3: Delete numba kernels + imports; refactor `_threads.py` and `_ragged.py`
+
+**Files:**
+- Modify (delete `@njit`/`@nb.vectorize` bodies + `import numba`): `_flat_variants.py`, `_genotypes.py`, `_intervals.py`, `_reference.py`, `_tracks.py`, `_flat.py`, `_flat_flanks.py`, `_dataset/_utils.py`, `_variants/_sitesonly.py`, `_ragged.py`, `_threads.py` (28 njit + 1 vectorize total).
+- Refactor: `_threads.py` (OS thread detection, no numba), `_ragged.py` (keep `_COMP`, drop `@nb.vectorize` on `ufunc_comp_dna`), `__init__.py` (rename/adjust the `cap_numba_threads()` call).
+
+- [ ] **Step 1: Refactor `_threads.py`** to drop numba:
+
+```python
+# python/genvarloader/_threads.py
+from __future__ import annotations
+import os
+
+_MIN_BYTES_PER_THREAD = 1 << 20  # 1 MiB
+_NUM_THREADS: int | None = None
+
+
+def _detect_cpus() -> int:
+    try:
+        return max(1, len(os.sched_getaffinity(0)))  # respects cgroup cpuset (Linux)
+    except AttributeError:
+        return max(1, os.cpu_count() or 1)
+
+
+def _resolve_num_threads() -> int:
+    env = os.environ.get("GVL_NUM_THREADS")
+    if env:
+        try:
+            return max(1, int(env))
+        except ValueError:
+            pass
+    return _detect_cpus()
+
+
+def cap_threads() -> int:
+    """Resolve worker count once and pin rayon's pool via RAYON_NUM_THREADS.
+
+    Must run before the first rust parallel call (rayon reads RAYON_NUM_THREADS
+    at global-pool init). Idempotent.
+    """
+    global _NUM_THREADS
+    if _NUM_THREADS is None:
+        _NUM_THREADS = _resolve_num_threads()
+        os.environ.setdefault("RAYON_NUM_THREADS", str(_NUM_THREADS))
+    return _NUM_THREADS
+
+
+def num_threads() -> int:
+    return cap_threads()
+
+
+def should_parallelize(total_bytes: int) -> bool:
+    return total_bytes >= num_threads() * _MIN_BYTES_PER_THREAD
+```
+
+Update `__init__.py`: replace the `cap_numba_threads()` call with `cap_threads()` (keep it at import so `RAYON_NUM_THREADS` is set before any read). Update `_reference.py`'s `should_parallelize` import if the call signature changed (it didn't).
+
+- [ ] **Step 2: `_ragged.py`** — remove the `@nb.vectorize` decorator and the `import numba as nb`. Keep `_COMP`. If `ufunc_comp_dna` is still referenced, replace it with a plain numpy LUT apply (`_COMP[arr]`); if unused after numba deletion, delete it. Ground-truth its usages first.
+
+- [ ] **Step 2b (PRODUCTION numba fallbacks — REPLACE with numpy, do NOT delete):** Four wrappers in `_flat_variants.py` route int32/float32 to typed rust cores but fall back to a numba kernel for **arbitrary dtypes** (custom VCF FORMAT fields, issue #231 — "values are never silently down-cast"): `_gather_rows` → `_gather_rows_numba`, `_compact_keep` → `_compact_keep_numba`, `_fill_empty_scalar` → `_fill_empty_scalar_numba`, `_fill_empty_fixed` → `_fill_empty_fixed_numba`. These are **live production paths**, NOT dead code — deleting them regresses #231. Replace each `_*_numba` fallback with a pure-numpy, dtype-preserving implementation (these are simple ragged ops: per-row gather by `geno_offset_idx`/offsets; compact by boolean `keep` mask per row; fill empty rows with a dummy/scalar). Keep the i32/f32 rust fast paths. **Gate:** the 4 dtype-regression tests in `test_flat_variants_parity.py` (`test_gather_rows_dtype_regression`, `test_compact_keep_dtype_regression`, `test_fill_empty_scalar_dtype_regression`, `test_fill_empty_fixed_dtype_regression`, which exercise int16/int64) must still pass — they are the numpy replacements' correctness gate. (`test_fill_empty_seq_dtype_regression` already uses int32 → rust; unaffected.) Do this BEFORE Step 3's blanket deletion so the fallbacks have replacements.
+
+- [ ] **Step 3:** Delete every remaining `@nb.njit` body and `import numba`/`import numba as nb` across the 9 kernel modules — **except the 4 production fallbacks handled in Step 2b** (those are now numpy, no `@njit`). For helper njit functions only used by other njit functions (e.g. `reconstruct_haplotype_from_sparse`, `_xorshift64`, `_hash4`, `padded_slice`, `_get_reference_row`), delete them too — rust owns these paths now. Verify nothing non-numba still imports them (grep each symbol).
+
+- [ ] **Step 4: Rebuild + full tree**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev typecheck
+```
+Expected: full tree green; no `import numba` remains (`rtk grep -rn "import numba\|@nb\.\|@numba\.\|nb.prange" python/` → no matches).
+- [ ] **Step 5: Commit.**
+
+---
+
+### Task B4: Drop numba/llvmlite deps; import-guard; Stage-B gate
+
+**Files:**
+- Modify: `pyproject.toml` (remove `numba>=…`; remove `@nb.njit`/`@numba.njit` coverage exclusions; remove the `parity: byte-identical numba-vs-rust` marker description if it names numba), `pixi.toml` (remove `numba = "==0.59.1"` from the py310 feature and any other env).
+- Create: `tests/parity/test_import_no_numba.py`.
+
+**RELAXED GUARD (user decision 2026-06-27):** `import genvarloader` still pulls numba+llvmlite transitively via seqpro 0.20.0 (eager numba import in seqpro itself), which genvarloader cannot control. So the guard asserts genvarloader's OWN source is numba-free (achievable + verified), NOT the whole import graph. A seqpro follow-up issue tracks the eager import (it blocks the full W6 RSS drop).
+
+- [ ] **Step 1: Write the own-code import-guard test**
+
+```python
+# tests/parity/test_import_no_numba.py
+"""genvarloader's OWN modules must not import numba (Phase 5 W5).
+
+NOTE: `import genvarloader` may still pull numba transitively via seqpro
+(seqpro 0.20.0 eagerly imports numba). That is outside genvarloader's control;
+this guard asserts genvarloader's own source is numba-free. See the seqpro
+follow-up issue for the transitive import and the W6 RSS impact.
+"""
+from __future__ import annotations
+
+import pathlib
+
+import genvarloader
+
+
+def test_genvarloader_own_code_imports_no_numba():
+    pkg_dir = pathlib.Path(genvarloader.__file__).parent
+    offenders: list[str] = []
+    for py in pkg_dir.rglob("*.py"):
+        for ln, line in enumerate(py.read_text().splitlines(), 1):
+            s = line.strip()
+            if s.startswith("import numba") or s.startswith("from numba"):
+                offenders.append(f"{py.relative_to(pkg_dir)}:{ln}: {s}")
+    assert not offenders, "genvarloader modules import numba:\n" + "\n".join(offenders)
+```
+
+- [ ] **Step 2: Run it (expect PASS — B3 already removed all numba from genvarloader), then drop genvarloader's DIRECT numba dep**
+
+Run: `pixi run -e dev pytest tests/parity/test_import_no_numba.py -q --basetemp=$(pwd)/.pytest_tmp` → PASS.
+Then remove genvarloader's OWN `numba` dependency from `pyproject.toml` and `pixi.toml` (genvarloader no longer uses it directly). NOTE: numba will likely remain INSTALLED in the env because seqpro depends on it — that is expected and fine; the own-code guard does not require numba to be absent from the environment. Re-solve (`pixi install`) and confirm the env still builds. Do NOT remove numba if doing so breaks the seqpro dependency solve — if seqpro pins numba, just remove genvarloader's direct declaration and leave the transitive one.
+
+- [ ] **Step 3: Full tree + guard gate**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+pixi run -e dev cargo test --release
+```
+Expected: full tree green; import-guard PASS; cargo green.
+- [ ] **Step 4: Commit the delete-numba stage boundary.**
+
+```bash
+rtk git commit -am "feat: delete numba backend — rust-only read path (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+# STAGE C — Rayon batch parallelism
+
+> Each kernel gains a `parallel: bool`; the serial branch is the byte-identity reference. Gate every kernel: `serial == parallel` and both `== golden`.
+
+### Task C1: Parallelize `reconstruct_haplotypes_from_sparse`
+
+**Files:**
+- Modify: `src/reconstruct/mod.rs` (the `for k in 0..n_work` loop, lines 312-388), `src/ffi/mod.rs` (the FFI wrappers that call it — add a `parallel` arg, thread it through the 4 fused entries), the Python callers in `_haps.py`/`_reconstruct.py`/`_genotypes.py` (pass `should_parallelize(total_out_bytes)`).
+- Test: `tests/parity/test_rayon_equivalence.py` (new) — serial vs parallel byte-identity over the frozen goldens.
+
+**Interfaces:**
+- The core fn gains `parallel: bool`. Use the `get_reference` idiom: pre-carve the three output buffers (`out`, optional `annot_v_idxs`, optional `annot_ref_pos`) into disjoint per-`k` chunks via `split_at_mut` chains, then `chunks.into_par_iter().enumerate().for_each(...)`. **Do not** move raw `*mut` pointers into the closure — carve `&mut [_]` slices (which are `Send`).
+
+- [ ] **Step 1: Write the failing rayon-equivalence test**
+
+```python
+# tests/parity/test_rayon_equivalence.py
+"""Serial vs parallel rust output must be byte-identical (and == golden)."""
+from __future__ import annotations
+import numpy as np
+import pytest
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_reconstruct_haplotypes_serial_eq_parallel():
+    cases = _golden.load_golden("reconstruct_haplotypes_from_sparse")
+    fn = _golden.RUST_KERNELS["reconstruct_haplotypes_from_sparse"]
+    for ci, (inputs, golden) in enumerate(cases):
+        outs = {}
+        for parallel in (False, True):
+            out = np.zeros(golden.shape, golden.dtype)
+            args = list(inputs)
+            args.insert(0, out)
+            fn(*args, parallel=parallel)  # signature gains keyword `parallel`
+            outs[parallel] = out
+        np.testing.assert_array_equal(outs[False], outs[True], err_msg=f"case {ci}")
+        np.testing.assert_array_equal(outs[True], golden, err_msg=f"case {ci} vs golden")
+```
+
+(If the FFI signature passes `parallel` positionally, adjust the call. Decide the FFI arg convention and keep it consistent across kernels.)
+
+- [ ] **Step 2: Run — expect FAIL** (`parallel` kwarg not accepted yet).
+- [ ] **Step 3: Implement** the `parallel` branch in `reconstruct_haplotypes_from_sparse` (chunk-carve the 3 buffers, `into_par_iter`), thread `parallel` through `src/ffi/mod.rs` (the bare entry + the 4 fused entries that wrap the core), and pass `should_parallelize(...)` from the Python callers. `use rayon::prelude::*;` is already imported in `reference/mod.rs`; add it to `reconstruct/mod.rs`.
+- [ ] **Step 4: Rebuild + run** the new test + the reconstruct golden + the haps dataset goldens.
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev cargo test --release reconstruct
+pixi run -e dev pytest tests/parity -q --basetemp=$(pwd)/.pytest_tmp
+```
+Expected: PASS (serial==parallel==golden).
+- [ ] **Step 5: Commit.**
+
+---
+
+### Task C2: Parallelize the track kernels
+
+**Files:**
+- Modify: `src/tracks/mod.rs` (`shift_and_realign_tracks_sparse` outer `for query` loop at 470; `tracks_to_intervals` Pass 1 @569 and Pass 2 @615 — parallelize each pass, keep the sequential cumsum between), `src/ffi/mod.rs` (+ `intervals_and_realign_track_fused`), Python callers (`_reconstruct.py`, `_intervals.py`).
+- Test: extend `test_rayon_equivalence.py` with `shift_and_realign_tracks_sparse` and `tracks_to_intervals`.
+
+- [ ] **Step 1:** Add serial-vs-parallel cases for both kernels (load their goldens, run `parallel` False/True, assert equal + == golden).
+- [ ] **Step 2:** Implement `parallel` in each, using the chunk-carve idiom (outer-query parallelism). For `tracks_to_intervals`, parallelize Pass 1 and Pass 2 independently; the cumsum stays serial.
+- [ ] **Step 3: Rebuild + run** the new cases + track goldens + `cargo test --release tracks`.
+- [ ] **Step 4: Commit.**
+
+---
+
+### Task C3: Parallelize `get_diffs_sparse` + `intervals_to_tracks`
+
+**Files:**
+- Modify: `src/genotypes/mod.rs` (`get_diffs_sparse` outer `for query` @27), `src/intervals.rs` (`intervals_to_tracks` `for query` @45), FFI + Python callers.
+- Test: extend `test_rayon_equivalence.py`.
+
+- [ ] **Step 1–4:** Same recipe: add serial-vs-parallel golden cases, implement `parallel` (outer-query par; `get_diffs_sparse` writes disjoint `diffs[[query,hap]]` cells — carve per-query or use a parallel row iterator over the 2D array), rebuild, run goldens + `cargo test --release`, commit.
+
+(`get_reference` is already parallel — no work.)
+
+---
+
+### Task C4: Roadmap + Stage-C gate
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (tick W5/W6/W7 tasks; add a dated Notes entry: numba deleted, golden snapshot scheme, rayon kernels; set Phase 5 marker — leave 🚧 until PR6/W8-W9 measure-and-merge; record PR placeholder for backfill).
+
+- [ ] **Step 1: Full-tree final gate**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp
+pixi run -e dev cargo test --release
+pixi run -e dev ruff check python/ tests/ && pixi run -e dev ruff format --check python/ tests/
+pixi run -e dev typecheck
+pixi run -e dev cargo clippy --release
+```
+Expected: all green; import-guard green; serial==parallel across all kernels.
+- [ ] **Step 2:** Update the roadmap; commit the rayon stage boundary.
+
+```bash
+rtk git commit -am "perf(rust): rayon batch parallelism, gated byte-identical (Phase 5 W5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+## Self-Review
+
+- **Spec coverage:** (a) golden snapshot → Tasks A1–A5 (infra, generate, convert all 3 mechanisms, gate, no-`_dispatch` proof). (b) delete numba → B1–B4 (dispatch, conditionals, kernels+imports, deps+import-guard). (c) rayon → C1–C4 (reconstruct, tracks, diffs/intervals, gate). The "neither numba nor llvmlite imported" assertion is B4. The `parallel:bool`+`RAYON_NUM_THREADS` gating is C1 + B3's `_threads.py`.
+- **Placeholder scan:** the per-kernel `SPEC` list in A2 and the "convert remaining tests" steps are data-driven repetitions of a fully-shown pattern (DRY), not placeholders — each names the exact strategy, shape, and replay helper. The rust kernel bodies in Stage C are referenced by file:line with the canonical `get_reference` idiom shown verbatim, rather than transcribed (they are 80+ lines and would go stale).
+- **Type consistency:** `RUST_KERNELS` (name→callable), `collect_examples`/`save_golden`/`load_golden`, and the four `replay_*` helpers are defined in A1 and consumed unchanged in A3–A5 and C1–C3. `should_parallelize`/`cap_threads`/`num_threads` defined in B3 and consumed in C1–C3. `parallel: bool` FFI convention chosen in C1 and reused in C2–C3.
+- **Risks flagged for the controller:** (1) `RUST_KERNELS` has a few Python-wrapper kernels (`assemble_variant_buffers`, possibly `get_reference`/`shift_and_realign_tracks`/`reconstruct_haplotypes_from_sparse`) whose `rust=` is not a bare extension symbol — the implementer must ground-truth each against its `register()` call. (2) `collect_examples` determinism depends on the pinned hypothesis version; goldens are regenerated only intentionally. (3) Stage B's RC-branch collapse is the parity-critical step — the strand/spliced/annotated dataset goldens are its gate. (4) Rayon `Send`: carve `&mut [_]` slices, never raw `*mut` in the closure.
diff --git a/docs/superpowers/plans/2026-06-26-rust-migration-phase-5.md b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5.md
new file mode 100644
index 00000000..9c301c2c
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-rust-migration-phase-5.md
@@ -0,0 +1,325 @@
+# Rust Migration Phase 5 Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Finish the Rust migration's Phase 5 — fix the remaining numba/rust correctness divergences, fuse the last deferred read path, freeze the numba oracle as golden fixtures, delete numba, add rayon, and merge `rust-migration → main` once a final `__getitem__` benchmark shows rust at parity-or-better.
+
+**Architecture:** Phase 5 is a strict sequential pipeline of distinct PRs into the `rust-migration` integration branch. Correctness fixes (W1, W2) and the fusion (W3) must land **while numba still exists** as the differential oracle; the final numba-vs-rust verdict (W4) must be captured **before** deletion; only then is it safe to golden-snapshot (W5) and delete numba (W6), add rayon (W7), measure RSS (W8), and merge (W9). **This document fully specifies PR1 (W1).** PR2–PR6 (W2–W9) are scoped at the end and each gets its own detailed plan written at its turn — W2 in particular requires a coordinate-math investigation whose root cause is not yet known and therefore cannot be bite-sized in advance.
+
+**Tech Stack:** Rust (ndarray, PyO3, rayon), Python (numpy, numba — being deleted), pixi (`-e dev`), maturin, pytest + hypothesis, cargo test, memray, pytest-benchmark.
+
+## Global Constraints
+
+- Spec: `docs/superpowers/specs/2026-06-26-rust-migration-phase-5-design.md`. Roadmap (source of truth, must be updated): `docs/roadmaps/rust-migration.md` (Phase 5).
+- Byte-identical parity is the landing gate for every kernel change; numba is the oracle until W6 deletes it (W5 freezes it to golden fixtures first).
+- Benchmark parity verdict is **single-thread**: `NUMBA_NUM_THREADS=1`, rayon threads=1, `maturin develop --release`, corpus `chr22_geuv.gvl` (format 2.0), Carter HPC (AMD EPYC 7543, linux-64). Node is shared/noisy — use within-session ratios + pedantic min; the durable signal is parity + the recorded instruction-count reductions.
+- Dataset/parity tests on the HPC need `--basetemp=$(pwd)/.pytest_tmp` (numba write path's `os.link` fails cross-device, Errno 18).
+- Numba-oracle-bug policy: a numba-vs-rust divergence where numba is buggy gets an issue + an isolated fix PR + un-exclusion from parity. W1 and W2 follow this.
+- Per-kernel rust core lives in `src/`; PyO3 only in `src/ffi/`. No `unsafe` unless justified by a profile.
+- Commits: conventional-commit style; no squash on the final merge (preserve history). Co-author trailer on commits:
+  `Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>`.
+
+---
+
+## PR1 (W1): Fix the haplotype/track trailing-fill divergence in BOTH kernels
+
+**Why this is "fix both," not "fix numba to match rust":** reading the actual code, *neither* kernel is correct in the overshoot sub-domain (a deletion drives `ref_idx` past the contig end with output still unfilled). The roadmap's "rust is correct" was an assertion about an untested, parity-excluded sub-domain. Concretely, with `ref=[1,2,3,4]`, a deletion at pos 2 with `ilen=-5` (so `v_ref_end = 2+5+1 = 8`), `out_len=8`, `pad_char=0`:
+
+- Correct output: ref consumed `[1,2]`, allele `[50]`, then **ref is exhausted** → pad the entire tail → `[1,2,50,0,0,0,0,0]`.
+- Current **numba** (`_genotypes.py:508`): `writable_ref = min(5, 4-8) = -4`, `out_end_idx = 3 + (-4) = -1`; `out[3:-1] = ref[8:4]` is a numpy shape mismatch inside njit → SystemError / unwritten tail (the bug).
+- Current **rust** (`src/reconstruct/mod.rs:245`): `out_end_idx = (3 + (-4)).max(0) = 0`; then `out[0..8] = pad` → `[0,0,0,0,0,0,0,0]` — **overwrites the valid prefix** `[1,2,50]`.
+
+**The fix (both kernels):** when `ref` is exhausted (`writable_ref <= 0`), clamp `out_end_idx` to `out_idx` (not 0) so the right-pad fills exactly the unfilled tail `out[out_idx:length]`. In numba this is `writable_ref = max(0, min(unfilled_length, len(ref) - ref_idx))`. The same latent pattern exists in the track-realign kernels (`_tracks.py:396` numba) — apply the identical clamp.
+
+**Files:**
+- Modify: `src/reconstruct/mod.rs:208-260` (rust haplotype trailing-fill; the `else` branch at 240-246) + its in-module test block.
+- Modify: `python/genvarloader/_dataset/_genotypes.py:508` (numba haplotype singular kernel).
+- Modify: `python/genvarloader/_dataset/_tracks.py:396` (numba track singular kernel).
+- Verify/Modify: rust track-realign trailing-fill in `src/tracks*` (check for the same `.max(0)` pattern).
+- Test (new): `tests/unit/dataset/test_reconstruct_trailing_fill.py` (numba + rust correctness, deterministic).
+- Test (new): `src/reconstruct/mod.rs` cargo unit test `overshoot_ref_past_contig`.
+- Modify: `tests/parity/test_reconstruct_haplotypes_parity.py` (remove the 3 exclusion guards once the divergence is gone).
+- Check: `tests/parity/test_shift_and_realign_tracks_parity.py`, `tests/parity/test_dataset_parity.py`, `tests/parity/strategies.py`, `tests/parity/_fixtures.py` for analogous overshoot/`max_jitter` exclusions tied to this divergence.
+
+**Interfaces:**
+- Consumes: `reconstruct_haplotype_from_sparse(v_idxs, v_starts, ilens, shift, alt_alleles, alt_offsets, ref, ref_start, out, pad_char, keep=None, annot_v_idxs=None, annot_ref_pos=None)` — numba singular kernel, `@nb.njit(nogil=True, cache=True)`, directly importable from `genvarloader._dataset._genotypes`.
+- Produces: no signature changes. Behavior change only: overshoot inputs now produce full-tail-pad output, byte-identical across numba and rust.
+
+### Task 1: Characterize the rust overshoot bug (cargo, failing test)
+
+**Files:**
+- Test: `src/reconstruct/mod.rs` (add to the `#[cfg(test)] mod tests` block, alongside `deletion`/`del_spanning_ref_start`).
+
+- [ ] **Step 1: Write the failing cargo test**
+
+Add next to the existing `run(...)`-helper tests (the helper signature is
+`run(v_idxs, v_starts, ilens, shift, alt_alleles, alt_offsets, ref, ref_start, out_len, pad_char, keep, annotate)`):
+
+```rust
+// -------------------------------------------------------------------------
+// Case: deletion drives ref_idx past the contig end (overshoot).
+// ref = [1,2,3,4] (len 4), ref_start=0, out_len=8.
+// variant at pos=2, ilen=-5, allele=[50] (anchor).
+//   v_ref_end = 2 - min(0,-5) + 1 = 8  → ref_idx advances to 8 (> len 4).
+// Processing: ref[0..2]=[1,2], allele=[50] → out_idx=3.
+// Final clause: unfilled=5, ref exhausted (writable_ref = min(5, 4-8) = -4 <= 0).
+// CORRECT: no ref left → pad the whole tail → [1,2,50,0,0,0,0,0].
+// (Pre-fix rust over-pads from index 0 → all zeros.)
+// -------------------------------------------------------------------------
+#[test]
+fn overshoot_ref_past_contig() {
+    let (out, _av, _ap) = run(
+        &[0],
+        &[2],          // v_pos=2
+        &[-5],         // ilen=-5 (deletion past contig end)
+        0,             // shift
+        &[50u8],       // anchor allele
+        &[0i64, 1],
+        &[1, 2, 3, 4], // ref, len 4
+        0,             // ref_start
+        8,             // out_len
+        0,             // pad_char
+        None,
+        false,
+    );
+    assert_eq!(out, vec![1, 2, 50, 0, 0, 0, 0, 0]);
+}
+```
+
+- [ ] **Step 2: Run the test to verify it FAILS**
+
+Run: `pixi run -e dev cargo test --lib reconstruct::tests::overshoot_ref_past_contig`
+Expected: FAIL — actual `[0, 0, 0, 0, 0, 0, 0, 0]` (rust over-pads from index 0).
+
+- [ ] **Step 3: Commit the failing test**
+
+```bash
+rtk git add src/reconstruct/mod.rs
+rtk git commit -m "test(reconstruct): pin correct full-tail-pad on ref overshoot (failing)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+### Task 2: Fix the rust trailing-fill clamp
+
+**Files:**
+- Modify: `src/reconstruct/mod.rs:240-246` (the `else` branch) + the stale comments at 211-218.
+
+- [ ] **Step 1: Apply the clamp-to-`out_idx` fix**
+
+Replace the `else` branch (currently `(out_idx + writable_ref).max(0)`) so an exhausted ref pads exactly the unfilled tail:
+
+```rust
+        } else {
+            // writable_ref <= 0: ref exhausted (ref_idx at/after contig end).
+            // No reference bytes remain to copy, so the entire unfilled tail
+            // out[out_idx..length] must be padded. Clamp out_end_idx to out_idx
+            // (NOT 0) so the right-pad below fills exactly out[out_idx..length]
+            // and never overwrites already-written positions.
+            out_idx
+        };
+```
+
+Also fix the now-inaccurate comment block at lines 211-218 (it describes mirroring numpy's negative-index behavior, which was the bug). Replace with a one-line note that the tail is padded when ref is exhausted.
+
+- [ ] **Step 2: Run the cargo test to verify it PASSES**
+
+Run: `pixi run -e dev cargo test --lib reconstruct::tests::overshoot_ref_past_contig`
+Expected: PASS — `[1, 2, 50, 0, 0, 0, 0, 0]`.
+
+- [ ] **Step 3: Run the full rust suite (no regressions)**
+
+Run: `pixi run -e dev cargo-test`
+Expected: all pass (the existing `deletion`, `del_spanning_ref_start`, etc. are unaffected — they never overshoot).
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add src/reconstruct/mod.rs
+rtk git commit -m "fix(reconstruct): pad full tail when ref exhausted, not from index 0
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+### Task 3: Characterize + fix the numba haplotype/track kernels
+
+**Files:**
+- Test: `tests/unit/dataset/test_reconstruct_trailing_fill.py` (new).
+- Modify: `python/genvarloader/_dataset/_genotypes.py:508`.
+- Modify: `python/genvarloader/_dataset/_tracks.py:396`.
+
+- [ ] **Step 1: Write the failing numba correctness test**
+
+```python
+"""Correctness of the trailing-fill clause when a deletion exhausts the contig.
+
+The overshoot sub-domain (ref_idx past contig end with output unfilled) was
+historically excluded from parity because numba and rust diverged AND both were
+wrong. Correct behavior: pad the entire unfilled tail (no reference left).
+"""
+
+import numpy as np
+
+from genvarloader._dataset._genotypes import reconstruct_haplotype_from_sparse
+
+
+def test_overshoot_pads_full_tail():
+    # ref=[1,2,3,4], deletion at pos 2 (ilen=-5) -> ref_idx advances to 8 (>4).
+    # out_len=8: [1,2] ref + [50] allele, then ref exhausted -> pad rest with 0.
+    out = np.full(8, 255, dtype=np.uint8)  # 0xFF sentinel: catches unwritten positions
+    reconstruct_haplotype_from_sparse(
+        np.array([0], dtype=np.int32),        # v_idxs
+        np.array([2], dtype=np.int32),        # v_starts
+        np.array([-5], dtype=np.int32),       # ilens
+        0,                                    # shift
+        np.array([50], dtype=np.uint8),       # alt_alleles
+        np.array([0, 1], dtype=np.int64),     # alt_offsets
+        np.array([1, 2, 3, 4], dtype=np.uint8),  # ref
+        0,                                    # ref_start
+        out,                                  # out
+        0,                                    # pad_char
+    )
+    np.testing.assert_array_equal(out, np.array([1, 2, 50, 0, 0, 0, 0, 0], dtype=np.uint8))
+```
+
+- [ ] **Step 2: Run to verify it FAILS**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_reconstruct_trailing_fill.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: FAIL — numba leaves the tail unwritten (0xFF sentinel leaks through) or raises a numpy shape error inside the njit kernel.
+
+- [ ] **Step 3: Apply the numba clamp (haplotype kernel)**
+
+In `python/genvarloader/_dataset/_genotypes.py:508`, clamp the available ref to be non-negative so an exhausted ref yields `out_end_idx == out_idx` and the right-pad fills the whole tail:
+
+```python
+        writable_ref = max(0, min(unfilled_length, len(ref) - ref_idx))
+```
+
+- [ ] **Step 4: Apply the same clamp to the numba track kernel**
+
+In `python/genvarloader/_dataset/_tracks.py:396`:
+
+```python
+        writable_ref = max(0, min(unfilled_length, len(track) - track_idx))
+```
+
+- [ ] **Step 5: Run the numba test to verify it PASSES**
+
+Run: `pixi run -e dev pytest tests/unit/dataset/test_reconstruct_trailing_fill.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS — `[1, 2, 50, 0, 0, 0, 0, 0]`.
+
+- [ ] **Step 6: Commit**
+
+```bash
+rtk git add python/genvarloader/_dataset/_genotypes.py python/genvarloader/_dataset/_tracks.py tests/unit/dataset/test_reconstruct_trailing_fill.py
+rtk git commit -m "fix(reconstruct,tracks): pad full tail in numba trailing-fill on ref overshoot
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+### Task 4: Verify the rust track-realign kernel + un-exclude parity
+
+**Files:**
+- Verify/Modify: rust track trailing-fill (search `src/` for the analog).
+- Modify: `tests/parity/test_reconstruct_haplotypes_parity.py`.
+- Check: `tests/parity/test_shift_and_realign_tracks_parity.py`, `tests/parity/test_dataset_parity.py`, `tests/parity/strategies.py`, `tests/parity/_fixtures.py`.
+
+- [ ] **Step 1: Verify the rust track kernel has no `.max(0)` over-pad**
+
+Run: `pixi run -e dev grep -n "max(0)\|writable_ref\|out_end" src/tracks.rs src/intervals.rs`
+If the track-realign trailing-fill uses the same `(out_idx + writable_ref).max(0)` pattern, apply the identical `out_idx` clamp + add a cargo test mirroring Task 1. If it already clamps to `out_idx` (or has no negative-`writable_ref` path), record that in the commit message and skip.
+
+- [ ] **Step 2: Remove the now-obsolete exclusion guards from the haplotype parity test**
+
+In `tests/parity/test_reconstruct_haplotypes_parity.py`, delete:
+- the `_ref_idx_overshoots_contig(...)` helper and both `assume(not _ref_idx_overshoots_contig(inputs))` calls (Guard 1),
+- the `_numba_fully_defined(...)` double-init helper and `assume(defined)` calls (Guard 3),
+- the `try/except SystemError: assume(False)` wrapper (Guard 2).
+
+The body simplifies to: run numba into `out_n`, run rust into `out_r`, `np.testing.assert_array_equal`. (Both kernels now fully write every position byte-identically across the full generated domain, including overshoot.)
+
+- [ ] **Step 3: Run the haplotype parity suite (both backends, full domain)**
+
+Run: `pixi run -e dev pytest tests/parity/test_reconstruct_haplotypes_parity.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS — hypothesis explores overshoot inputs (no longer assumed away) and finds byte-identity. (The parity helper calls both `numba_fn` and `rust_fn` directly, so one run covers both backends.)
+
+- [ ] **Step 4: Lift analogous exclusions in the track + dataset parity suites**
+
+Inspect `test_shift_and_realign_tracks_parity.py`, `test_dataset_parity.py`, `strategies.py`, `_fixtures.py` for overshoot/`max_jitter`-pinned guards tied to THIS divergence (not the separate #242 `intervals_to_tracks` clip bug — leave those for W2). Remove only the trailing-fill-overshoot exclusions; re-run each touched suite:
+
+Run: `pixi run -e dev pytest tests/parity/test_shift_and_realign_tracks_parity.py tests/parity/test_dataset_parity.py -v --basetemp=$(pwd)/.pytest_tmp`
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add src/ tests/parity/
+rtk git commit -m "test(parity): un-exclude ref-overshoot sub-domain now both kernels pad correctly
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+### Task 5: Full-tree verification, roadmap update, and PR
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (Phase 5 notes/log).
+
+- [ ] **Step 1: Run the full Python tree on the rust backend**
+
+Run: `pixi run -e dev pytest tests -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: green (the pre-existing xfails remain xfailed; no new failures).
+
+- [ ] **Step 2: Run the full tree on the numba backend**
+
+Run: `GVL_BACKEND=numba pixi run -e dev pytest tests/dataset tests/unit tests/parity -q --basetemp=$(pwd)/.pytest_tmp`
+Expected: green — same pass/xfail profile, confirming byte-identical parity.
+
+- [ ] **Step 3: Lint, format, typecheck, cargo**
+
+Run:
+```bash
+pixi run -e dev ruff check python/ tests/ && \
+pixi run -e dev ruff format --check python/ tests/ && \
+pixi run -e dev typecheck && \
+pixi run -e dev cargo-test
+```
+Expected: all clean/green.
+
+- [ ] **Step 4: Record the fix in the roadmap**
+
+Add a dated entry to the Notes & decisions log in `docs/roadmaps/rust-migration.md` noting: the overshoot trailing-fill divergence was fixed in BOTH kernels (clamp `out_end_idx` to `out_idx`; numba `writable_ref = max(0, ...)`), the previously-excluded sub-domain is now parity-covered (Guards 1–3 removed), and reference the filed issue. Do NOT yet mark Phase 5 ✅ (W2–W9 remain).
+
+- [ ] **Step 5: Commit and open the PR**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): record trailing-fill overshoot fix (Phase 5 W1)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+rtk git push -u origin rust-migration   # (or a w1 topic branch, per your PR convention)
+```
+Then open the PR into `rust-migration` (file the GVL issue first and reference it). Title: `fix: pad full tail on reference overshoot in haplotype/track reconstruction (Phase 5 W1)`.
+
+---
+
+## Subsequent PRs (planned separately, in order)
+
+Each gets its own detailed bite-sized plan written when its predecessor lands. They are **not** bite-sized here because they depend on results that don't exist yet.
+
+- **PR2 (W2) — Fix the #242 `intervals_to_tracks` store-vs-query divergence.** Requires a systematic-debugging investigation: gvl stores intervals at `chromStart - max_jitter` but queries at `chromStart + jitter`, so a stored interval can start before the query window (`max_jitter>0`). The correct reconciliation (kernel clip vs store/query coordinate math) is unknown until investigated and may touch the write path. Fix both backends to agree-and-be-correct; un-exclude the #242 sub-domain across the parity + dataset suites; close issue #242. *Plan written after the investigation; W1 should land first so the oracle is otherwise trustworthy.*
+
+- **PR3 (W3) — Fuse the deferred annotated+spliced intersection path.** Add a fused rust kernel collapsing its remaining FFI crossings (pattern: `reconstruct_annotated_haplotypes_fused` / `reconstruct_haplotypes_spliced_fused`). Parity-gate against the composed numba oracle **while numba still exists**. Extend the parity suite to cover it.
+
+- **PR4 (W4) — Final single-thread numba-vs-rust `__getitem__` A/B.** Benchmark only (no code): `tests/benchmarks/test_e2e.py` pedantic min + `profile.py` wall-clock across all modes, both backends present, one back-to-back session. **Gate:** rust at parity-or-better single-thread → proceed to consolidation.
+
+- **PR5 (W5–W7) — The consolidation PR.** (a) Golden-snapshot the ~17 numba-oracle parity suites to frozen fixtures (storage scheme decided in that plan — compressed `.npz` keyed by generated input, or a bounded seeded sample); (b) delete all numba: ~21 `register()` refs, njit bodies, `_dispatch` registry + `GVL_BACKEND`, every `import numba`; replace `get(name)(...)` with direct rust calls; assert `import genvarloader` pulls neither numba nor llvmlite; (c) add rayon batch parallelism over per-(query,hap) work items, gated byte-identical to the serial golden result.
+
+- **PR6 (W8–W9) — Measure & merge.** Rust-only peak RSS (memray) vs the 3.53 GB numba baseline (expect the ~3.2 GB JIT drop); rayon multi-thread speedup (rayon N vs 1). If RSS and wall-clock are parity-or-better, open `rust-migration → main` (no squash); mark Phase 5 ✅ in the roadmap with the final tables + PR link; update `skills/genvarloader/SKILL.md` for any public-API change (e.g. `GVL_BACKEND` removal).
+
+---
+
+## Self-Review
+
+- **Spec coverage:** W1 (haps trailing-fill) is fully planned as PR1 — and corrected to "fix both kernels," a deviation from the spec's "verify rust already correct" found during planning (documented in the PR1 preamble). W2–W9 map to PR2–PR6. Decisions D1–D7 are all reflected (D4 = PR1; D5 = PR2; D3 = PR3; D6 = PR4; D2 = PR5; D1 = PR5; D7 = separate PRs throughout).
+- **Placeholder scan:** PR1 steps contain concrete code, exact commands, and expected output. PR2–PR6 are intentionally high-level (planned separately) and labeled as such — not placeholders within an executable task.
+- **Type consistency:** `reconstruct_haplotype_from_sparse` signature and the `run(...)` cargo helper argument order match the source read during planning; `writable_ref`/`out_end_idx`/`out_idx` names match both kernels.
diff --git a/docs/superpowers/plans/2026-06-27-rust-migration-phase-5-wrapup.md b/docs/superpowers/plans/2026-06-27-rust-migration-phase-5-wrapup.md
new file mode 100644
index 00000000..d2fec1af
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-27-rust-migration-phase-5-wrapup.md
@@ -0,0 +1,358 @@
+# Rust Migration Phase 5 Wrap-Up Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Finish Phase 5's finalization threads (thin-shim audit, cargo-standalone verification, seqpro-core released-dep verification, W6 perf re-baseline) and land them as one PR into `rust-migration`, leaving the `rust-migration → master` merge to the maintainer.
+
+**Architecture:** Four mostly-independent units. Three are verification + roadmap documentation (no production code); one (Unit B) may carry a small build/config fix if `cargo test` does not run standalone. Unit D is a measurement pass on Carter. A final task sets the Phase 5 status marker and runs the full gate.
+
+**Tech Stack:** Rust (PyO3 0.28 abi3, ndarray, rayon, seqpro-core 0.1), Python 3.10–3.13, maturin, pixi (`-e dev`), pytest + pytest-benchmark, cargo test, ruff/pyrefly/clippy.
+
+**Spec:** `docs/superpowers/specs/2026-06-27-rust-migration-phase-5-wrapup-design.md`
+
+## Global Constraints
+
+- **Branch:** `phase-5-w6-wrapup` (already created off `rust-migration`). All commits land here.
+- **PR target:** `rust-migration` (NOT master). Do not merge to master — the maintainer triggers `rust-migration → master` separately, no-squash.
+- **Out of scope:** Phase 6 (absorb genoray); the "single big `__getitem__` kernel" architectural collapse (Unit A *audits* it, does not build it).
+- **Rebuild before testing Rust:** `pixi run -e dev maturin develop --release` BEFORE any pytest run that imports the extension. pytest does NOT rebuild Rust.
+- **No numba A/B:** numba was deleted in W5. There is no live numba backend; all perf comparison is rust serial-vs-rayon (same session) + the W4-recorded numba figures. Do NOT re-checkout a numba commit.
+- **Carter perf caveat:** shared HPC node; absolute wall-clock drifts ≥2× across sessions. Durable signals = byte-identical parity (already gated) + same-session improve-or-hold + deterministic counts. See `[[gvl-rust-perf-gate-shared-node-noise]]`.
+- **Corpus:** `chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples). Assumed present from W4/W5; Task 4 Step 1 verifies and rebuilds if absent.
+- **Roadmap is source of truth:** `docs/roadmaps/rust-migration.md` — tick items, set the Phase 5 marker, add a notes-log entry, record measurements under the checkpoint.
+
+---
+
+### Task 1: Thin-shim audit (Unit A)
+
+Investigation + documentation only. **No production code changes.** Produce a precise "what's left to collapse the PyO3 surface" verdict and write it into the roadmap.
+
+**Files:**
+- Create: `docs/roadmaps/phase-5-w6-thin-shim-audit.md` (the detailed audit)
+- Modify: `docs/roadmaps/rust-migration.md` (Phase 5 section + a notes-log entry referencing the audit)
+
+**Interfaces:**
+- Consumes: nothing (first task).
+- Produces: the audit verdict (bucket-2 "remaining collapsible glue" list) that Task 5 reads to set the Phase 5 status marker.
+
+- [ ] **Step 1: Inventory the read-path call chain**
+
+Trace `Dataset.__getitem__` to its FFI calls and list every Python function on the hot path between the public API and the `from ..genvarloader import ...` call. Use:
+
+```bash
+rtk grep -n "def __getitem__\|_reconstruct\|reconstruct_haplotypes_fused\|intervals_and_realign_track_fused\|assemble_variant_buffers" \
+  python/genvarloader/_dataset/_impl.py python/genvarloader/_dataset/_reconstruct.py \
+  python/genvarloader/_dataset/_haps.py python/genvarloader/_dataset/_query.py
+```
+
+Read `_dataset/_reconstruct.py`, `_dataset/_haps.py`, `_dataset/_query.py` in full to see the per-batch work each does before/after the FFI crossing.
+
+- [ ] **Step 2: Inventory the FFI surface**
+
+List the registered pyfunctions and which are fused `__getitem__` kernels:
+
+```bash
+rtk grep -n "wrap_pyfunction!\|add_class" src/lib.rs
+```
+
+Expected: ~28 entries incl. the five fused kernels (`reconstruct_haplotypes_fused`, `reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused`, `reconstruct_annotated_haplotypes_spliced_fused`, `intervals_and_realign_track_fused`) and `assemble_variant_buffers_{u8,i32}`.
+
+- [ ] **Step 3: Confirm the dispatch layer is fully gone**
+
+```bash
+ls python/genvarloader/_dispatch.py 2>&1                 # expect: No such file
+rtk grep -rn "GVL_BACKEND\|_dispatch\|import numba\|from numba\|nb\.njit\|nb\.prange" python/genvarloader/ --include=*.py
+```
+
+Expected: zero matches (confirms W5 removed the rust/numba switch and Python calls Rust directly). Also delete the stale bytecode so it cannot mislead future greps:
+
+```bash
+rm -f python/genvarloader/__pycache__/_dispatch.cpython-*.pyc
+```
+
+- [ ] **Step 4: Classify each read-path Python step into the three buckets**
+
+For every per-batch Python step found in Step 1, classify as: (1) **intentional shim** (indexing sugar / torch / validation / error messages — stays in Python), (2) **remaining collapsible glue** (per-batch coercion/alloc/object churn worth a future kernel), or (3) **already-collapsed** (one FFI crossing, no material Python work). Cross-reference the Phase 3 optimization-targets section of the roadmap (zero-copy `_ffi_array`, `_HapsFfiStatic` caching, uninit buffers) — those already eliminated the major bucket-2 items.
+
+- [ ] **Step 5: Write the audit document**
+
+Write `docs/roadmaps/phase-5-w6-thin-shim-audit.md` containing: the read/write-path call-chain inventory, the FFI surface list, the three-bucket classification table (one row per Python step with its bucket + justification), and a one-paragraph **verdict**: either "shim is already thin — bucket-2 list is empty/negligible, the single-big-kernel collapse is not warranted as Phase 5 work" OR "bucket-2 glue remains: <explicit list>". Include the `to_rc` / RC handling and any `np.ascontiguousarray` survivors (there should be none on per-sample-scale memmaps — that was the scale-guard fix; confirm via `rtk grep -rn "ascontiguousarray" python/genvarloader/_dataset/`).
+
+- [ ] **Step 6: Update the roadmap Phase 5 section**
+
+In `docs/roadmaps/rust-migration.md`, under Phase 5, annotate the "Collapse the PyO3 surface so Python is a true shim" checklist item with the audit verdict (link to the audit doc). Do NOT tick or mark the phase yet — Task 5 sets the final marker. Add a notes-log entry dated 2026-06-27 (Phase 5 W6 — thin-shim audit) summarizing the verdict.
+
+- [ ] **Step 7: Commit**
+
+```bash
+rtk git add docs/roadmaps/phase-5-w6-thin-shim-audit.md docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): Phase 5 W6 thin-shim audit — classify remaining PyO3 surface glue
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 2: cargo-testable standalone verification (Unit B)
+
+Confirm `cargo test` builds and runs the Rust suite without the pixi/maturin/Python-extension layer. This is the only task that may carry a code/config fix.
+
+**Files:**
+- Modify (only if broken): `Cargo.toml` and/or `.cargo/config.toml` (whatever the minimal fix requires)
+- Modify: `docs/roadmaps/rust-migration.md` (record the standalone result + the canonical invocation)
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: the verified standalone-test invocation string recorded in the roadmap; Task 5's gate reuses it.
+
+- [ ] **Step 1: Run the standalone cargo suite from a clean shell**
+
+Run WITHOUT pixi, from the repo root:
+
+```bash
+cargo test --release 2>&1 | tail -30
+```
+
+Expected (pass case): all tests pass (W5 reported 114 cargo tests). If it links and passes, the crate is already standalone-testable — skip to Step 4.
+
+- [ ] **Step 2: If it fails to link/build, diagnose**
+
+The most likely failure is pyo3 needing a libpython at link time (the `extension-module` feature is non-default, so `cargo test` links a real interpreter). Capture the exact error:
+
+```bash
+cargo test --release 2>&1 | grep -iE "error|undefined|python|link" | head -20
+```
+
+If it is a libpython discovery issue, the minimal fix is to ensure a Python is discoverable (e.g. `PYO3_PYTHON=$(pixi run -e dev which python) cargo test --release`). Prefer documenting the invocation over adding config that could perturb the abi3 wheel build. Only edit `Cargo.toml`/`.cargo/config.toml` if there is no env-only path.
+
+- [ ] **Step 3: Re-run to confirm the fix**
+
+```bash
+PYO3_PYTHON=$(pixi run -e dev which python) cargo test --release 2>&1 | tail -15   # or the plain command if no fix was needed
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 4: Record the result in the roadmap**
+
+In `docs/roadmaps/rust-migration.md` Phase 5, annotate the "Confirm the crate is fully cargo-testable standalone" item with the verified invocation and the pass count (do NOT tick yet — Task 5 does the final marker). If a fix was needed, note it.
+
+- [ ] **Step 5: Commit**
+
+```bash
+rtk git add Cargo.toml .cargo/config.toml docs/roadmaps/rust-migration.md 2>/dev/null; rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): verify crate is cargo-testable standalone (Phase 5)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 3: seqpro-core released-dep verification (Unit C)
+
+Confirm seqpro-core resolves from crates.io with no path/patch override, and correct the stale Phase 1 roadmap note.
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (correct the stale Phase 1 "editable path-dep" note)
+
+**Interfaces:**
+- Consumes: nothing.
+- Produces: corrected roadmap text.
+
+- [ ] **Step 1: Confirm the resolved source is the registry**
+
+```bash
+rtk grep -n -A3 'name = "seqpro-core"' Cargo.lock
+rtk grep -rn "seqpro-core\|\[patch\|path =" Cargo.toml
+```
+
+Expected: `Cargo.lock` shows `version = "0.1.0"`, `source = "registry+https://github.com/rust-lang/crates.io-index"`, with a checksum; `Cargo.toml` shows `seqpro-core = "0.1"` and NO `[patch]` or `path =` override.
+
+- [ ] **Step 2: Confirm a clean build resolves it without a local checkout**
+
+```bash
+cargo build --release 2>&1 | grep -iE "seqpro|error" | head; echo "exit: ${PIPESTATUS[0]}"
+```
+
+Expected: builds clean, seqpro-core pulled from registry (no "path" / local-edit lines).
+
+- [ ] **Step 3: Correct the stale Phase 1 roadmap note**
+
+In `docs/roadmaps/rust-migration.md`, find the Phase 1 bullet and notes-log lines that say seqpro-core is "editable; flip to git/crates.io before shipping" / "path dep (editable…)". Replace with text stating it is already a released crates.io dependency (`seqpro-core 0.1.0`, registry source, verified in `Cargo.lock`), so the shipping prerequisite is satisfied.
+
+- [ ] **Step 4: Commit**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): seqpro-core is already a released crates.io dep (correct stale Phase 1 note)
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 4: W6 perf re-baseline — serial vs rayon (Unit D)
+
+Measure the rayon multi-thread speedup curve + peak-RSS deltas on Carter and record under the Phase 5 checkpoint. Long pole.
+
+**Files:**
+- Create: `docs/roadmaps/phase-5-w6-perf-rebaseline.md` (full tables + methodology)
+- Modify: `docs/roadmaps/rust-migration.md` (summary under the Phase 5 checkpoint)
+
+**Interfaces:**
+- Consumes: the verified release build (rebuild in Step 2).
+- Produces: the rayon speedup curve + RSS deltas referenced by Task 5's checkpoint update.
+
+- [ ] **Step 1: Verify the corpus exists (rebuild if absent)**
+
+```bash
+ls -la tests/benchmarks/data/chr22_geuv.gvl 2>&1
+```
+
+If present, continue. If absent, rebuild (needs `/carter` or `GVL_BENCH_SOURCE`):
+
+```bash
+pixi run -e dev python tests/benchmarks/data/build_realistic.py
+```
+
+- [ ] **Step 2: Rebuild the extension release and identify the parallel toggle**
+
+```bash
+pixi run -e dev maturin develop --release
+```
+
+Find how the read kernels expose the W5 `parallel` gate and how to force serial vs parallel (the `should_parallelize(total_out_bytes)` threshold in `_threads.py` and `RAYON_NUM_THREADS`):
+
+```bash
+rtk grep -rn "should_parallelize\|RAYON_NUM_THREADS\|parallel" python/genvarloader/_threads.py
+```
+
+- [ ] **Step 3: Capture the serial baseline (1 thread)**
+
+Run the de-noised e2e harness pinned to one rayon thread for the seq/track paths, and `profile.py` for the variants paths:
+
+```bash
+RAYON_NUM_THREADS=1 pixi run -e dev pytest tests/benchmarks/test_e2e.py -q 2>&1 | tail -30
+RAYON_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variants --n-batches 2000
+RAYON_NUM_THREADS=1 pixi run -e dev python tests/benchmarks/profiling/profile.py --mode variant-windows --n-batches 2000
+```
+
+Record ms/batch (pedantic min for e2e modes; wall avg for variants modes) per mode.
+
+- [ ] **Step 4: Capture the thread sweep (2 / 4 / 8 / all cores)**
+
+Repeat Step 3's commands with `RAYON_NUM_THREADS=2`, `=4`, `=8`, and unset (default = all cores). Capture ms/batch per mode per thread count. Also capture peak RSS for one representative parallel run vs the serial run via memray:
+
+```bash
+pixi run -e dev memray-tracks 2>&1 | tail; pixi run -e dev memray-haps 2>&1 | tail   # then: memray stats <output>
+```
+
+(If `should_parallelize`'s byte threshold suppresses parallelism on this small corpus for some modes, note which modes never crossed the threshold — that is itself a finding, not a failure.)
+
+- [ ] **Step 5: Write the perf doc**
+
+Write `docs/roadmaps/phase-5-w6-perf-rebaseline.md` with: methodology (corpus, harness, HEAD, machine, `maturin develop --release`), a per-mode serial-vs-thread-count table (ms/batch + speedup vs serial), the peak-RSS serial-vs-parallel deltas, a note that numba A/B is unavailable (W5 deletion) with a pointer to the W4 figures (`docs/roadmaps/phase-5-w4-final-ab.md`), and the node-noise caveat. State the gvl-attributable conclusion (rayon speedup achieved; modes below the parallelism threshold noted).
+
+- [ ] **Step 6: Record the summary in the roadmap checkpoint**
+
+In `docs/roadmaps/rust-migration.md` Phase 5 "Checkpoint" area, add the rayon speedup summary + RSS deltas (link to the perf doc). This satisfies "full perf re-baseline recorded here."
+
+- [ ] **Step 7: Commit**
+
+```bash
+rtk git add docs/roadmaps/phase-5-w6-perf-rebaseline.md docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): Phase 5 W6 perf re-baseline — rayon serial-vs-multithread speedup + RSS
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+---
+
+### Task 5: Phase 5 status disposition + full gate + PR
+
+Set the Phase 5 marker from the audit verdict, run the full project gate, finalize the roadmap, and open the PR into `rust-migration`.
+
+**Files:**
+- Modify: `docs/roadmaps/rust-migration.md` (tick items, set Phase 5 marker, final notes-log entry)
+
+**Interfaces:**
+- Consumes: Task 1 audit verdict, Task 2 standalone result, Task 3 seqpro verification, Task 4 perf re-baseline.
+- Produces: the PR.
+
+- [ ] **Step 1: Rebuild and run the full pytest tree**
+
+```bash
+pixi run -e dev maturin develop --release
+pixi run -e dev pytest tests -q 2>&1 | tail -20
+```
+
+Expected: green (single rust-only run; numba backend gone). Note pass/skip/xfail counts; the W5 baseline was parity+dataset+unit = 692 passed / 35 skipped / 2 xfailed and whole-tree green.
+
+- [ ] **Step 2: Run cargo tests + lint + format + typecheck + clippy**
+
+```bash
+cargo test --release 2>&1 | tail -5
+pixi run -e dev ruff check python/ tests/
+pixi run -e dev ruff format --check python/ tests/
+pixi run -e dev typecheck
+cargo clippy --release 2>&1 | tail -10
+```
+
+Expected: cargo 114 passed; ruff/format/typecheck/clippy all clean.
+
+- [ ] **Step 3: Confirm the abi3 wheel builds**
+
+```bash
+pixi run -e dev maturin build --release 2>&1 | tail -5
+```
+
+Expected: wheel builds clean.
+
+- [ ] **Step 4: Set the Phase 5 status marker**
+
+Per the spec disposition, using Task 1's verdict:
+- If the audit found the shim already thin AND checkpoint criteria are met (numba count = 0 ✓, perf re-baseline ✓, cargo-standalone ✓): tick the "Collapse PyO3 surface" item with the audit verdict, tick "cargo-testable standalone", set Phase 5 marker to **✅**, and re-file any residual collapse as a separate optimization track entry.
+- If bucket-2 glue remains: keep Phase 5 **🚧**, tick only the completed items (cargo-standalone, perf recorded), and leave the collapse item open with the audited remainder list.
+
+Add a final notes-log entry dated 2026-06-27 (Phase 5 W6 — wrap-up) summarizing: thin-shim verdict, cargo-standalone confirmation, seqpro-core released confirmation, perf re-baseline result, and the chosen Phase 5 marker. Note that the `rust-migration → master` merge is left to the maintainer.
+
+- [ ] **Step 5: Commit the finalization**
+
+```bash
+rtk git add docs/roadmaps/rust-migration.md
+rtk git commit -m "docs(roadmap): finalize Phase 5 W6 — set status marker + gate results
+
+Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>"
+```
+
+- [ ] **Step 6: Push and open the PR into rust-migration**
+
+```bash
+rtk git push -u origin phase-5-w6-wrapup
+gh pr create --base rust-migration --head phase-5-w6-wrapup \
+  --title "Phase 5 W6 wrap-up: thin-shim audit + cargo-standalone + seqpro verification + perf re-baseline" \
+  --body "$(cat <<'EOF'
+Wraps up Phase 5 finalization threads (sans genoray, sans the single-big-kernel collapse).
+
+- **Thin-shim audit** (Unit A): classified remaining PyO3-surface Python glue; verdict in `docs/roadmaps/phase-5-w6-thin-shim-audit.md`.
+- **cargo-testable standalone** (Unit B): verified `cargo test` runs without the pixi/Python layer.
+- **seqpro-core released** (Unit C): confirmed `seqpro-core 0.1.0` resolves from crates.io; corrected the stale Phase 1 path-dep note.
+- **W6 perf re-baseline** (Unit D): rayon serial-vs-multithread speedup curve + peak-RSS deltas in `docs/roadmaps/phase-5-w6-perf-rebaseline.md`.
+
+Gate: full pytest tree green, cargo test green, ruff/format/pyrefly/clippy clean, abi3 wheel builds.
+
+**Merge note:** targets `rust-migration` only. The `rust-migration → master` merge is left to the maintainer (no-squash).
+
+🤖 Generated with [Claude Code](https://claude.com/claude-code)
+EOF
+)"
+```
+
+---
+
+## Notes for the implementer
+
+- This plan is audit/measure/document-heavy, not feature code. Only Task 2 may touch source/config, and only if `cargo test` does not already run standalone.
+- Every roadmap edit is additive/corrective text — preserve the existing structure and the status-legend conventions (⬜/🚧/✅).
+- Do NOT mark Phase 5 ✅ before Task 5; intermediate tasks annotate but do not set the phase marker.
+- Do NOT merge to master under any circumstances.
diff --git a/docs/superpowers/specs/2026-06-24-phase-3-closeout-design.md b/docs/superpowers/specs/2026-06-24-phase-3-closeout-design.md
new file mode 100644
index 00000000..3e300232
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-24-phase-3-closeout-design.md
@@ -0,0 +1,184 @@
+# Design: Phase 3 close-out — main merge, missing-kernel ports, seqpro 0.20
+
+**Date:** 2026-06-24
+**Branch:** `phase-3-reconstruction` (Phase 3 PR #245 → `rust-migration`)
+**Status:** approved (design); pending implementation plan
+
+## Context & motivation
+
+Phase 3 of the Rust migration (reconstruction + track realignment) was marked `✅` in
+`docs/roadmaps/rust-migration.md`, but the roadmap is internally inconsistent: the phase
+header is `✅` while four sub-items (lines 282–285) are left unchecked, and the close-out
+commits updated the file sloppily. Separately, two bug fixes that were surfaced *during*
+Phase 3 landed on `origin/main` and are not yet on this branch. And seqpro shipped 0.20.0
+with a faster `to_numpy(validate=False)` path that GVL should adopt at guaranteed-uniform
+materialization sites.
+
+This spec closes Phase 3 honestly: absorb the main fixes, port the one genuinely-missing
+rust kernel, fuse the remaining unfused-but-rust read paths, bump seqpro, and reconcile the
+roadmap with reality.
+
+### Verified ground truth (the audit behind this plan)
+
+- **`origin/main` is 9 commits ahead** of this branch with two real fixes:
+  - **PR #244 / #242** — `fix(intervals): clip sub-query interval starts in both kernels`.
+    Touches `python/genvarloader/_dataset/_intervals.py` (+13) and `src/intervals.rs` (+45).
+  - **PR #243** — `fix(indexing): SpliceIndexer.parse_idx double-applies sample-subset map`.
+    Touches `python/genvarloader/_dataset/_indexing.py`.
+- **Merge interaction:** Phase 3 never modified `src/intervals.rs`, so main's clip fix merges
+  clean on the Rust side. The Phase 3 fused tracks kernel
+  `intervals_and_realign_track_fused` (`src/ffi/mod.rs:653`) **calls the shared
+  `intervals::intervals_to_tracks` core**, so it inherits the #242 fix automatically — no
+  manual Rust propagation. The only text conflict is `_intervals.py` (main +13 vs Phase 3 +45).
+- **Backend reality on the default (no `GVL_BACKEND`) read path:**
+  - Splice (`_haps.py:855`) and annotated (`_haps.py:903`) haps already run **rust** — they
+    call the dispatch wrapper `reconstruct_haplotypes_from_sparse` (`default="rust"`), just
+    **unfused** (2 FFI crossings instead of 1). They are *correct*, not broken.
+  - `shift_and_realign_track_sparse` (singular) is **only** a numba parity reference — never
+    on the default path. Nothing to port.
+  - The one **genuinely-missing rust port** is `Reference.fetch` (`_fetch_impl_par`/
+    `_fetch_impl_ser`, `_reference.py:164–183`): a thin per-row `padded_slice` loop with no
+    rust impl, used by the spliced ref-only dataset path (`RefDataset._getitem_spliced`) and
+    `_flat_flanks.py`.
+- **seqpro 0.20.0** is the current PyPI release. Its skip-validation addition is
+  `to_numpy(validate=False)` (skips the uniformity scan). The Rust `seqpro-core` is `0.1.0`
+  from crates.io (independently versioned from the Python package).
+- **~10 `#242` test exclusions** (`xfail(reason=_REASON_242)` + `assume(False)` guards) exist
+  solely because #242 was unfixed; they become real passing tests once the fix is merged.
+
+## Goals
+
+1. Bring the branch to an honest, fully-rust-default state for Phase 3's banner
+   (reconstruction + track realignment).
+2. Absorb the bug fixes that landed on `main` during Phase 3.
+3. Bump seqpro to 0.20.0 and adopt its skip-validation arg where safe.
+4. Reconcile the roadmap with what is actually done.
+
+## Non-goals (deferred, with honest roadmap notes)
+
+- Deleting numba parity references — Phase 5.
+- The broad "single big `__getitem__` kernel" beyond the specific fusions below — Phase 5.
+- Write-path concerns / `Reference.fetch` callers beyond what parity requires — Phase 4.
+- Any public-API change (this work is entirely internal).
+
+## Work plan (dependency order)
+
+### Step 1 — Merge `origin/main` into `phase-3-reconstruction`
+
+- Merge commit (not squash; preserves history per maintainer preference).
+- Brings #244 (#242) + #243 onto the branch. When this branch later merges to
+  `rust-migration`, the fixes flow through.
+- **Conflict resolution:** `python/genvarloader/_dataset/_intervals.py` — reconcile main's
+  clip fix (+13) with Phase 3's edits (+45). `src/intervals.rs`, `_indexing.py` merge clean.
+- **Acceptance:** branch builds (`cargo build`, `maturin develop`), no leftover conflict
+  markers, `src/intervals.rs` carries the clip fix.
+
+### Step 2 — Lift the now-obsolete #242 exclusions
+
+- Remove `xfail(reason=_REASON_242)` markers and the `_REASON_242` constants from:
+  - `tests/dataset/test_flat_intervals.py`
+  - `tests/dataset/test_seqs_tracks.py`
+  - `tests/dataset/test_realign_tracks.py`
+  - `tests/unit/dataset/test_output_bytes_per_instance.py`
+  - `tests/integration/dataset/test_dummy_dataset_insertion_fill.py`
+- Remove the `assume(False)` #242-family guards in
+  `tests/parity/test_reconstruct_haplotypes_parity.py` and
+  `tests/parity/test_shift_and_realign_tracks_parity.py` **that correspond to the
+  `itv.start < query_start` / `start>=clen` #242 domain only**.
+- **Keep** the *reconstruct trailing-under-write* exclusion (overshoot pre-check +
+  double-init guard) — that is a genuine numba-undefined domain, unrelated to #242.
+- **Acceptance:** these tests now run (not xfail) and pass on `max_jitter>0` datasets under
+  both `GVL_BACKEND=rust` and `GVL_BACKEND=numba`.
+
+### Step 3 — Port `Reference.fetch` to rust
+
+- Add a rust kernel (working name `fetch_reference`) in the `src/reference/` module that
+  loops rows and calls the existing `padded_slice` core, mutating the caller's `out` buffer
+  in place (mirrors `_fetch_impl_ser`/`_par`; serial is fine — disjoint per-row out-slices).
+- Expose via `src/ffi/`; register in `python/genvarloader/_dataset/_reference.py` through
+  `_dispatch.register(..., default="rust")`, keeping the numba `_fetch_impl_*` as the parity
+  reference. Route `Reference.fetch` through the dispatcher.
+- **Acceptance:** byte-identical parity (hypothesis suite, both impls) for `fetch_reference`;
+  spliced ref-only dataset path (`RefDataset._getitem_spliced`) and `_flat_flanks.py`
+  exercise the rust kernel by default. Closes the last 3 numba kernels of roadmap item 3.
+
+### Step 4 — Fuse the annotated-haps and splice haps paths
+
+Both currently run correct-but-unfused rust (2 FFI crossings via the dispatch wrapper).
+
+- **Annotated haps:** add/extend a fused rust entry that fills `out`, `annot_v_idxs`, and
+  `annot_ref_pos` in a single FFI crossing (currently `_haps.py:903` composes via the
+  wrapper). Route `_reconstruct_annotated_haplotypes` (non-splice branch) through it when
+  `GVL_BACKEND` is rust (default), mirroring the Task-13 `reconstruct_haplotypes_fused`
+  pattern.
+- **Splice haps:** add a fused rust entry that consumes the splice-permuted request
+  (`flat_geno_idx`, `flat_shifts`, `permuted_regions`, permuted keep arrays,
+  `splice_plan.permuted_out_offsets`) and reconstructs in one crossing (currently
+  `_haps.py:855` composes via the wrapper). The Python-side splice permutation
+  (`_permute_request_for_splice`) stays in Python; only the reconstruction crossing fuses.
+- Annotated + splice combined (annotated path with a splice plan) may remain on the unfused
+  dispatched rust path if fusing the combination is disproportionately complex — if so,
+  document it as a Phase-5 residue rather than claiming 100%.
+- **Acceptance:** byte-identical dataset parity vs the composed numba oracle for each fused
+  path (same gate style as Tasks 13–14), across insertion-fill strategies where relevant.
+  Closes roadmap items 1 and 4.
+
+### Step 5 — Bump seqpro to 0.20.0 + adopt skip-validation
+
+- `pixi.toml`: `seqpro = "==0.18.0"` → `"==0.20.0"`.
+- `pyproject.toml`: `"seqpro>=0.18"` → `"seqpro>=0.20"`.
+- Re-run `pixi install`/lock; confirm the env resolves and `import seqpro; __version__ == 0.20.0`.
+- **Skip-validation adoption (propose-then-approve):** inventory read-path `.to_numpy()` /
+  fixed-length materialization sites where row uniformity is *guaranteed by construction*
+  (e.g. `with_len(L)` / `to_fixed` / `to_padded` outputs). Propose `validate=False` at those
+  sites for maintainer approval before applying. Do **not** blanket-apply.
+- **Rust compat check:** confirm `seqpro-core` 0.1.0's `Ragged` layout (offsets + data +
+  itemsize) still matches what GVL's `src/ragged/mod.rs` bridge constructs against seqpro
+  0.20.0. Low risk (core is pyo3-free and independently versioned), but verified via
+  `cargo test` + the dataset parity backstop.
+- **Acceptance:** full tree green on 0.20.0; any `validate=False` sites approved and parity
+  unchanged.
+
+### Step 6 — Roadmap + skill honesty pass
+
+- `docs/roadmaps/rust-migration.md`:
+  - Reconcile the `✅`-header / unchecked-boxes contradiction in Phase 3.
+  - Check off items 1, 3, 4 (now truthfully done); reword item 2 to state tracks/intervals
+    realign is rust-default + fused, with the remaining numba retained as Phase-5-deletion
+    parity references.
+  - Add a dated decisions-log entry recording: #242 fix merged + xfails lifted,
+    `Reference.fetch` ported, annotated/splice fused, seqpro 0.20 bump.
+- `skills/genvarloader/SKILL.md`: confirm no public-API change (expected no-op per CLAUDE.md
+  maintenance rule). Update only if an exported symbol/signature changed (none expected).
+
+## Verification gate (migration contract)
+
+- `cargo test` green (incl. new `fetch_reference` + fused-kernel unit tests).
+- Full pytest tree green: `pixi run -e dev pytest tests -q` (cover `tests/dataset` **and**
+  `tests/unit` per CLAUDE.md), including the un-xfailed #242 tests, under **both**
+  `GVL_BACKEND=rust` and `GVL_BACKEND=numba`.
+  - Env note: dataset tests need `--basetemp=$(pwd)/.pytest_tmp` on Carter HPC (os.link
+    cross-device Errno 18), same as Phases 2–3.
+- Byte-identical parity for `fetch_reference` and the fused annotated/splice kernels.
+- `ruff check python/ tests/`, `ruff format`, `typecheck` clean; abi3 wheel builds.
+- Throughput recorded (not gated) for the newly-fused paths, appended to the Phase 3
+  measurement block.
+
+## Risks & mitigations
+
+- **`_intervals.py` merge conflict** — small, mechanical; resolve by keeping both the clip
+  fix and Phase 3's additions. Mitigation: re-run the intervals parity + #242 tests after.
+- **Splice fusion complexity** — the permuted-request plumbing is the most involved piece.
+  Mitigation: keep the Python permutation in Python; fuse only the reconstruction crossing;
+  fall back to the documented unfused-rust path (with an honest roadmap note) if the
+  annotated×splice combination proves disproportionate.
+- **seqpro 0.20 Ragged layout drift** — could break the Rust bridge. Mitigation: `cargo test`
+  + dataset parity backstop catch any layout mismatch immediately.
+- **Lifting xfails exposes a latent failure** — if an un-xfailed test fails, that is a real
+  signal (the clip fix didn't fully cover it). Mitigation: investigate rather than re-xfail;
+  the #242 fix is the contract.
+
+## Out-of-scope confirmations
+
+No public API changes; no numba deletion; no write-path migration; no new perf gate (Phase 3
+remains parity-gated, throughput recorded only, per the branch/gate strategy).
diff --git a/docs/superpowers/specs/2026-06-24-rust-migration-phase-2-genotypes-variants-design.md b/docs/superpowers/specs/2026-06-24-rust-migration-phase-2-genotypes-variants-design.md
new file mode 100644
index 00000000..4587aa2c
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-24-rust-migration-phase-2-genotypes-variants-design.md
@@ -0,0 +1,138 @@
+# Design: Rust migration Phase 2 — Genotype assembly + variant gather
+
+**Date:** 2026-06-24
+**Roadmap:** `docs/roadmaps/rust-migration.md` (Phase 2)
+**Status:** approved design, pre-implementation
+
+## Context
+
+Phases 0 (foundation + `intervals_to_tracks` proof-point) and 1 (ragged primitives
+via `seqpro-core`) have landed. Phase 2 is the next bottom-up step: migrate the
+genotype assembly/selection kernels and the flat variant-gather kernels from
+numba to the Rust crate, following the strangler-fig + byte-identical-parity
+contract established in Phase 0.
+
+## Scope
+
+### Port (live kernels)
+
+From `python/genvarloader/_dataset/_genotypes.py`:
+- `get_diffs_sparse` — per-`(query, hap)` reference-length diffs; called from
+  `_haps.py:474` for haplotype-length sizing.
+- `choose_exonic_variants` (+ inner `_choose_exonic_variants`) — keep-mask for
+  variants fully contained in a query interval; called from `_haps.py`
+  (spliced/exonic path).
+
+From `python/genvarloader/_dataset/_flat_variants.py` (7 kernels, variants output
+mode only — driven by `get_variants_flat`, not the default tracks/haps getitem):
+- `_gather_v_idxs`, `_gather_v_idxs_ss` — gather variant indices for contiguous
+  `(n+1,)` and non-contiguous `(2, n)` offset forms.
+- `_gather_alleles` — two-level allele-byte gather.
+- `_compact_keep` — compact a flat buffer + offsets under a keep mask.
+- `_fill_empty_scalar`, `_fill_empty_seq`, `_fill_empty_fixed` — dummy-variant
+  fill for empty `(region, sample, ploid)` groups (scalar / bytestring /
+  fixed-inner-stride).
+
+### Delete (dead kernel)
+
+- `filter_af` (`_genotypes.py`) — superseded by inline numpy AF filtering in
+  `_haps.py:734-737` and `_flat_variants.py:698-701`; **zero callers**. This is the
+  same dead-code situation as the Phase 0 `splits_sum_le_value` pivot. Removed in
+  this PR rather than ported.
+
+### Phase boundary fix
+
+The roadmap text "`_genotypes.py` kernels (6 numba)" double-counts the two
+reconstruction kernels (`reconstruct_haplotypes_from_sparse`,
+`reconstruct_haplotype_from_sparse`) that live in `_genotypes.py` but belong to
+**Phase 3** (next to `_reconstruct.py`/`_haps.py`, where the big read-path win is
+measured as one unit). Phase 2 covers assembly/selection only. The roadmap is
+updated to remove the double-count.
+
+## Architecture
+
+Follows the Phase 0 seam (`src/ffi/` is the only place touching PyO3; core logic
+in lazily-grown pure-`ndarray` domain modules).
+
+- New domain modules: `src/genotypes/mod.rs` (assembly/selection) and
+  `src/variants/mod.rs` (flat gather/fill). Pure `ndarray`, no PyO3.
+- All PyO3 wrappers in `src/ffi/`, mirroring the `intervals_to_tracks` pattern.
+- **FFI signatures mirror the numba signatures exactly** — same inputs, same
+  `(data, offsets)`-tuple returns. Python keeps wrapping results into
+  `seqpro.rag.Ragged` / `keep_offsets` exactly as today, so dispatch is a drop-in
+  swap and parity is byte-identical.
+- **Both offset forms**: handle 1-D `(n+1,)` and 2-D `(2, n_slices)` `geno_offsets`
+  (windowed/sliced queries) — both branches exist in the numba kernels.
+- **Parallelism**: sequential first. Per-`(query, hap)` writes are disjoint
+  (`diffs[q,h]`, `keep[k_s:k_e]`), so sequential output is byte-identical to
+  numba's `prange` — same argument as the Phase 0 proof-point. Add `rayon` only if
+  the no-regression gate requires it.
+
+## Dispatch & strangler-fig contract
+
+- Register each ported kernel in `python/genvarloader/_dispatch.py` (per-kernel
+  default `rust`, `GVL_BACKEND` global override), routing the call sites in
+  `_haps.py` / `_flat_variants.py`.
+- Keep the numba impls as the parity reference until the phase closes, then delete
+  them + the switch in the same bundled PR (per the migration contract).
+- `filter_af` is deleted immediately (dead, nothing to keep as a reference).
+
+## Testing
+
+Extends the Phase 0 harness (`tests/parity/`).
+
+- **Per-kernel hypothesis parity gates** — run-both-assert-byte-identical,
+  covering the branch matrix:
+  - `get_diffs_sparse`: 1-D vs 2-D offsets; `keep`/`keep_offsets` present/absent;
+    the `q_starts`/`q_ends`/`v_starts` query-clipping path; empty groups.
+  - `choose_exonic_variants`: 1-D vs 2-D offsets; empty groups; variants partially
+    vs fully contained in the interval.
+  - flat kernels: contiguous vs non-contiguous gather; keep-mask compaction;
+    empty-group fill for scalar / seq / fixed fields.
+- **New variants-mode dataset-level backstop** with a kernel spy (mirrors the
+  tracks-mode backstop). Variants mode (`with_seqs("variants")`) has no
+  differential coverage today; this is genuinely new and asserts the Rust kernels
+  are actually invoked (no vacuous pass — the lesson baked in after the splits
+  backstop).
+- `cargo test` units per kernel.
+
+## Gate & measurement
+
+Gate = **parity + no regression** (per decision; the dramatic read-path speedup is
+Phase 3's, not Phase 2's — these kernels are cheap index-math and buffer gathers).
+
+- Parity green across py310–313 × linux/macOS.
+- No `__getitem__` throughput regression on `chr22_geuv`:
+  - `profile.py --mode haps` vs baseline **123.9 batch/s** (exercises
+    `get_diffs_sparse` + `choose_exonic_variants`).
+  - `profile.py --mode variants` vs baseline **145.3 batch/s** (exercises the flat
+    gather/fill kernels).
+- abi3 wheel still builds (standing CI invariant).
+- Record any incidental wins (kernel count down by 3 incl. the dead `filter_af`;
+  reduced JIT warmup / RSS).
+
+## Sequencing (one bundled PR)
+
+Internal beachhead order: genotypes-first, then variants.
+
+1. `get_diffs_sparse` → Rust + ffi + dispatch + parity gate.
+2. `choose_exonic_variants` (+ inner) → same loop.
+3. Delete dead `filter_af`.
+4. The 7 `_flat_variants.py` kernels → Rust + ffi + dispatch + parity gates +
+   variants-mode backstop.
+5. Flip defaults, delete numba impls + switch, measure, update roadmap.
+
+## Roadmap update (part of the PR)
+
+- Fix the Phase 2 double-count (reconstruction kernels → Phase 3).
+- Mark `filter_af` deleted-as-dead.
+- Note the variants-mode gate uses the variants baseline (145.3 batch/s).
+- Record decisions in the notes log; set the Phase 2 status marker + PR link;
+  record measurements.
+
+## Non-goals
+
+- Reconstruction kernels (`reconstruct_*`) — Phase 3.
+- Track realignment, reference, insertion-fill, splice — Phase 3.
+- Write/update pipeline — Phase 4.
+- Any rayon parallelism unless the no-regression gate forces it.
diff --git a/docs/superpowers/specs/2026-06-24-rust-migration-phase-3-design.md b/docs/superpowers/specs/2026-06-24-rust-migration-phase-3-design.md
new file mode 100644
index 00000000..a2bda002
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-24-rust-migration-phase-3-design.md
@@ -0,0 +1,186 @@
+# Phase 3 — Reconstruction + track realignment (design)
+
+**Date:** 2026-06-24
+**Branch:** `phase-3-reconstruction` (off the persistent `rust-migration` integration branch)
+**Roadmap:** `docs/roadmaps/rust-migration.md` → Phase 3
+**Status:** design approved 2026-06-24; spec under review
+
+This spec covers the largest migration phase — the numba bulk of the read path. It
+follows the established strangler-fig + byte-identical-parity contract from Phases 0–2,
+and additionally **begins the read-path consolidation** (single large `__getitem__`
+kernel) that Phase 2 profiling identified as the real throughput win.
+
+---
+
+## Goal
+
+1. Port the 8 numba-only kernel groups across the Phase 3 read-path files to Rust as
+   **1:1 parity twins** behind per-kernel dispatch (numba retained as registered parity
+   reference, deleted wholesale in Phase 5).
+2. **Begin consolidation**: fuse the two hot read paths — **haplotypes** and **tracks** —
+   into single Rust `__getitem__` kernels that cross the Python/Rust boundary once,
+   eliminating the redundant `np.ascontiguousarray` glue Phase 2 profiling pinned at
+   62% of the variants loop.
+
+## Decisions captured during brainstorming (2026-06-24)
+
+- **Port strategy:** 1:1 parity twins **+** begin consolidation (not strict 1:1-only,
+  not fused-from-scratch).
+- **Gate:** **parity is the hard gate** (byte-identical, blocks landing) for every ported
+  kernel; **throughput is recorded only** — no throughput gate in Phase 3. The final
+  throughput gate remains in the Phase 5 consolidation pass. (This supersedes the stale
+  `Gate: parity + Dataset.__getitem__ throughput` line in the current roadmap Phase 3
+  section, which predates the Phase 2 branch/gate-strategy change; that line will be
+  corrected as part of this work.)
+- **Consolidation beachhead:** fuse **both** the haplotypes and tracks read paths this
+  phase (not haplotypes-only, not deferred to end-of-phase profiling).
+- **Sequencing:** easiest→hairiest so parity tooling matures before the risky kernels:
+  reference → haplotype reconstruction → track realignment → fusion.
+- **Out of scope this phase:** `_insertion_fill.py:lower` and `_splice.py:build_splice_plan`
+  stay plain Python (array-packing / plan-building, not hot; they feed the kernels).
+
+---
+
+## Architecture
+
+Identical shape to Phase 2:
+
+- Pure-`ndarray` / `rayon` cores in new `src/` domain modules — no PyO3.
+- PyO3 wrappers confined to `src/ffi/`.
+- Per-kernel dispatch via `genvarloader._dispatch` (default `rust`; `GVL_BACKEND`
+  override; numba impl kept as the registered parity reference).
+- `main`/`rust-migration` stays shippable; every step reversible until parity is proven.
+
+### New Rust modules
+
+```
+src/
+├── reconstruct/   # reconstruct_haplotypes_from_sparse (+ singular inner),
+│                  # annotated variant (per-bp v_idx + ref-coord) variant
+├── tracks/        # shift_and_realign_track[s]_sparse, _apply_insertion_fill (4 strategies),
+│                  # _xorshift64 / _hash4 PRNG, tracks_to_intervals RLE
+│                  # (+ _scanned_mask / _compact_mask)
+└── reference/     # get_reference (par/ser), padded_slice, spliced-ref fetch
+```
+
+`padded_slice` moves out of `_utils.py`'s numba surface into the `reference` core (it is
+a reference-assembly leaf). `_insertion_fill.py:lower` and `_splice.py:build_splice_plan`
+remain plain Python and continue to produce the packed strategy arrays / splice
+permutation+offsets the kernels consume.
+
+### Fused `__getitem__` kernels (consolidation)
+
+Two new Rust entry points that compose what are today multiple per-kernel boundary
+crossings into one:
+
+- **Fused haplotypes**: `get_diffs_sparse` (already Rust) + `reconstruct_*_from_sparse`
+  in a single crossing, returning the reconstructed haplotype bytes (and, for the
+  annotated mode, the per-bp variant-index and ref-coordinate arrays) without
+  intermediate Python-side `np.ascontiguousarray` coercions.
+- **Fused tracks**: `get_diffs_sparse` → `shift_and_realign_tracks_sparse` →
+  `intervals_to_tracks` (already Rust) in a single crossing.
+
+These are **new** entry points, not 1:1 twins; they are parity-verified at the dataset
+level (see Testing) against the composed numba pipeline.
+
+---
+
+## Work breakdown (incremental landings on the branch; one bundled PR at phase close)
+
+Each sub-unit lands incrementally on `phase-3-reconstruction` with its own parity suite,
+mirroring Phase 2's task-by-task cadence. The whole phase merges into `rust-migration` as
+one bundled PR.
+
+### 3a — Reference path (warm-up; low parity risk)
+- Port `get_reference` (parallel + serial selection), `_get_reference_row`, and
+  `padded_slice` into `src/reference/`.
+- Port the spliced-reference fetch (`_fetch_spliced_ref` consumes `build_splice_plan`'s
+  permutation; the plan builder stays Python).
+- Parity: byte-identical reference assembly (incl. boundary padding) over hypothesis
+  inputs; spy-guarded reference-mode dataset backstop.
+
+### 3b — Haplotype reconstruction (core)
+- Port `reconstruct_haplotypes_from_sparse` (batch/parallel) + `reconstruct_haplotype_from_sparse`
+  (singular: shifting, variant overlaps, padding) into `src/reconstruct/`.
+- Port the annotated variant used by `_haps.py:_reconstruct_annotated_haplotypes`
+  (returns per-bp variant indices + ref coordinates alongside the S1 bytes).
+- Parity: byte-identical haplotype bytes **and** annotation arrays (variant idx + ref pos).
+
+### 3c — Track realignment + RLE (hairiest; the parity risks live here)
+- Port `shift_and_realign_tracks_sparse` (batch) + `shift_and_realign_track_sparse`
+  (singular) into `src/tracks/`, including `_apply_insertion_fill` with all four
+  strategies (Repeat5p, Constant, FlankSample, Interpolate) and the `_xorshift64`/`_hash4`
+  PRNG.
+- Port `tracks_to_intervals` (RLE) + `_scanned_mask` + `_compact_mask`.
+- Parity: byte-identical tracks across **all four** fill strategies (incl. the RNG-driven
+  FlankSample), plus byte-identical RLE round-trip.
+
+### 3d — Consolidation (fused kernels; throughput recorded, not gated)
+- Build the fused haplotype `__getitem__` Rust kernel and the fused tracks `__getitem__`
+  Rust kernel (single boundary crossing each; drop redundant `np.ascontiguousarray`).
+- Re-profile `chr22_geuv` (haplotypes + tracks modes, `NUMBA_NUM_THREADS=1`, Carter) and
+  **record** throughput + peak RSS in the roadmap. Confirm via cProfile that the
+  `np.ascontiguousarray` glue tax is gone from the fused paths.
+
+---
+
+## Parity strategy
+
+- Per-kernel `@pytest.mark.parity` hypothesis suites asserting **byte-identical** output;
+  for tuple-returning kernels, assert every returned array.
+- Spy-guarded **dataset backstops** for haplotypes and tracks modes proving the fused
+  kernels are actually invoked on the live `Dataset.__getitem__` path (the Phase 0
+  lesson: a backstop must spy + assert non-trivial output so a vacuous pass is impossible).
+- Parity is verified across the standing py310–313 × linux/macOS matrix per the contract;
+  a kernel only lands when parity holds.
+
+### Two identified parity risks (both in 3c)
+
+1. **FlankSample PRNG.** `_xorshift64`/`_hash4` are seeded and deterministic, so
+   byte-identical parity is achievable **only if** the Rust port reproduces the exact
+   `u64` wrapping arithmetic and hash-mixing order. Mitigation: port bit-for-bit and add a
+   direct PRNG-sequence unit test (Rust output == numba output for a fixed seed grid)
+   *before* wiring it into the kernel.
+2. **Interpolate fill (float32).** Byte-identical float parity requires identical
+   operation order. Both numba and Rust lower through LLVM, so this is achievable but is
+   the most likely 1-ULP break. Mitigation: attempt strict byte-identical first; if
+   intractable, fall back to the Phase 2 pattern (dtype/strategy-dispatched Rust core with
+   a numba fallback for the offending strategy), documented in the roadmap if used.
+
+---
+
+## Testing & close-out
+
+- Full tree green on **both** backends (`GVL_BACKEND=rust` and `GVL_BACKEND=numba`):
+  `pixi run -e dev pytest tests -q` (dataset + unit).
+- `cargo test` green; `ruff check`/`ruff format` clean on `python/ tests/`; `typecheck`
+  clean; abi3 wheel builds.
+- Env note (from Phase 2): dataset tests need pytest's tmp on the same filesystem as
+  `tests/data` (`--basetemp=<repo>/.pytest_tmp`) or the write-path `os.link` hardlink
+  fails cross-device (Errno 18).
+
+## Roadmap maintenance (part of the work)
+
+- Correct the stale `Gate: parity + Dataset.__getitem__ throughput` line in the Phase 3
+  section to **parity hard-gate; throughput recorded only** (matches the 2026-06-24
+  decision and the Phase 2 branch/gate strategy).
+- Tick Phase 3 tasks and record measurements under the relevant checkpoint as each
+  sub-unit lands; set the phase status marker (⬜→🚧→✅) + PR link.
+- Add a Notes & decisions log entry for Phase 3 mirroring the Phase 2 entry.
+
+## Out of scope
+
+- `_insertion_fill.py:lower`, `_splice.py:build_splice_plan` (stay plain Python).
+- Variant-flat / flank kernels already handled in Phase 2.
+- The final crate consolidation and wholesale numba deletion (Phase 5).
+- genoray variant IO (Phase 6).
+
+## Success criteria
+
+- All 8 Phase 3 kernel groups have byte-identical Rust twins behind dispatch (parity
+  hard-gate met).
+- Fused haplotypes + tracks `__getitem__` kernels land and are parity-verified at the
+  dataset level; their throughput + peak RSS are recorded in the roadmap.
+- Full tree green on both backends; cargo/lint/typecheck/abi3 clean.
+- Roadmap updated (gate line corrected, tasks ticked, measurements + decisions logged,
+  status marker + PR link set).
diff --git a/docs/superpowers/specs/2026-06-25-round3-instruction-level-kernel-tuning-design.md b/docs/superpowers/specs/2026-06-25-round3-instruction-level-kernel-tuning-design.md
new file mode 100644
index 00000000..21807359
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-round3-instruction-level-kernel-tuning-design.md
@@ -0,0 +1,188 @@
+# Round-3 instruction-level kernel tuning
+
+**Date:** 2026-06-25
+**Branch base:** `rust-migration` (Targets 5/6/7 merged: PRs #248/#249/#250)
+**Roadmap home:** `docs/roadmaps/rust-migration.md` → Phase 3 "Optimization targets — round 3" (a new sub-section alongside rounds 1–2 and targets 5–7; **not** a new phase)
+
+---
+
+## Goal
+
+Drive the now-Rust-dominated read-path kernels to **rust ≥ numba single-threaded** on all four
+read paths — **tracks-only, haplotypes, variants, variant-windows** — by tuning the generated
+machine code. Use `perf` to localize the hot Rust leaves and `cargo-show-asm` (+ llvm-mca via
+`--mca`) to inspect and verify codegen at the instruction level.
+
+This is a continuation of the established Phase-3 optimization rhythm (rounds 1–2, targets 5–7),
+not a new architectural phase. It changes no on-disk format, no public API, and no kernel
+semantics — only the instruction sequences the hot kernels compile to.
+
+### Non-goals
+
+- No rayon / batch parallelism (explicitly deferred to Phase 5; single-thread parity first).
+- No on-disk format change, no public API change, no new kernels.
+- No numba deletion (that is Phase 5).
+- Not a correctness pass — byte-identical parity must hold unchanged throughout.
+
+---
+
+## Decisions (locked with the user, 2026-06-25)
+
+1. **Gate = wall-clock throughput; asm instruction count is evidence, not the gate.**
+   The round lands on the established **rust ÷ numba batch/s** metric. Per-kernel
+   instruction-count / llvm-mca cycle deltas are recorded as supporting evidence in the roadmap,
+   but a kernel that drops instructions without improving ms/batch is reverted. Instruction count
+   is a proxy (kernels can be memory- or branch-bound); throughput is truth.
+
+2. **Tooling = `cargo-show-asm`** (`cargo asm`, v0.2.61, installed). Gives `--mca` llvm-mca
+   cycle/throughput estimates, `--rust` source interleave, and resolves modern monomorphized
+   symbols. The 2019-era gnzlbg `cargo-asm` is not used.
+
+3. **`unsafe` budget = targeted, parity-gated.** Prefer safe idioms first (slice hoisting,
+   iterators, `assert!` bound hints, codegen attributes — the T5 playbook). Where the optimizer
+   provably cannot elide a bound, allow `get_unchecked` / explicit SIMD, each with a `// SAFETY:`
+   comment, contained by the byte-identical parity gate on both backends.
+
+---
+
+## Approach
+
+**Profile-all-first ranked target list, driven by a per-kernel tune loop.** Reach for a Rust
+criterion microbench only for a kernel where the in-process flat profile is ambiguous or where
+llvm-mca on realistic inputs in isolation is needed — matching the roadmap's own guidance
+("a Rust-only criterion harness is only worth building if we want to micro-optimize a kernel in
+isolation from FFI/Python").
+
+Rejected alternatives:
+- *Per-path sequential* (tune kernels in path order): misses that several kernels are shared
+  across paths, so path-order tuning fails to compound shared wins.
+- *Criterion-first for every kernel*: more setup, and risks optimizing against unrealistic input
+  shapes divorced from the real FFI call sites.
+
+---
+
+## Workspace
+
+- **New git worktree** off `rust-migration` (via the `using-git-worktrees` skill).
+- **Its own fresh pixi env** — do **not** symlink `.pixi`. `maturin develop` repoints the shared
+  env's `.pth`/`.so`, so a shared env would corrupt the parent workspace's build
+  (per the `gvl-parallel-worktrees-fresh-pixi-env` note).
+- `cargo asm` (cargo-show-asm) already installed and on PATH (v0.2.61).
+- Release builds via `maturin develop --release`.
+- Add a `[profile.profiling]` to `Cargo.toml` that **inherits `release`** and adds
+  `debug = "line-tables-only"` + `force-frame-pointers = true`, for perf call-graph attribution
+  when flat self-time is ambiguous. Flat self-time on the plain release `.so` (symbols resolve
+  from the symbol table) is the default; the profiling profile is only for `perf report --children`
+  caller attribution. This profile must not change the codegen the gate measures — gate numbers
+  always come from the plain `--release` build.
+
+---
+
+## Procedure
+
+### Step 1 — Fresh baseline + ranked target list (no tuning until this exists)
+
+The last perf profiles predate the T5/6/7 merges, so re-baseline at current HEAD.
+
+For each of the four paths, run the established perf method (per `gvl-profiling-perf-not-pyspy-native`):
+
+```bash
+NUMBA_NUM_THREADS=1 perf record -F 999 -o p.data -- .pixi/envs/dev/bin/python \
+    tests/benchmarks/profiling/profile.py --mode <mode> --n-batches 12000
+perf report --stdio --no-children -i p.data        # flat self-time, Rust symbols resolved
+```
+
+Modes: `tracks`, `haplotypes`, `variants`, `variant-windows` (the four the user named;
+`profile.py --mode` already supports all of `{haplotypes,annotated,tracks,tracks-seqs,variants,variant-windows}`).
+
+Produce **one consolidated table**: rows = Rust kernel symbols, columns = per-path self-time %,
+plus an **aggregate weight** (self-time % summed across the paths a kernel appears in, so shared
+kernels like `intervals_to_tracks` and `shift_and_realign_tracks_sparse` rank by their total
+read-path cost). Record current **rust ÷ numba ratios** per path as the round-3 starting line.
+
+**Expected (to be confirmed, not assumed) targets:** `intervals_to_tracks` and
+`shift_and_realign_tracks_sparse` (shared: tracks + haplotypes), `reconstruct_haplotypes_from_sparse`,
+`rc_flat_rows_inplace`; and the variant-windows trio `tokenize` / `slice_flanks` /
+`assemble_alt_window` (T7 left these as the profile top). Step 1's real profile overrides any
+of these.
+
+### Step 2 — Per-kernel tune loop (highest aggregate weight first)
+
+For each target kernel, in descending aggregate-weight order:
+
+1. **Inspect.** `cargo asm --rust --mca <crate>::<path>::<symbol>` → capture instruction count,
+   llvm-mca cycle/throughput estimate, and the dominant cost (bounds check, redundant
+   slice/copy, missed autovectorization, register spill, etc.).
+2. **Fix.** Safe idioms first (hoist `as_slice_mut`, iterator forms, `assert!` to feed the
+   bound checker, `#[inline]`/codegen hints). Targeted `unsafe` (`get_unchecked` / explicit
+   SIMD) only where the bound is provably safe but the optimizer keeps the check; each `unsafe`
+   carries a `// SAFETY:` comment.
+3. **Confirm asm (evidence).** Re-run `cargo asm` → instruction/cycle drop recorded.
+4. **Confirm throughput (gate).** Re-run the path's throughput harness → ms/batch improvement
+   (or no regression). **If instructions dropped but ms/batch did not improve, revert** — it was
+   a memory/branch-bound kernel and the change adds risk for no win.
+5. **Confirm parity.** Run the kernel's `@pytest.mark.parity` suite → byte-identical on both
+   backends.
+
+### Step 3 — Gate + land
+
+Before merge:
+- Full tree on **both** backends: `pixi run -e dev pytest tests -q` under `GVL_BACKEND` rust and
+  numba (use `--basetemp=$(pwd)/.pytest_tmp` per the HPC `os.link` note).
+- `cargo test` green; lint (`ruff check python/ tests/`), format, `typecheck` clean; abi3 wheel
+  builds.
+- `docs/roadmaps/rust-migration.md` updated: round-3 target table, per-kernel asm deltas, final
+  rust ÷ numba ratios, decisions log entry, and the optimization-targets sequencing note.
+
+---
+
+## Measurement harnesses (per-path, established — do not invent new ones)
+
+| Path | Gate metric | Harness | Why |
+|---|---|---|---|
+| tracks-only | rust ÷ numba **pedantic min** (ms/batch) | `tests/benchmarks/test_e2e.py` (pytest-benchmark, `iterations=10, rounds=50, warmup=5`) | de-noised min is reproducible <1% |
+| haplotypes | rust ÷ numba **pedantic min** (ms/batch) | same | same |
+| variants | rust ÷ numba **wall-clock average** (ms/batch, 2000 batches) | `tests/benchmarks/profiling/profile.py` | `test_e2e_variants` is xfailed (`_FlatVariants.to_fixed` gap) → no pedantic min |
+| variant-windows | rust ÷ numba **wall-clock average** (ms/batch, 2000 batches) | `profile.py` | same xfail; T7 used this harness |
+
+All measurements: corpus `chr22_geuv.gvl` (format 2.0, 165 regions × 5 samples, 82 neg / 83 pos
+strand), `with_len(16384)`, `BATCH=32`, `NUMBA_NUM_THREADS=1`, `maturin develop --release`,
+Carter HPC (AMD EPYC 7543, linux-64). Report the **ratio**, not absolute batch/s (shared-node
+load varies across sessions — the standing roadmap caveat).
+
+---
+
+## Parity contract (unchanged)
+
+Byte-identical rust vs numba on both backends, via the existing `@pytest.mark.parity` hypothesis
+suites + the spy-guarded dataset backstops. The two documented numba-bug sub-domains stay excluded
+exactly as today (the #242-family `intervals_to_tracks` start<query clip and the reconstruct
+trailing-under-write overshoot) — this round must not touch those exclusions. Any new `unsafe`
+must produce output byte-identical to the safe path it replaces; the parity suite is the proof.
+
+---
+
+## Risks & stop rules
+
+1. **Instruction count ≠ wall-clock.** Throughput is the gate precisely to catch this; revert
+   instruction wins that don't move ms/batch (Step 2.4).
+2. **Diminishing returns.** Stop tuning a kernel when a round yields < ~5% throughput on its path.
+3. **Hard floors.** The cheapest path (tracks-only, ~1 ms/batch) is partly FFI fixed-cost- and
+   memory-bound; there is a floor below which instruction tuning does nothing. Record honestly;
+   do not force a win that isn't there.
+4. **`unsafe` risk** is contained by the byte-identical parity gate on both backends; no `unsafe`
+   lands without a `// SAFETY:` comment and a passing parity suite.
+5. **Profiling-profile codegen drift.** Gate numbers come only from the plain `--release` build;
+   the `[profile.profiling]` build is for perf attribution and is never the measured artifact.
+
+---
+
+## Deliverables
+
+- New worktree on a `opt/round3-*` branch off `rust-migration`, fresh pixi env.
+- `[profile.profiling]` added to `Cargo.toml`.
+- Step-1 consolidated profile table (committed under `docs/roadmaps/` or the round-3 roadmap
+  section).
+- Per-kernel tuning commits, each with asm-delta + throughput + parity evidence in the message.
+- Roadmap round-3 section with target table, asm deltas, final ratios, decisions-log entry.
+- Full-tree-green on both backends, cargo test, lint/format/typecheck, abi3 build.
diff --git a/docs/superpowers/specs/2026-06-25-rust-variant-rc-fold-design.md b/docs/superpowers/specs/2026-06-25-rust-variant-rc-fold-design.md
new file mode 100644
index 00000000..7d83975c
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-rust-variant-rc-fold-design.md
@@ -0,0 +1,172 @@
+# Spec: Rust variant-allele reverse-complement (churn-free)
+
+**Date:** 2026-06-25
+**Branch base:** `rust-migration`
+**Roadmap:** completes the deferred variant-RC half of optimization Target 6
+(`docs/roadmaps/rust-migration.md`, §"Optimization targets" #6); the Target-6 note
+said `RaggedVariants` + `_FlatVariants` RC were "targeted in Target 7", but Target 7
+(PR #250) collapsed object churn for *windows* and never folded their RC. This closes
+that loose end.
+
+## Background / corrected premise
+
+- "RC variants" **is** a supported feature: on the read path, negative-strand regions
+  reverse-complement the variant **alleles** (`alt`/`ref` byte strings) whenever
+  `view.rc_neg` is set. `_FlatVariants.reverse_masked` / `RaggedVariants.rc_` /
+  `_FlatAlleles.reverse_masked` implement it.
+- It is **already numba-free**: those methods call seqpro-core's Rust
+  `reverse_complement_masked`. The `_rag_variants.rc_helper-*.nbc` files in `__pycache__`
+  are **stale** numba caches from an older version — no live `rc_helper` exists.
+- `_FlatVariantWindows` (the Target-7 `assemble_variant_buffers` output) is **never**
+  reverse-complemented — `reverse_complement_ragged` returns it unchanged
+  ("reference-oriented"). So the windows path needs nothing here.
+
+## Problem
+
+The RC runs as a Python **post-pass** (`_query.py` → `reverse_complement_ragged` →
+`reverse_masked`/`rc_`) whose inner implementation rebuilds layered ragged objects per
+batch — `to_chars().to_packed()`, `Ragged.from_offsets(...)` view + rebuild, `np.repeat`
+mask expansion — purely to hand contiguous byte buffers to seqpro. The byte buffers in
+`_FlatAlleles` are **already** plain `uint8` data + `int64` offset arrays; the object
+churn is pure overhead.
+
+## Goal
+
+Replace the seqpro call + per-batch object churn with a thin gvl-owned Rust kernel that
+reverse-complements the masked alleles **in place on the raw `_FlatAlleles` buffers**,
+reusing the Target-6 primitives. Keep the existing seqpro path as the dispatch
+**reference** backend (retained for byte-identical parity + perf gating; deleted in
+Phase 5, **not now** — `rust-migration` is not ready to merge and numba/reference
+backends must stay for performance comparison).
+
+Non-goals: no on-disk format change; no change to `_FlatVariantWindows` (still not RC'd);
+no change to flank-token handling (the current post-pass RCs only `alt`/`ref`, never
+`flank_tokens` — preserve exactly).
+
+## Placement decision (settled)
+
+RC is a **dedicated Rust call applied after dummy-fill**, at the same point in the
+pipeline as today's seqpro pass — *not* folded inside `assemble_variant_buffers`.
+
+```
+assemble_variant_buffers (unchanged, no to_rc)
+  -> _FlatVariants
+  -> fill_empty_groups (dummy)             # unchanged
+  -> rc_alleles_inplace(byte_data, seq_offsets, var_offsets, to_rc_row)   # NEW, rust
+```
+
+Rationale: preserves the exact `assemble → fill → RC` ordering, so dummy-filled alleles
+(including a **custom** non-palindromic `DummyVariant.alt`, e.g. `b"AC"`) are RC'd
+identically to today. The default `DummyVariant.alt`/`.ref` is `b"N"` (RC-invariant), but
+custom dummies are reachable, so ordering parity matters. The one extra FFI crossing is on
+already-contiguous buffers (negligible vs. the deleted Python allocation churn). Folding
+into `assemble_variant_buffers` would put RC *before* fill and require a mask-aware
+`fill_empty_groups` to RC the dummy allele — more moving parts for no measurable gain.
+
+## Design
+
+### 1. Rust kernel (`src/variants/` + `src/ffi/`)
+
+Core (pure, in e.g. `src/variants/mod.rs` or `windows.rs` neighborhood), reusing
+`crate::reverse::{rc_flat_rows_inplace, COMP}`:
+
+```rust
+/// Reverse-complement the alleles of mask-selected (b*p) rows, in place.
+/// `byte_data`        contiguous allele bytes (uint8)
+/// `seq_offsets`      per-allele byte boundaries (len n_alleles + 1)
+/// `var_offsets`      per-(b*p)-row allele boundaries (len n_rows + 1)
+/// `to_rc_row`        per-(b*p)-row bool mask (len n_rows)
+pub fn rc_alleles_inplace(
+    byte_data: &mut [u8],
+    seq_offsets: ArrayView1<i64>,
+    var_offsets: ArrayView1<i64>,
+    to_rc_row: ArrayView1<bool>,
+)
+```
+
+Implementation: for each row `g` with `to_rc_row[g]`, the alleles `a` in
+`var_offsets[g]..var_offsets[g+1]` are RC'd — i.e. build the per-allele mask from the row
+mask + `var_offsets` and delegate to `rc_flat_rows_inplace(byte_data, seq_offsets,
+per_allele_mask)`. (Equivalent to today's `np.repeat(per_bp, np.diff(var_offsets))`
+expansion, done in Rust.)
+
+FFI wrapper `rc_alleles` in `src/ffi/mod.rs`: takes a `PyReadwriteArray1<u8>` (mutated in
+place) + the three views; registered in `lib.rs`. Mirrors the in-place convention of the
+other read-path kernels.
+
+### 2. Dispatch registration
+
+Register `rc_alleles` in `_dispatch`:
+- **rust**: the new FFI kernel above.
+- **numba** (reference): the existing seqpro-`reverse_complement_masked` implementation,
+  extracted into a small function so it can be the registered reference.
+
+`GVL_BACKEND=numba` therefore keeps variant RC on the seqpro reference (clean perf gating:
+a numba-backend read does not smuggle in the new rust RC). `GVL_BACKEND` unset ⇒ rust.
+
+### 3. Python call sites
+
+- `_FlatAlleles.reverse_masked` (`_flat_variants.py`): replace the
+  `Ragged.from_offsets(...) + reverse_complement_masked(...)` body with
+  `get("rc_alleles")(self.byte_data, self.seq_offsets, self.var_offsets, per_bp_mask)`,
+  where `per_bp_mask = np.repeat(mask, self.ploidy)` (same broadcast as today). Operates in
+  place on `byte_data`; returns `self`.
+- `RaggedVariants.rc_` (`_rag_variants.py`): keep the existing buffer extraction
+  (`to_chars().to_packed()` is needed to *reach* the contiguous char buffer + offsets) but
+  replace the inner `_sp_reverse_complement(view, _COMP, mask=allele_mask)` call with
+  `get("rc_alleles")(data, char_off, var_off, to_rc_row)`. (This path is the cold
+  non-flat route; the hot flat read path goes through `_FlatAlleles.reverse_masked`.)
+- Both keep the early-out when the mask is all-False.
+
+### 4. `_query.py`
+
+- **Unspliced post-pass: unchanged in structure.** It already routes variant kinds through
+  `reverse_complement_ragged` on both backends; backend choice now happens *inside*
+  `reverse_masked`/`rc_` via the `rc_alleles` dispatch. No backend-split edits needed here.
+- **Remove the dead spliced variant guard** in `_getitem_spliced`: spliced variants are
+  rejected upstream (`__call__` raises `NotImplementedError` for spliced variant/
+  variant-windows kinds), so the `_VARIANT_TYPES_S` branch is unreachable. Delete it.
+
+## Parity & testing
+
+Byte-identical differential testing is the standing migration contract; the reference here
+is the existing seqpro implementation.
+
+1. **Rust unit tests** (`#[cfg(test)]`): `rc_alleles_inplace` on multi-row, multi-allele
+   buffers — masked vs unmasked rows, empty rows, odd-length + `N` alleles, all-False mask
+   no-op. (Mirrors the `reverse.rs` test style.)
+2. **Kernel parity** (`tests/parity/`, hypothesis): `rc_alleles` rust vs reference,
+   byte-identical, over property-generated `(byte_data, seq_offsets, var_offsets, mask)`
+   for both the `_FlatAlleles` layout and the `RaggedVariants.rc_` char-buffer layout.
+3. **Dummy-fill + custom-allele edge cases** (locks the ordering risk): a neg-strand query
+   with empty `(region, sample, ploid)` groups, run with **(a)** the default `b"N"` dummy
+   and **(b)** a custom non-palindromic dummy (`alt=b"AC"`, `ref=...`), asserting rust ==
+   reference end-to-end. This is the case that would diverge under an in-kernel
+   (pre-fill) fold.
+4. **Live-path spy** (`tests/parity/test_dataset_parity.py` precedent): open a variants
+   dataset with negative-strand regions, index it, assert the `rc_alleles` kernel is
+   actually invoked and the result is byte-identical to the numba/reference backend.
+
+Full-tree gate before close: `pixi run -e dev pytest tests -q` on **both** backends,
+`cargo test`, lint/format/typecheck, abi3 wheel build. Update
+`docs/roadmaps/rust-migration.md` (tick the Target-6 variant-RC follow-up; record that the
+deferred `RaggedVariants`/`_FlatVariants` RC now runs on a gvl rust kernel, reference
+retained).
+
+## Files touched
+
+- `src/variants/...` — `rc_alleles_inplace` core + tests
+- `src/ffi/mod.rs`, `src/lib.rs` — `rc_alleles` pyfunction + registration
+- `python/genvarloader/_dataset/_flat_variants.py` — `_FlatAlleles.reverse_masked`
+- `python/genvarloader/_dataset/_rag_variants.py` — `RaggedVariants.rc_`
+- `python/genvarloader/_dataset/_query.py` — remove dead spliced variant guard
+- `python/genvarloader/_dispatch.py` (or the per-module registration site) — register
+  `rc_alleles`
+- `tests/parity/...`, `tests/dataset/...` — parity + edge-case + spy tests
+- `docs/roadmaps/rust-migration.md` — status update
+
+## Out of scope
+
+- Assembly / instruction-count micro-optimization (owned separately, in parallel).
+- Deleting the seqpro reference path (Phase 5).
+- Any change to `_FlatVariantWindows` RC behavior (remains a no-op).
diff --git a/docs/superpowers/specs/2026-06-25-target-5-tracks-intervals-slice-design.md b/docs/superpowers/specs/2026-06-25-target-5-tracks-intervals-slice-design.md
new file mode 100644
index 00000000..6fb5e3fa
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-target-5-tracks-intervals-slice-design.md
@@ -0,0 +1,126 @@
+# Target 5 — tracks-only ndarray slicing optimization
+
+**Date:** 2026-06-25
+**Workstream:** Phase 5, optimization round 2, Target 5 (rust-only, byte-identical).
+**Branch:** `opt/target-5-intervals-slice` off `rust-migration`.
+**Roadmap:** `docs/roadmaps/rust-migration.md` — Phase 5 ⬜, "Optimization targets — round 2".
+**Handoff:** `docs/handoffs/2026-06-25-phase5-getitem-optimization.md` (Target 5 section).
+
+## Problem
+
+`intervals_to_tracks` (`src/intervals.rs`) is the kernel behind the cheapest read
+path (tracks-only, ~1.1–1.7 ms/batch). On that path Rust runs at **0.63× numba**
+— the single read path where Rust is clearly slower. `perf` flat self-time
+attributes ~20.5% of the kernel to ndarray slice machinery:
+`ndarray::slice_mut` (11%) + `ndarray::do_slice` (9.5%), all from constructing a
+`SliceInfo` per painted interval in:
+
+```rust
+out.slice_mut(ndarray::s![a..b]).fill(value);
+```
+
+numba compiles the equivalent `out[a:b] = value` to a direct memset and pays none
+of this. Because tracks-only does no sequence work, this fixed per-interval cost
+dominates with nothing to amortize it against.
+
+## Goal
+
+Close the deficit so Rust is **≥ 1.0× numba** on tracks-only, while keeping the
+output **byte-identical** to the numba oracle. The kernel is shared by the
+combined **tracks** (seqs + read-depth) path, which improves with it.
+
+## Scope
+
+- **In:** `src/intervals.rs` — the `intervals_to_tracks` body, and (only if the
+  perf fallback lands) one added cargo test.
+- **Out:** No Python changes. No FFI-signature changes. No oracle change. No
+  changes to `out.fill(0.0)` semantics. No overlap with Targets 6/7 (they touch
+  `intervals.rs` too, but Target 5 merges first and they rebase onto it).
+
+## Design
+
+The `out` buffer is freshly allocated and contiguous, so we can address it as a
+raw `&mut [f32]` and drop the per-interval `SliceInfo`.
+
+1. **Hoist the slice once**, at the top of the function, after the zero prelude:
+   ```rust
+   let out_slice = out.as_slice_mut().unwrap();
+   ```
+   `.unwrap()` is intentional: a non-contiguous `out` is an invariant violation,
+   not a recoverable case, and should fail loud.
+
+2. **Zero prelude on the raw slice:**
+   ```rust
+   out_slice.fill(0.0);
+   ```
+   **Keep the zero prelude.** tracks-only depends on it — gaps between intervals
+   must read 0. This is unlike the fully-overwritten sequence buffers whose
+   zero-init was skipped in commit `1b3e355`; that optimization does not apply
+   here.
+
+3. **Per-interval write on the raw slice** (default, safe form):
+   ```rust
+   let a = out_s + s as usize;
+   let b = out_s + e as usize;
+   out_slice[a..b].fill(value);
+   ```
+   This keeps a single range bounds-check but removes `SliceInfo` construction —
+   the proven cost.
+
+All surrounding arithmetic and control flow is **unchanged**:
+- `start = itv_starts[i] - query_start`, `end = itv_ends[i] - query_start` in i64.
+- `break` when `start >= length` (intervals sorted by start).
+- `s = start.max(0)`, `e = end.min(length)`; write only when `e > s`.
+- Per-query `itv_s == itv_e` → skip (out slice stays 0).
+
+## Parity
+
+Byte-identical by construction — same arithmetic, same write order, same values,
+only a different way to address the contiguous buffer.
+
+Gates (all must stay green):
+- `pixi run -e dev cargo-test` — the 8 existing unit tests in `src/intervals.rs`
+  pin the full contract (basic paint, empty intervals, end-clamp, break-on-
+  start≥length, the three #242 jitter cases, multi-query disjoint). Refactor
+  **under** them, untouched.
+- `pixi run -e dev pytest tests/parity -q` (rust default) **and**
+  `GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q` (oracle) — including
+  the `intervals_to_tracks` hypothesis parity gate and the tracks dataset
+  backstop that proves the kernel runs on the live `__getitem__` path.
+
+No new test is required for the safe form (no new behavior). A SAFETY-proof test
+is added **only if** the unsafe fallback (below) is needed.
+
+## Perf gate and fallback
+
+Build release first: `pixi run -e dev maturin develop --release`. Re-measure
+tracks-only via `tests/benchmarks/test_e2e.py` — `_bench_indexing` uses
+`benchmark.pedantic(iterations=10, rounds=50)`; compare the **min** rust ÷ min
+numba (cleanest CPU-bound estimate), with `NUMBA_NUM_THREADS=1`.
+
+- **≥ 1.0×** → done. Record the ratio in the roadmap round-2 re-measurement block.
+- **< 1.0×** → escalate the inner write to elide the bounds-check:
+  ```rust
+  // SAFETY: a = out_s + s, b = out_s + e with 0 <= s <= e <= length and
+  // out_s + length == out_e <= out_slice.len() (out_offsets is a valid CSR
+  // layout over out_slice), so a..b is in bounds.
+  unsafe { out_slice.get_unchecked_mut(a..b).fill(value); }
+  ```
+  Add one cargo test asserting the bounds invariant the SAFETY comment relies on,
+  re-measure, then record.
+
+The expected outcome is that the safe form clears the gate (the `SliceInfo`
+construction, not the bounds-check, was the dominant cost); the unsafe form is a
+contingency, not the plan.
+
+## Definition of done
+
+1. Refactored `intervals_to_tracks`, all existing cargo tests green untouched.
+2. `cargo-test` + `pytest tests/parity` on **both** backends green.
+3. Full tree on both backends (`pixi run -e dev pytest tests -q`, then
+   `GVL_BACKEND=numba …`) — scoped runs skip `tests/unit/`.
+4. `ruff check python/ tests/` + `ruff format python/ tests/` + `typecheck`
+   clean (no Python changes expected, but run them).
+5. tracks-only re-measured ≥ 1.0×; ratio recorded in
+   `docs/roadmaps/rust-migration.md` with Target 5 ticked and the PR link set.
+6. Parity-gated PR opened from `opt/target-5-intervals-slice`.
diff --git a/docs/superpowers/specs/2026-06-25-target6-kernel-rc-design.md b/docs/superpowers/specs/2026-06-25-target6-kernel-rc-design.md
new file mode 100644
index 00000000..16d414ef
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-target6-kernel-rc-design.md
@@ -0,0 +1,201 @@
+# Design — Target 6: fold strand reverse-complement into the Rust read-path kernels
+
+**Date:** 2026-06-25
+**Workstream:** Phase 5, Target 6 (rust-migration roadmap, round-2 optimization block)
+**Branch:** `opt/target-6-kernel-rc` off `zero-copy-scale-safe-readpath`
+**Handoff:** `docs/handoffs/2026-06-25-phase5-getitem-optimization.md` (Target 6 section)
+
+## Goal
+
+Delete the per-batch reverse-complement (RC) post-pass on the read path by emitting
+negative-strand regions already reverse-complemented from the Rust kernels. This is the
+largest single-thread throughput lever left before rayon, and it is **backend-agnostic**
+(numba pays the same cost), so it must land before rayon batch parallelism.
+
+## Corrected cost model (why this design, not the handoff's literal framing)
+
+The handoff calls the RC cost a "numpy post-pass." The code shows otherwise: RC today runs
+through seqpro's **compiled** flat kernels (`_reverse_rows_masked` /
+`reverse_complement_masked` via `_query.py::reverse_complement_ragged` and
+`_flat.py::_Flat.reverse_masked`), not a Python loop. Both backends call the *same* RC code
+*after* reconstruction, which is exactly why numba shows the same ~19% self-time on
+haplotypes.
+
+Therefore the cost is **the second full-batch traversal of the output buffer** (re-read +
+complement + numpy re-wrap), **not** an FFI crossing unique to rust. This rules out a
+"rewrite the post-pass in Rust but keep it batch-wide" approach — it would re-read the same
+cold buffer and barely move the number.
+
+The chosen approach removes the **cold, batch-wide** traversal: RC each negative-strand
+query's slice **in-place, immediately after that query is written, inside the existing
+per-query kernel loop**, while the slice is still hot in L1/L2. A second hot pass over a
+~16 KB slice is near-noise next to reconstruction; today's cost is high precisely because
+the pass is cold, whole-batch, and materialized through numpy.
+
+### Approach considered and rejected
+
+- **A — fold the reversed write into the reconstruct core** (emit bytes already RC'd, no
+  second pass at all). Rejected: maximum single-thread perf, but RC logic entangles with
+  indel + insertion-fill + trailing-fill in the hottest kernels, is bespoke per output kind,
+  and the annotated/splice cases make a subtle parity break likely. Its only gain over the
+  chosen approach is eliminating one *hot* pass — not worth the risk. Revisit only if the
+  chosen approach's measured ratio still lags numba.
+- **C — Rust post-pass called from Python** (replace `reverse_complement_ragged` with one
+  Rust pyfunction over the returned flat buffers). Rejected: keeps the exact cold,
+  batch-wide traversal; captures neither the cache-locality win nor a meaningful dispatch
+  win, since RC is not an extra rust FFI crossing today.
+
+## Scope
+
+In scope — five flat-buffer output kinds, all sharing the in-place primitives:
+
+| Kind | Buffers | RC behavior |
+|---|---|---|
+| haplotypes (S1) | `out_data: u8` | reverse + complement |
+| reference (S1) | `out_data: u8` | reverse + complement |
+| tracks (f32) | `out_data: f32` | reverse only (no complement) |
+| annotated | `haps: u8`, `var_idxs: i32`, `ref_coords: i32/i64` | haps reverse+complement; both index arrays reverse-only; all three in lockstep per query |
+| splice (haps / ref / tracks) | permuted element buffer | same primitive per spliced **element**, using permuted offsets + permuted per-element mask |
+
+Out of scope:
+
+- **`RaggedVariants` (`variants` mode) RC — deferred to Target 7.** Its RC is structurally
+  different (reverse allele order within each row **and** complement allele bytes over the
+  nested ragged allele structure, `RaggedVariants.rc_`) and lives in the `src/variants/`
+  gather path that Target 7 is concurrently rewriting. Target 6 leaves a slimmed
+  `reverse_complement_ragged` husk handling only this case; Target 7 absorbs it and deletes
+  the husk.
+- **`variant-windows` and `intervals`** — reference-oriented, RC is a no-op today and stays a
+  no-op.
+
+## Components — Rust primitives
+
+A new small module (`src/reverse.rs`) with two generic in-place primitives, each over a flat
+`(data, offsets)` buffer + a per-row `to_rc` mask:
+
+1. `reverse_flat_rows_inplace<T: Copy>(data: &mut [T], offsets, to_rc)` — reverses element
+   order within each masked row. Order only, no complement. Generic over element width
+   (`u8`, `f32`, `i32`, `i64`).
+2. `rc_flat_rows_inplace(data: &mut [u8], offsets, to_rc)` — reverses **and** complements
+   bytes via a 256-entry `_COMP` LUT.
+
+**`_COMP` LUT contract:** reproduce `bytes.maketrans(b"ACGT", b"TGCA")`
+(`python/genvarloader/_ragged.py:330`) exactly — a `[u8; 256]` that is **identity for
+everything** except `A↔T` and `C↔G` (uppercase only). `N`, IUPAC codes, and lowercase
+`a/c/g/t` are pass-through (identity), matching today's behavior byte-for-byte.
+
+Output-kind → primitive mapping:
+
+- haplotypes, reference → `rc_flat_rows_inplace`
+- tracks → `reverse_flat_rows_inplace::<f32>`
+- annotated → `rc_flat_rows_inplace` on `haps`; `reverse_flat_rows_inplace` on `var_idxs`
+  and `ref_coords`; applied in lockstep per query.
+- splice → the relevant primitive per spliced element.
+
+## Mask threading & per-kernel integration
+
+The `to_rc` mask is **computed in Python and passed into each kernel** as a new
+`Option<PyReadonlyArray1<bool>>` argument. Rationale: the strand→mask logic and (critically)
+the splice permutation logic already exist and are tested; reproducing the permutation in
+Rust would be gratuitous risk.
+
+- **Unspliced kernels** (`reconstruct_haplotypes_fused` `src/ffi/mod.rs:393`,
+  `reconstruct_annotated_haplotypes_fused` `:604`, `intervals_and_realign_track_fused`
+  `:848`, `get_reference` `:728`): Python passes `to_rc = full_regions[r_idx, 3] == -1`
+  (one bool per query). The kernel applies the primitive to query `k`'s just-written slice
+  when `to_rc[k]`.
+- **Spliced kernels** (`reconstruct_haplotypes_spliced_fused` `:521`, the spliced-reference
+  fetch `_fetch_spliced_ref` / reference core): Python passes the **already-permuted
+  per-element** mask — the existing `to_rc_per_elem` (`_query.py:259-280`) / `to_rc_perm`
+  (`_reference.py:438-444`) computation moves from post-pass input to kernel input,
+  unchanged. The spliced kernel's loop is already per-element over permuted `out_offsets`,
+  so the primitive applies per element with no new boundary math. **Assert** the element
+  boundaries being RC'd match `plan.group_offsets` (handoff warning).
+
+**`Option` keeps the fast path trivially byte-identical:** when `rc_neg` is off or no
+negative-strand region is selected (`to_rc.any() == false`), Python passes `None` and the
+kernel does zero extra work. All-positive datasets are provably unchanged; existing fixtures
+and the scale guard cannot regress.
+
+**Insertion-fill / trailing-fill ordering preserved for free:** RC runs *after* a query's
+full forward write (fills already placed), so it sees the exact final post-fill bytes the
+current post-pass sees. No interleaving with fill logic.
+
+**Rust files touched:** `src/ffi/mod.rs` (6 kernel signatures + call sites), the
+reconstruct/track/reference cores under `src/{reconstruct,tracks,intervals,reference}/`, and
+the new `src/reverse.rs` (with cargo unit tests).
+
+## Python-side changes & deletion plan
+
+- **`_query.py::_getitem_unspliced`** (`:188-190`): delete the
+  `reverse_complement_ragged` post-pass; compute `to_rc` and thread it through
+  `view.recon(...)` into the kernels. Only the deferred `RaggedVariants` case still routes
+  through the husk.
+- **`_query.py::_getitem_spliced`** (`:259-280`): keep the permuted `to_rc_per_elem`
+  computation, but hand its result to the kernel via the splice plan / recon call instead of
+  to `reverse_complement_ragged`.
+- **`_query.py::reverse_complement_ragged`** (`:374-410`): shrink to the **husk** — only the
+  `RaggedVariants` branch survives (`return rag.rc_(to_rc)`); delete the `_Flat`,
+  `_FlatAnnotatedHaps`, and no-op branches. Add `# TODO(target-7)` noting Target 7 absorbs
+  and deletes it.
+- **`_reference.py`** (`:438-444`): delete the spliced-reference
+  `per_elem.reverse_masked(to_rc_perm, comp=_COMP)` post-pass; thread `to_rc_perm` into
+  `_fetch_spliced_ref` / the reference kernel. (Third RC site, missed by the handoff, now
+  in-scope.)
+- **Reconstructors** (`Haps`, `Ref`, `Tracks`, `HapsTracks`, `SeqsTracks`, annotated) gain a
+  `to_rc` parameter on their recon entry that they forward to the FFI kernel. Exact signature
+  confirmed when reading `_reconstruct.py`; principle: mask flows region-compute → recon →
+  kernel, and the only Python RC left anywhere is the variants husk.
+- **No stray callers:** `grep -rn reverse_complement_ragged python/` and
+  `grep -rn reverse_masked python/` confirm nothing else depends on the deleted paths.
+
+## Parity, tests & perf gate
+
+**Primary risk: vacuous parity pass.** Default fixtures use `max_jitter=0` and may be
+all-positive-strand, so RC code could never fire and parity would pass trivially. Guards:
+
+- **New strand=−1 fixtures** in `tests/parity/test_dataset_parity.py`: datasets mixing `+`
+  and `−` regions, covering every in-scope kind (haplotypes, reference, tracks, annotated)
+  and the spliced variant of each. Reuse the kernel-spy backstop to prove RC executes on the
+  live `__getitem__` path.
+- **Non-vacuity assertion:** for a `−`-strand region, assert output bytes ≠ the `+`-strand
+  orientation (RC genuinely fired), and assert exact RC'd bytes for a known fixture.
+- **Rust unit tests** (`src/reverse.rs`): empty rows, single byte, odd/even lengths,
+  `to_rc` all-false (no-op) / all-true / mixed; LUT identity on `N`/lowercase/IUPAC; `f32`
+  reverse-only; lockstep reversal of the three annotated buffers.
+
+**Parity gate (byte-identical vs current post-pass), both backends:**
+
+```bash
+pixi run -e dev cargo-test
+pixi run -e dev pytest tests/parity -q                       # rust default
+GVL_BACKEND=numba pixi run -e dev pytest tests/parity -q      # oracle
+```
+
+**TDD order:** reference (simplest, no fill) → haplotypes → tracks (reverse-only) →
+annotated → **splice last**. Land each kind behind parity before deleting its Python
+post-pass branch. Variants deferred.
+
+**Before push:** full tree both backends (`pixi run -e dev pytest tests -q`, then
+`GVL_BACKEND=numba …`) to catch `tests/unit/` references to deleted code; lint/format/
+typecheck on `python/ tests/`.
+
+**Perf gate:** re-measure `haplotypes`, `tracks-only`, `tracks-seqs`, `annotated` via the
+de-noised `tests/benchmarks/test_e2e.py` harness (min over `pedantic(iterations=10,
+rounds=50)`, release build). Expect the RC self-time gone from `perf` flat profiles and the
+rust÷numba ratios up (haplotypes was 0.94× with RC its biggest sink at ~19% self). Record
+re-measured ratios in `docs/roadmaps/rust-migration.md` under the Phase 5 round-2 block,
+tick Target 6, set the PR link, and set the marker that Target 6 must merge before rayon.
+
+**HPC gotcha:** run pytest with `--basetemp=$(pwd)/.pytest_tmp` so the write path's `os.link`
+hardlink does not fail cross-device (Errno 18). Work in a dedicated git worktree.
+
+## Coordination with parallel workstreams
+
+- **Target 7** (variants/windows assembly): owns the deferred `RaggedVariants.rc_` port and
+  the `reverse_complement_ragged` husk deletion. Overlaps Target 6 in `src/ffi/mod.rs`
+  (additive — new pyfunction args vs new pyfunctions, low conflict).
+- **Target 5** (intervals slicing): overlaps `src/intervals.rs`; merge order is 5 first, then
+  6/7. Rebase Target 6 onto 5 if 5 lands first.
+- **Rayon** is blocked until 5 + 6 + 7 are on the base branch. The in-loop, per-query RC of
+  this design parallelizes cleanly (disjoint per-query slices).
diff --git a/docs/superpowers/specs/2026-06-25-target7-variant-windows-rust-assembly-design.md b/docs/superpowers/specs/2026-06-25-target7-variant-windows-rust-assembly-design.md
new file mode 100644
index 00000000..745e730a
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-target7-variant-windows-rust-assembly-design.md
@@ -0,0 +1,162 @@
+# Design: Target 7 — variant-windows/variants assembly in one Rust call
+
+**Date:** 2026-06-25
+**Branch:** `opt/target-7-windows-rust-assembly` off `zero-copy-scale-safe-readpath`
+**Roadmap:** `docs/roadmaps/rust-migration.md` — Phase 5 round-2 target 7 (⬜)
+**Handoff:** `docs/handoffs/2026-06-25-phase5-getitem-optimization.md`
+
+## Problem
+
+The `variant-windows` (and `variants`) flat-output read path is **Python-overhead / GC-bound,
+not kernel-bound**. `perf` flat self-time on `profile.py --mode variant-windows` shows no dominant
+Rust kernel; the cost is the interpreter + allocator: `_PyEval_EvalFrameDefault` ~8.5%, GC
+(`gc_collect_main` + `deduce_unreachable` + `visit_reachable` + `dict_traverse`) **~14% combined**,
+dict/attr lookups, and ctypes/cffi dynamic-symbol lookup ~2.3%.
+
+The source is the per-batch object graph the assembly tail allocates: a `Ragged` from
+`reference.fetch`, numpy LUT-gather temporaries (`lut[bytes]`), `np.concatenate`/`reshape`
+temporaries, and wrapper dataclasses (`_FlatWindow` / `_FlatAlleles` / `_FlatVariants` /
+`_FlatVariantWindows` / scalar `_Flat`). The fix is to collapse the **ragged byte/token assembly**
+into **one Rust call** that returns the final flat `(data, offsets)` buffers, so Python builds the
+wrapper objects once and the numpy temporaries disappear.
+
+This is the windows half of the deferred Phase-5 single-big-kernel rewrite.
+
+## Decisions (locked during brainstorming)
+
+1. **Scope:** cover **all** of `variants` + `variant-windows` (alleles, windows, bare alleles, the
+   `flank_tokens` ride-along) — the full collapse, not windows-only.
+2. **Fetch boundary:** the Rust call **owns the reference fetch** internally (the reference is a
+   contiguous `u8` buffer + `i64` contig offsets — the same inputs `get_reference` already takes),
+   removing the per-batch `Ragged` allocation and a Python round-trip.
+3. **Granularity:** **one mega-call** (flag-driven) returning a bundle of all requested flat
+   buffers in a single FFI crossing — fewest objects/crossings.
+4. **Front edge:** **assembly tail only.** The mega-call takes already-gathered `v_idxs` /
+   `row_offsets` + dataset-static per-variant arrays and returns all ragged byte/token buffers. The
+   `v_idxs` gather + AF filter + compaction front-end and the cheap, dtype-polymorphic scalar-field
+   gathers stay in Python — this keeps the issue-#231 custom-FORMAT-field numba fallback intact.
+5. **Empty-group fill:** **not** folded into the mega-call. `fill_empty_groups` runs afterward on
+   the wrapped buffers via the existing `fill_empty_seq/scalar/fixed` Rust cores, keeping the
+   offset-consistency logic in one place.
+
+## Architecture
+
+Three layers; only the middle changes.
+
+| Layer | Status | What |
+|---|---|---|
+| **Front-end** | unchanged (Python) | `geno_offset_idx` → `gather_rows` → `v_idxs`/`row_offsets`, AF filter, `compact_keep`, dosage gather, unphased-union fold → compacted `v_idxs`, `row_offsets`, `eff_ploidy` |
+| **Scalar fields** | unchanged (Python) | `arr[v_idxs]` + `_Flat` wrap for start/ilen/dosage/info/custom-FORMAT — cheap fancy-indexing, dtype-polymorphic, #231 fallback preserved |
+| **Ragged byte/token assembly** | **NEW (Rust mega-call)** | one FFI call owning `gather_alleles`, reference fetch, LUT tokenize, flank slice, alt-window assemble, flank-tokens — returns all requested flat `(data, seq_offsets)` buffers in one crossing |
+| **Empty-group fill** | unchanged (Python + existing Rust cores) | `fill_empty_groups` on wrapped buffers, only when `dummy_variant` is set |
+
+Python wraps the returned buffers into `_FlatAlleles` / `_FlatWindow` / `_Flat` **once** and
+assembles `_FlatVariants` / `_FlatVariantWindows`. **No consumer change:** `reshape` / `squeeze` /
+`to_ragged` / `fill_empty_groups` still operate on the same wrapper types; flat output mode returns
+`_FlatVariantWindows` directly as before.
+
+## The mega-call
+
+`assemble_variant_buffers(...)` — Rust pyfunction in `src/variants/windows.rs`, registered in the
+dispatch registry (`python/genvarloader/_dispatch.py`) with `rust` default and `numba` = today's
+Python/numba assembly composed into the same bundle shape (the parity oracle).
+
+### Inputs
+
+- `v_idxs (i32)` — compacted variant indices, length `n_var`.
+- `row_offsets (i64)` — per-`(b*p_eff)`-row variant boundaries, length `b*p_eff + 1`.
+- Dataset-static globals (reuse `Haps.ffi_static` where already cached):
+  - `v_starts (i32)`, `ilens (i32)` — global per-variant arrays (gathered by `v_idxs` inside Rust).
+  - `alt_bytes (u8)` + `alt_off (i64)` — global allele byte buffer + offsets.
+  - `ref_bytes (u8)` + `ref_off (i64)` — global, when ref is requested.
+- `reference (u8)` + `contig_offsets (i64)` + `pad_char` — reference genome (owns the fetch).
+- `v_contigs (i32)` — per-variant contig id (computed in Python via
+  `np.repeat(regions[:,0], eff_ploidy)` then repeat by row counts; precomputed, cheap).
+- `flank_length (i32)`.
+- `token_lut ((256,) u8 | i32)` — `unknown_token` already baked in.
+- **Flag set** describing which outputs to emit and the `ref` / `alt` ∈ {`window`, `allele`, `byte`}
+  modes.
+
+### Internals (small, individually unit-tested Rust cores)
+
+Mirror today's Python/numba helpers:
+- `gather_alleles` — variable-length allele bytestrings for `v_idxs`.
+- `fetch_window` — reuse `get_reference`'s core; `[start-L, end+L)` read with absolute-coordinate
+  OOB padding.
+- `slice_flanks` — `f5` = first `L` bytes, `f3` = last `L` bytes of each window read.
+- `assemble_alt_window` — `flank5 · alt · flank3` per variant.
+- `tokenize` — apply the 256-entry LUT (output dtype = `lut.dtype`).
+
+Preserve the **single fused fetch** for the `ref=window & alt=window` hot path (derive alt-window
+flanks by slicing the one ref read), exactly as `compute_windows` does today. Fetch only when a
+window output is actually requested.
+
+### Returns
+
+A dict keyed by field name → flat buffers:
+- `alt` / `ref` (plain variants): `(byte_data u8, seq_offsets i64)`.
+- `ref_window` / `alt_window` / bare `ref` / bare `alt` (windows): `(token_data lut.dtype, seq_offsets i64)`.
+- `flank_tokens`: `(token_data,)` with fixed inner `2L`, offsets = `row_offsets`.
+
+`var_offsets` equals `row_offsets` unchanged (no fill applied yet), so Python reuses it rather than
+returning a copy. Token dtype follows `lut.dtype` (two monomorphizations: `u8` / `i32`).
+
+## Parity strategy
+
+Byte-identical gate, both backends. The assembly is **not** currently dispatched, so:
+
+1. Register `assemble_variant_buffers` in the dispatch registry with:
+   - `numba` = today's exact Python/numba helpers (`compute_windows`, `compute_ref_window`,
+     `compute_alt_window`, `tokenize_alleles`, `compute_flank_tokens`, `gather_alleles`) composed to
+     return the same bundle shape.
+   - `rust` = the new mega-call.
+2. TDD: pin the current flat `(data, offsets)` bundle as the oracle, build Rust under it.
+3. The dataset backstop (`tests/parity/test_dataset_parity.py`) spies on the kernel to prove it runs
+   on the live `__getitem__` path (no vacuous pass).
+
+Reproduce exactly:
+- `ends = starts - min(ilens, 0) + 1`.
+- absolute-coordinate OOB padding with `pad_char`.
+- `flank5 · alt · flank3` byte order.
+- `[flank5 | flank3]` variant-major `2L` layout for `flank_tokens`.
+- LUT mapping incl. `unknown_token` and `N` / out-of-alphabet bytes.
+
+**Pre-existing xfail:** `test_e2e_variants` xfails today (`_FlatVariants.to_fixed` missing). Confirm
+it xfails identically at base before starting; it is **not** a regression introduced here.
+
+## Testing & perf gate
+
+- Rust unit tests on each core (`gather_alleles`, `slice_flanks`, `assemble_alt_window`, `tokenize`,
+  fused windows) + the orchestrator.
+- `pixi run -e dev pytest tests/parity tests/unit -q` on both backends
+  (`GVL_BACKEND=numba` too). Add fixtures covering the full `ref`/`alt` ∈ {window, allele} mode
+  matrix, empty groups (dummy-variant fill), and the `flank_tokens` ride-along.
+- `pixi run -e dev cargo-test`.
+- Full tree before push (`pixi run -e dev pytest tests -q`, then `GVL_BACKEND=numba …`) per
+  CLAUDE.md (scoped runs skip `tests/unit/`).
+- Lint/format/typecheck: `ruff check python/ tests/ && ruff format … && typecheck`.
+- Perf: re-measure `variant-windows` and `variants` via `tests/benchmarks/test_e2e.py` (min of
+  `benchmark.pedantic`); expect GC/eval self-time to drop. Record the re-measured ratios in the
+  roadmap, set the Phase-5 target-7 marker + PR link.
+- HPC gotcha: `--basetemp=$(pwd)/.pytest_tmp` so the write path's `os.link` hardlink doesn't fail
+  cross-device (Errno 18).
+
+## Files
+
+- **New:** `src/variants/windows.rs` — the cores + `assemble_variant_buffers` pyfunction. Wire into
+  `src/ffi/mod.rs` (re-export) and `src/lib.rs` (`add_function`).
+- **Rewrite:** `python/genvarloader/_dataset/_flat_variants.py` (`get_variants_flat` assembly tail
+  calls the dispatched mega-call and wraps once) and `python/genvarloader/_dataset/_flat_flanks.py`
+  (helpers retained as the numba oracle behind the registry).
+- **Tests:** `tests/parity/` fixtures (mode matrix + empty + flank), Rust unit tests in
+  `src/variants/windows.rs`.
+- **Roadmap:** tick target 7, record ratios, set PR link.
+
+## Out of scope
+
+- Folding `fill_empty_groups` into the mega-call (kept as a separate post-pass).
+- Folding the `v_idxs` gather / AF filter / compaction / scalar-field gather into Rust (front edge =
+  assembly tail only; preserves #231 dtype-polymorphic fallback).
+- Strand reverse-complement (target 6) and rayon batch parallelism (blocked until 5/6/7 land).
+- Deleting the numba assembly helpers — they remain as registered parity oracles (wholesale numba
+  deletion is a later Phase-5 step, not this workstream).
diff --git a/docs/superpowers/specs/2026-06-25-zero-copy-scale-safe-readpath-design.md b/docs/superpowers/specs/2026-06-25-zero-copy-scale-safe-readpath-design.md
new file mode 100644
index 00000000..31188196
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-25-zero-copy-scale-safe-readpath-design.md
@@ -0,0 +1,137 @@
+# Zero-copy, scale-safe Rust read path (gvl format 2.0) — Design
+
+**Status:** approved design, ready for implementation planning
+**Date:** 2026-06-25
+**Author:** brainstormed with the maintainer (david@standardmodel.bio)
+**Related:** `docs/roadmaps/rust-migration.md` (Phase 3 throughput → optimization targets); memory `rust-memmap-ascontiguous-scalability`.
+
+## Problem
+
+The rust read path materializes **per-sample-scale memmapped arrays into RAM on every `ds[r, s]`**, which OOMs at gvl's >1M-sample design target. Confirmed via py-spy (`--native`, 43k samples: the hottest self-time leaf is numpy's `_aligned_strided_to_contig_size4` at ~20%) plus a per-batch copy trace (monkeypatched `np.ascontiguousarray` over one `ds[r, s]`):
+
+- **The defect (rust-only):** track intervals are stored **array-of-structs** — `INTERVAL_DTYPE = [(start, i4), (end, i4), (value, f4)]`, itemsize 12 (`_ragged.py:26`). So `RaggedIntervals.{starts,ends,values}.data` are **strided field views** (stride 12, non-contiguous). The fused-rust track branch (`_reconstruct.py:241-250`) wraps each in `np.ascontiguousarray(..., i4/f4)`, copying the **entire per-sample-scale interval record store** into RAM every batch (3 × 3.6 MB on the toy corpus; GB-scale → OOM at 1M samples). The **numba** branch (`_reconstruct.py:271-274`) passes the same strided views directly with no copy, so this is a rust-path regression, not a pre-existing cost.
+- **Same footgun, currently benign:** the fused kernels also wrap the full `genotypes.data`/`offsets` memmap in `np.ascontiguousarray`. Today that is a no-op (contiguous `int32`/`int64`) — but any future non-contiguous/mistyped genotype view would silently copy the whole sample-scale store.
+- **Minor, sub-linear:** `variants.start` is stored `int64` and re-cast to `int32` every batch.
+- **Unrelated avoidable work:** the fused kernels `Array1::zeros(total)` output buffers they then fully overwrite (`__memset` ~7.6% with 3 buffers on the annotated path).
+
+## Goal
+
+Eliminate per-batch materialization of per-sample-scale memmaps at the Python→Rust boundary; cache only the truly-static **sub-linear** arrays; skip provably-unnecessary zero-init — all **byte-identical** to current output. One breaking on-disk change (AoS → SoA intervals), gated behind a `format_version` major bump and an explicit migration.
+
+## Global constraints
+
+- **Byte-identical parity is the landing gate.** Every change here is layout/marshalling only; output bytes are unchanged. Verified across `GVL_BACKEND=rust` and `GVL_BACKEND=numba` via the existing `tests/parity` suites.
+- **Public API change is limited and intentional:** add `gvl.migrate` to `python/genvarloader/__init__.py` `__all__`, and bump `DATASET_FORMAT_VERSION` to `2.0.0`. Per `CLAUDE.md`, the new public symbol + changed on-disk format **requires a `skills/genvarloader/SKILL.md` update** (open-a-dataset workflow + the migration note). No other public signatures change.
+- **No new perf gate.** Throughput is recorded, not gated (consistent with the migration roadmap). The hard new gate is the **scale-guard test** (no memmap-materializing copy on the read path).
+- **Commands under pixi:** `pixi run -e dev <task>`; build the ext with `pixi run -e dev maturin develop --release` after Rust changes. Dataset/parity tests need `--basetemp=$(pwd)/.pytest_tmp` (Carter os.link Errno 18). Prefix shell with `rtk`. Lint/format/typecheck scope: `ruff check python/ tests/`, `ruff format python/ tests/`, `pixi run -e dev typecheck`.
+- **Merge style:** merge commit, never squash.
+
+---
+
+## Components
+
+### A. On-disk intervals: AoS → SoA (`format_version` 1.0.0 → 2.0.0)
+
+The single biggest change and the only breaking one.
+
+- **Constant:** `DATASET_FORMAT_VERSION` (`_write.py:44`) → `2.0.0`. Its doc comment already says "Bump MAJOR only when an existing dataset can no longer be read correctly by new code" — this qualifies.
+- **Write** (`_write.py`, the two `dtype=INTERVAL_DTYPE` allocation/serialization sites near `:1091` and `:1325`, plus the per-track writer that emits `intervals/<track>/intervals.npy`): emit **three contiguous arrays** per track instead of one record array:
+  - `intervals/<track>/starts.npy` — `int32`, contiguous
+  - `intervals/<track>/ends.npy` — `int32`, contiguous
+  - `intervals/<track>/values.npy` — `float32`, contiguous
+  - `intervals/<track>/offsets.npy` — **unchanged** (the ragged grouping is identical; only the data layout changes).
+- **Read** (`_tracks.py::_open_intervals`, `:707-722`): memmap the three contiguous arrays directly and build `RaggedIntervals` from them, so `.starts/.ends/.values.data` are C-contiguous memmaps (no field-view stride).
+- `INTERVAL_DTYPE` (`_ragged.py:26`) is **removed from the on-disk format and the read path**. It may remain for (a) one-time in-memory record construction during `gvl.write` (the write path is not the hot per-batch path, so a copy there is harmless) and (b) the migration reader (Component C). The binding requirement is that **`_open_intervals` no longer produces strided field views** — what the writer does in memory before serializing three contiguous files is an implementation detail.
+- New `gvl.write` datasets are born `2.0.0` / SoA.
+- **No Rust-kernel change.** The Rust entries (`intervals_to_tracks`, `intervals_and_realign_track_fused`) already take `itv_starts`/`itv_ends`/`itv_values` as three separate arrays; SoA storage simply makes the arrays Python hands them contiguous.
+
+### B. Version gate on open (new)
+
+The dataset open path does **not** currently validate `format_version` (only `_fasta_cache.py:175 _check_format_version` does, for the FASTA cache). Add the equivalent for datasets:
+
+- A `_check_dataset_format_version(meta, path)` helper invoked where `_open.py` loads `metadata.json` into the `Metadata` model (`format_version` field at `_write.py:72`).
+- `meta.format_version.major < DATASET_FORMAT_VERSION.major` → raise a clear error instructing the user to run `gvl.migrate(path)`.
+- `meta.format_version.major > DATASET_FORMAT_VERSION.major` → raise "dataset written by a newer gvl; upgrade genvarloader".
+- Equal major → proceed.
+- Datasets with `format_version is None` (pre-versioning) are treated as the oldest major → migrate path. The committed test datasets must be brought to 2.0.0 so the suite runs: regenerate the toy fixtures via `pixi run -e dev gen`, and bring the benchmark corpus (`tests/benchmarks/data/chr22_geuv.gvl`, built by `build_realistic.py` rather than `gen`) to 2.0.0 by running the new `gvl.migrate` on it — which also dogfoods the migration. Confirm which committed datasets are `None` vs `1.0.0` during implementation.
+
+### C. `gvl.migrate(path)` — new public API
+
+In-place, streaming, idempotent rewrite of a 1.x AoS dataset to 2.0 SoA.
+
+- **Signature:** `gvl.migrate(path: str | Path) -> None` (added to `__init__.py __all__`). Lives in a new module, e.g. `python/genvarloader/_dataset/_migrate.py`.
+- **Algorithm, per track under `intervals/<track>/`:**
+  1. Open `intervals.npy` as an `INTERVAL_DTYPE` memmap (read-only); stream it in fixed-size record chunks (never load the whole store into RAM).
+  2. Write `starts.npy`, `ends.npy`, `values.npy` by appending each chunk's `["start"]/["end"]/["value"]` fields to the three contiguous output files; `flush`/`fsync` each.
+  3. After **all** tracks' SoA files are written and fsynced, update `metadata.json` `format_version` → `2.0.0` (**last** durable write).
+  4. Then delete each `intervals.npy`.
+- **Idempotency / crash-safety by ordering:** metadata is bumped only after SoA is durable, so an interruption leaves the dataset still-1.x (old `intervals.npy` intact, re-runnable). If interrupted after the metadata bump but before deletion, both layouts coexist harmlessly; a re-run completes the cleanup. `migrate` on an already-2.0 dataset is a no-op (idempotent check on `format_version`).
+- **Disk:** peak extra ≈ one track's interval store (transient), never the whole dataset. Genotypes/regions/reference are untouched.
+- Emit progress logging (per-track, record counts) consistent with the existing writer's logging.
+
+### D. Zero-copy FFI contract + loud boundary guard
+
+Establish one rule for **all per-sample-scale FFI args**: cross zero-copy, or fail loudly — never silently materialize.
+
+- **Drop `np.ascontiguousarray(...)`** on per-sample-scale memmapped args at the call sites:
+  - `_reconstruct.py:241-250` — the SoA interval fields (now contiguous → drop is safe and the copy is gone).
+  - `_reconstruct.py:232-234` and the `_haps.py` fused calls (plain `~789-813`, annotated `~917`, splice `~859`) — `genotypes.data`, `genotypes.offsets` / `_as_starts_stops(...)` inputs derived from them.
+- **Add a shared boundary helper**, e.g. `_ffi_array(arr, dtype, name) -> np.ndarray` in a small util, that asserts `arr.flags["C_CONTIGUOUS"]` and `arr.dtype == dtype` and raises a precise `ValueError` naming the arg if violated (so a future non-contiguous/mistyped per-sample-scale array fails at the call site with an intelligible message instead of a silent GB copy or an opaque PyO3 error). Apply it to the per-sample-scale args in place of the dropped `ascontiguousarray`.
+- Per-batch-sized arrays that are genuinely freshly constructed and may be non-contiguous (e.g. a strided column slice like `regions[:, 1]`, `flat_shifts.reshape(...)`) are **batch-bounded**, not sample-scale; keep coercing those (cheap) — the guard is specifically for the sample-scale memmaps. Document this distinction at the call sites.
+
+### E. RAM-cache the sub-linear static arrays
+
+- Cache, once per reconstructor (lazy, lifetime = the `Haps`/reconstructor object), the typed-contiguous per-variant/reference arrays the kernels consume: chiefly `v_starts` (`variants.start`, `int64`→`int32` recast today); `ilens`, `alt.data`, `alt.offsets`, `reference`, `ref_offsets` are already no-ops but get cached for uniformity and to drop their per-batch `ascontiguousarray` calls.
+- **No memory knob** (YAGNI): these grow only with the variant count (≲ a few billion germline variants even at 1M samples → fits ≥64 GB RAM, per the maintainer's sizing). Per-sample-scale arrays are explicitly **excluded** from caching (Component D governs them).
+- Implementation seam: a cached property / precomputed dataclass field on the reconstructor holding the FFI-ready arrays; computed on first `ds[r, s]` (or at reconstructor construction).
+
+### F. Skip zero-initialization where provably full-write
+
+- Replace `Array1::zeros(total)` with uninitialized allocation in the fused kernels (`src/ffi/mod.rs`): `out_data` in `reconstruct_haplotypes_fused`, `reconstruct_annotated_haplotypes_fused` (+ its `annot_v`/`annot_pos`), `reconstruct_haplotypes_spliced_fused`, and the fused tracks kernel's scratch/output buffer — **only** where the reconstruct/track core writes **every** output position for in-contract inputs.
+- **Safety argument (documented at each site):** out-of-contract inputs (a deletion driving `ref_idx` past the contig end) are **already** undefined and excluded from the parity oracle by the existing overshoot/double-init guards (`tests/parity/test_reconstruct_haplotypes_parity.py`). So uninitialized allocation adds no new observable exposure: in-contract → fully written; out-of-contract → already undefined. Use a safe-Rust uninitialized pattern (e.g. `Array1::uninit` + assume-init only after the full-write, or `Vec::with_capacity` + set_len behind a clearly-documented invariant). Prefer the least-`unsafe` construction that compiles clean under clippy.
+- This is the one component where parity could regress if the full-write invariant is wrong; gate it behind the existing reconstruct/track parity suites on both backends and keep the change isolated (own commit) so it can be reverted independently.
+
+### Out of scope (deferred)
+
+- **Reverse-complement fusion** into the kernel (the strand RC numpy post-pass, ~9% inclusive). Noted by the maintainer for future planning; not part of this spec.
+- The Phase 5 "single big `__getitem__` kernel" rewrite — targets D–F are complementary to it but do not depend on it.
+
+---
+
+## Testing & parity
+
+- **Byte-identical parity (gate):** run `GVL_BACKEND=rust` and `GVL_BACKEND=numba` over `tests/parity` (and the dataset/unit/integration suites) — output unchanged by every component.
+- **New tests:**
+  1. **Migration round-trip:** write a small 1.x AoS dataset (or fixture), run `gvl.migrate`, assert (a) the three SoA files exist and `intervals.npy` is gone, (b) `metadata.json` `format_version == 2.0.0`, (c) `ds[r, s]` is byte-identical to the pre-migration read. Also assert `migrate` is idempotent (second run is a no-op) and re-runnable after a simulated mid-write interruption.
+  2. **Version gate:** opening a 1.x dataset raises with the `gvl.migrate` hint; opening a synthesized "future major" raises the upgrade error.
+  3. **Scale-guard (the hard new gate):** monkeypatch `np.ascontiguousarray` over one `ds[r, s]` (haps, annotated, tracks-only) and assert **zero** copies whose source `.base` is an `np.memmap` — locks the defect closed and prevents regressions. (Mirrors the diagnostic used to find the bug.)
+  4. **FFI guard:** feed a deliberately non-contiguous per-sample-scale array to the boundary helper and assert it raises the precise error (never a silent copy).
+- **Build/CI:** `maturin develop --release`, `cargo test`, `ruff check/format`, `typecheck`, abi3 wheel build. Regenerate committed test datasets to 2.0.0 (`pixi run -e dev gen`) so the suite runs against the new format.
+- **Throughput (recorded, not gated):** re-run `tests/benchmarks/test_e2e.py` on both backends; expect the rust tracks/annotated paths to close further on numba once the per-batch interval copy is gone. Record in the roadmap.
+
+## File-touch map
+
+| File | Change | Component |
+|---|---|---|
+| `python/genvarloader/_dataset/_write.py` | `DATASET_FORMAT_VERSION` → 2.0.0; write SoA `starts/ends/values.npy` per track | A |
+| `python/genvarloader/_ragged.py` | retire `INTERVAL_DTYPE` from read/write (keep for migration only) | A |
+| `python/genvarloader/_dataset/_tracks.py` | `_open_intervals` memmaps three contiguous arrays | A |
+| `python/genvarloader/_dataset/_open.py` | call `_check_dataset_format_version` on load | B |
+| `python/genvarloader/_dataset/_migrate.py` (new) | `migrate()` streaming in-place AoS→SoA | C |
+| `python/genvarloader/__init__.py` | export `migrate` in `__all__` | C |
+| `python/genvarloader/_dataset/_reconstruct.py` | drop `ascontiguousarray` on sample-scale args; apply `_ffi_array` guard | D |
+| `python/genvarloader/_dataset/_haps.py` | same for the fused haps/annotated/splice calls | D |
+| `python/genvarloader/_dataset/_utils.py` (or new util) | `_ffi_array(arr, dtype, name)` boundary helper | D |
+| reconstructor (`_haps.py` / `_reconstruct.py`) | cache FFI-ready sub-linear arrays | E |
+| `src/ffi/mod.rs` | uninitialized output allocation in the four fused kernels | F |
+| `skills/genvarloader/SKILL.md` | document `gvl.migrate` + format 2.0 open behavior | A/C |
+| `tests/parity/`, `tests/unit/`, `tests/integration/` | migration round-trip, version gate, scale-guard, FFI-guard tests | all |
+| `docs/roadmaps/rust-migration.md` | mark targets 1–2 (and the zero-init part of 3) addressed; record throughput | all |
+
+## Risks & mitigations
+
+- **Parity regression from skip-zero-init (F)** — isolate in its own commit; gate on reconstruct/track parity both backends; revertable independently.
+- **Committed test datasets are 1.x** — bring to 2.0.0 as part of the work (toy fixtures via `gen`; benchmark corpus via `gvl.migrate`), else the version gate fails the whole suite. Verify the `gen` task and every committed `.gvl` fixture.
+- **Hidden interval readers** — audit for any consumer of `intervals.npy` / `INTERVAL_DTYPE` beyond `_open_intervals` and the writer (e.g. tooling, `_table.py`) before retiring the AoS read path.
+- **`format_version is None` datasets** — treat as oldest-major (migrate); confirm behavior on a synthesized `None` metadata.
+- **Migration interruption** — ordering (SoA durable → metadata bump → delete AoS) makes it re-runnable; the round-trip test exercises an interrupted-then-resumed run.
diff --git a/docs/superpowers/specs/2026-06-26-rc-alleles-instruction-tuning-design.md b/docs/superpowers/specs/2026-06-26-rc-alleles-instruction-tuning-design.md
new file mode 100644
index 00000000..d02d2309
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-26-rc-alleles-instruction-tuning-design.md
@@ -0,0 +1,123 @@
+# rc_alleles_inplace Instruction-Level Tuning — Design
+
+**Date:** 2026-06-26
+**Branch target:** `opt/rc-alleles-instruction-tuning` → `rust-migration`
+**Roadmap:** lands under Phase 3, Target 6 / round-3 area of `docs/roadmaps/rust-migration.md`
+
+## Context
+
+PR #251 (`rust-variant-rc-fold`) folded variant-allele reverse-complement into a
+gvl-owned Rust kernel, `variants::rc_alleles_inplace` (`src/variants/mod.rs`). PR #252
+(round-3 instruction-level tuning) applied `cargo asm`-driven instruction-count /
+autovectorization passes to seven hot kernels — but `rc_alleles_inplace` was **not** in
+its target list. This is a follow-up pass closing that gap, using the same round-3
+methodology, scoped to the full #251 Rust surface.
+
+### Audit of the full #251 Rust surface
+
+| File | #251 addition | Optimizable? |
+|---|---|---|
+| `src/variants/mod.rs` | `rc_alleles_inplace` core (67 lines) | **Yes** — the only compute kernel |
+| `src/ffi/mod.rs` | `rc_alleles` PyO3 wrapper (17 lines) | No — `as_slice_mut().unwrap()` + 3 `as_array()` borrows, zero-cost boundary glue, no hot loop |
+| `src/lib.rs` | registration (1 line) | No |
+
+The wrapper and registration carry no hot loop; the entire optimizable surface is
+`rc_alleles_inplace`.
+
+## The inefficiency
+
+Current `rc_alleles_inplace`:
+
+```rust
+let mut per_allele = vec![false; n_alleles];           // ① heap alloc + memset every call
+for g in 0..to_rc_row.len() { ... per_allele[a]=true }  // ② expand row→allele mask (pass 1)
+let per_allele = ndarray::Array1::from_vec(per_allele); // ③ Array1 wrap
+crate::reverse::rc_flat_rows_inplace(byte_data, seq_offsets, per_allele.view()); // ④ rescans ALL alleles checking the mask (pass 2)
+```
+
+It materializes an intermediate per-allele bool mask only to hand it to a generic helper
+that re-scans every allele. Two passes (build mask → scan mask) plus a per-call heap
+allocation and memset.
+
+## The change
+
+**One logical change in `src/variants/mod.rs`, with a small extract in `src/reverse.rs`.**
+
+### 1. Shared `#[inline]` reverse+complement helper
+
+Factor the per-row body inside `rc_flat_rows_inplace`'s masked branch — `row.reverse()`
+followed by the round-3 branchless-vectorized complement — into:
+
+```rust
+#[inline]
+pub(crate) fn rc_row(row: &mut [u8]) { /* row.reverse() + vectorized COMP arithmetic */ }
+```
+
+`rc_flat_rows_inplace` calls `rc_row` per masked row. Same vectorized complement, DRY.
+
+### 2. Fuse `rc_alleles_inplace` into a single pass
+
+```rust
+pub fn rc_alleles_inplace(byte_data, seq_offsets, var_offsets, to_rc_row) {
+    for g in 0..to_rc_row.len() {
+        if !to_rc_row[g] { continue; }
+        for a in var_offsets[g] as usize..var_offsets[g + 1] as usize {
+            let s = seq_offsets[a] as usize;
+            let e = seq_offsets[a + 1] as usize;
+            crate::reverse::rc_row(&mut byte_data[s..e]);
+        }
+    }
+}
+```
+
+Deletes the `vec![false; n_alleles]` alloc+memset (①), the `Array1::from_vec` wrap (③),
+and the redundant full-allele rescan (④); collapses the two passes into one. `n_alleles`
+is no longer computed.
+
+### Byte-identity argument
+
+`var_offsets` partition the alleles by row (contiguous, disjoint), so each allele belongs
+to exactly one row. The old code RC'd allele `a` iff its owning row was masked; the fused
+loop RCs exactly that set, in the same order (rows ascending, alleles ascending within a
+row). Empty allele (`s == e`) → `rc_row` on an empty slice is a no-op; empty row
+(`a0 == a1`) → inner loop skips. Behavior is identical to today on every input.
+
+### Risk control on the shared kernel
+
+`rc_flat_rows_inplace` sits on the round-3-tuned haplotype hot path. The `#[inline]`
+extract must leave its codegen equivalent. **Gate:** confirm `rc_flat_rows_inplace`'s asm
+is unchanged/equivalent after the extract. If extraction perturbs it, fall back to
+duplicating the ~6-line complement locally in `rc_alleles_inplace` and leave
+`rc_flat_rows_inplace` byte-for-byte untouched. DRY is preferred but never at the cost of
+regressing the tuned kernel.
+
+## Gate (parity + instruction-count drop + no regression)
+
+This path (`rc_alleles` fires only on negative-strand variants / `RaggedVariants` reads)
+is noise-dominated in wall-clock per the roadmap, so the gate is **not** round-3's strict
+"improve throughput or revert." Keep the change iff:
+
+1. **Parity byte-identical, both backends:** `tests/parity/test_rc_alleles_parity.py` +
+   cargo unit tests (`rc_alleles_*` in `variants`, `reverse` module tests).
+2. **Instruction count drops:** `cargo asm --rust genvarloader::variants::rc_alleles_inplace`
+   before/after — record the delta as evidence (the deterministic win).
+3. **No throughput regression:** `profile.py --mode variants` rust÷numba **holds**
+   (same session, both backends); not required to improve.
+4. **`rc_flat_rows_inplace` asm equivalent** after the extract (risk control above).
+
+Plus the standard full gate: full pytest tree on both backends, `cargo test`,
+`ruff check`/`format`, `typecheck`, abi3 wheel build.
+
+## Process
+
+Round-3 precedent: worktree off `rust-migration` with its **own** fresh pixi env (never
+symlink `.pixi` — `maturin develop` repoints the shared env), one commit for the kernel +
+roadmap update, PR into `rust-migration` (**no squash merge**). Update the roadmap under
+the Target-6 / round-3 area noting `rc_alleles_inplace` was tuned (instr before→after,
+rust÷numba held).
+
+## Out of scope
+
+No on-disk format change, no public API change, no new kernels, no rayon/batch
+parallelism (Phase 5), no numba/seqpro-reference deletion (Phase 5). No change to
+`flank_tokens` or `_FlatVariantWindows` (never RC'd).
diff --git a/docs/superpowers/specs/2026-06-26-rust-migration-phase-4-close-out-design.md b/docs/superpowers/specs/2026-06-26-rust-migration-phase-4-close-out-design.md
new file mode 100644
index 00000000..6dbfd492
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-26-rust-migration-phase-4-close-out-design.md
@@ -0,0 +1,115 @@
+# Design: Rust migration Phase 4 close-out (write/update gate + reconcile)
+
+**Date:** 2026-06-26
+**Branch:** `phase-4-close-out` (worktree `.claude/worktrees/phase-4-close-out`, off `rust-variant-rc-fold`)
+**Roadmap:** `docs/roadmaps/rust-migration.md` — Phase 4 (🚧 → ✅)
+
+## Problem & context
+
+Phase 4 of the Rust migration ("Write / update pipeline") is marked 🚧 with bullets:
+
+- Migrate `_dataset/_write.py`: variant normalization (left-align, bi-allelic, atomize),
+  genotype storage, interval extraction + realign.
+  - [x] bigWig interval extraction — single-pass streaming Rust writer
+  - [x] Table + annot overlap — COITrees Rust engine
+- Migrate remaining `_dataset/_utils.py` / `_flat_flanks.py` / `_variants/_sitesonly.py`
+  kernels touched by the write path.
+
+**Investigation finding (2026-06-26): the porting is essentially already done.** Tracing the
+real `gvl.write()` / `gvl.update()` paths shows the roadmap bullets mischaracterize the work:
+
+- **Variant normalization (left-align, bi-allelic, atomize) is NOT something GVL does.** It is a
+  documented *precondition* the user satisfies with `bcftools norm` / `plink2 --normalize`
+  (`_write.py:124-129`). The write path only *validates and rejects* non-bi-allelic / symbolic /
+  breakend records (`_write.py:599-615`). There is no numba normalization kernel to port.
+- **Genotype storage is done by genoray**, via `dense2sparse` / `_dense2sparse_with_length`
+  (`genoray._svar`, imported at `_write.py:21-22`). That belongs to **Phase 6 (absorb genoray)**,
+  not Phase 4.
+- **Interval extraction + realign** on the write path is the bigWig streaming writer (✅) and the
+  Table COITrees engine (✅), both already shipped. There is no write-time *realign* — realign is a
+  read-path concern.
+- Of the remaining-file candidates, the only GVL numba kernel reachable on the write path is
+  `splits_sum_le_value` (`_utils.py:165-196`), used solely by `_write_track_legacy`
+  (`_write.py:1254-1386`), the dispatch fall-through for custom `IntervalTrack` sources
+  (`_write.py:1467`). The Phase 0 notes (roadmap lines 767-780) already document this exact path as
+  **dead** for the only concrete public track types (`BigWigs`→Rust, `Table`→Rust). Verified
+  2026-06-26: there are **no** concrete `IntervalTrack` subclasses anywhere in the codebase besides
+  `BigWigs` and `Table`, and `IntervalTrack` itself is **not exported** in `__init__.py`.
+  `_flat_flanks.py::_assemble_alt_windows`, `_sitesonly.py::apply_site_only_variants`, `padded_slice`,
+  and the `_tracks.py` kernels are all **read-path**, outside Phase 4.
+
+So "finishing Phase 4" is a **close-out + reconcile**, not a new port. Decisions taken with the
+maintainer (2026-06-26):
+
+1. Deliver: close out the gate **and** reconcile the roadmap. Mark Phase 4 ✅.
+2. The dead legacy track path is **deleted as dead** (Phase 0 precedent).
+3. The gate is measured as a **Carter absolute re-baseline** (the write path is already Rust-only;
+   the Python/numba orchestration was deleted at landing, so there is no live numba A/B).
+
+## Scope
+
+### In scope
+
+**A. Delete the dead legacy track path**
+- Remove `_write_track_legacy` (`_write.py:1254-1386`).
+- Replace the `else` fall-through at `_write.py:1467` with a clear `TypeError` naming the unsupported
+  track type and pointing at `BigWigs` / `Table`.
+- Remove `splits_sum_le_value` (`_utils.py:165-196`) and its unit test.
+- Leave `padded_slice` (`_utils.py:37-72`, read-path numba reference) untouched.
+- Confirm no other importers of `splits_sum_le_value` (it is not registered in `_dispatch.py`).
+- Net effect: the `gvl.write()` / `gvl.update()` path is **numba-free**.
+
+**B. Measurement gate — Carter absolute re-baseline**
+- **`write()` workload:** build the `chr22_geuv` corpus from its sources (PGEN variants + a bigWig
+  track; 165 regions × 5 samples, chr22) via `tests/benchmarks/profiling/profile_write.py --op write`.
+  Record wall-clock + peak RSS (memray), `NUMBA_NUM_THREADS=1`, release build, Carter HPC
+  (AMD EPYC 7543, linux-64).
+- **`update()` workload:** open `chr22_geuv.gvl`, `gvl.update()` adding a new per-sample `BigWigs`
+  read-depth track — exercises the Rust streaming bigWig writer through the update entry point.
+  Record wall-clock + peak RSS. This replaces the 60-row synthetic smoke row.
+- Record both as the canonical Phase 4 numbers in the roadmap baseline table; annotate the old
+  1.143 s / 3.593 GB write figure as macOS / non-comparable.
+
+**C. Parity confirmation**
+- Write-path parity = the already-landed differential tests: the bigWig writer's byte-identical
+  test (roadmap 2026-06-19 note, Task 6) and the Table COITrees numpy-oracle + property tests. No new
+  A/B (legacy is deleted). Re-run these plus the full tree on both backends to confirm green.
+
+**D. Roadmap + reconciliation**
+- Rewrite the Phase 4 section to reflect reality:
+  - variant normalization → user precondition (bcftools / plink2), struck from Phase 4;
+  - genotype storage / variant IO → explicitly Phase 6 (genoray);
+  - bigWig + Table slices ✅;
+  - dead legacy path deleted.
+- Record the Carter write/update baseline numbers.
+- Set Phase 4 ✅ + PR link; add a notes/decisions-log entry.
+
+### Out of scope (explicitly)
+
+- Genotype storage / variant IO (`dense2sparse`) → **Phase 6 (genoray)**.
+- All read-path numba kernels (`padded_slice`, `_assemble_alt_windows`, `apply_site_only_variants`,
+  `_tracks.py` realign kernels) → retained as Phase-5-deletion references.
+- Rayon batch parallelism → Phase 5.
+- Any new Rust kernel (nothing on the write path needs one once the dead path is deleted).
+
+## Verification
+
+- Full test tree on **both backends** (`GVL_BACKEND` rust + numba): `pixi run -e dev pytest tests -q`
+  (dataset + unit). Read-path parity must be unaffected by the deletion.
+- `cargo test` green; lint (`ruff check python/ tests/`), format, `typecheck` clean; abi3 wheel builds.
+- `tests/integration/test_scale_guard.py` still green (write path).
+- Confirm deleting `_write_track_legacy` breaks no existing test (search for tests that write a custom
+  `IntervalTrack`; expect none).
+- Public API is unchanged (`IntervalTrack` unexported; `BigWigs` / `Table` untouched) → no SKILL.md
+  update expected; verify against the CLAUDE.md skill-maintenance checklist before closing.
+
+## Risks & notes
+
+- **Cross-machine baseline:** the original 1.143 s / 3.593 GB write figure was macOS; the new numbers
+  are Carter. They are not directly comparable — the roadmap entry must say so explicitly. Carter
+  becomes the canonical write/update baseline going forward.
+- **Corpus availability:** `write()` measurement needs the `chr22_geuv` source inputs (PGEN + bigWig)
+  reachable via `/carter` or `GVL_BENCH_SOURCE` (per the Phase 0 build_realistic.py note). If sources
+  are unavailable, fall back to the synthetic chr21/chr22 slice used for the bigWig write slice.
+- **Worktree env:** fresh pixi env per worktree (no symlinked `.pixi`), per the parallel-worktree
+  memory; `pixi run -e dev gen` before the first test run.
diff --git a/docs/superpowers/specs/2026-06-26-rust-migration-phase-5-design.md b/docs/superpowers/specs/2026-06-26-rust-migration-phase-5-design.md
new file mode 100644
index 00000000..6fe21f0b
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-26-rust-migration-phase-5-design.md
@@ -0,0 +1,263 @@
+# Design: Rust Migration Phase 5 — Consolidation, numba deletion, rayon, final benchmark → main
+
+**Date:** 2026-06-26
+**Branch:** `rust-migration` (the persistent integration branch; pre-consolidation bug fixes land as their own PRs into it first)
+**Roadmap:** `docs/roadmaps/rust-migration.md` — Phase 5 (⬜ → target ✅)
+**Status:** design approved; spec for writing-plans
+
+---
+
+## 1. Context & goal
+
+Phases 0–4 of the Rust migration are ✅: the read path (`Dataset.__getitem__`) and
+write/update path are Rust-backed and rust-by-default, with byte-identical parity proven
+against retained numba reference kernels. Those numba kernels were **deliberately kept
+alive** as differential-test oracles, to be "deleted wholesale in Phase 5."
+
+Phase 5 is the consolidation phase. Its roadmap checklist:
+
+- Collapse the PyO3 surface so Python is a true shim.
+- Delete all remaining core numba kernels (target count = 0).
+- Confirm the crate is fully cargo-testable standalone.
+
+**Goal of this work:** finish Phase 5, run a final numba-vs-rust benchmark on
+`__getitem__` (wall-clock + peak RSS), and — if rust reaches parity or better — open the
+`rust-migration → main` PR (the single big merge the branch strategy was built around).
+
+### What is already satisfied
+
+- **cargo-testable standalone:** `seqpro-core = "0.1.0"` is a published crates.io registry
+  dependency (checksum-locked in `Cargo.lock`), not an editable path-dep. `cargo test`
+  already runs without the Python/maturin layer (prior phases cite "cargo 109 passed").
+  This checklist item needs only a final verification, not new work.
+
+### Why this is not a no-op (the RSS gate)
+
+All three hot read-path modules (`_genotypes.py`, `_flat_variants.py`, `_tracks.py`) still
+`import numba as nb` at module load. The roadmap repeatedly records that peak RSS
+(~3.53 GB) is "dominated by the numba/llvmlite JIT baseline (~3.2 GB)." Therefore the
+rust-only peak-RSS win **cannot be measured until numba is deleted** — a benchmark today
+would show near-parity RSS by construction (both backends import numba). The RSS metric
+the user wants is gated on the numba deletion that is Phase 5's core.
+
+---
+
+## 2. Current state (measured 2026-06-26)
+
+- `rust-migration` is **162 commits ahead of `main`, 0 behind, 123 files changed** — a
+  clean fast-forward merge whenever chosen. `main` stays shippable.
+- **~21 `register(...)` dual-backend kernels** across `_genotypes.py`, `_flat_variants.py`,
+  `_intervals.py`, `_tracks.py`, `_reference.py`, all routed through the
+  `python/genvarloader/_dispatch.py` registry (`GVL_BACKEND` override, per-kernel default
+  `rust`).
+- **~17 numba-oracle parity suites** in `tests/parity/` (e.g.
+  `test_reconstruct_haplotypes_parity.py`, `test_fused_haps_parity.py`,
+  `test_dataset_parity.py`) compare rust against the live numba impl.
+- **Two known numba-vs-rust divergences are currently excluded from parity** (rust is
+  correct in both; numba is the buggy oracle):
+  1. **Haplotype trailing-fill** (`_genotypes.py:508`): when a deletion drives `ref_idx`
+     past the contig end, `writable_ref = min(unfilled_length, len(ref) - ref_idx)` goes
+     negative, so `out_end_idx = out_idx + writable_ref < out_idx`, and
+     `out[out_end_idx:] = pad_char` uses Python-style negative indexing — it wraps and
+     leaves trailing positions unwritten. Rust clamps `out_end_idx` to 0 and pads
+     correctly. The same latent pattern exists at `_tracks.py:396`.
+  2. **#242-family** (`intervals_to_tracks`): gvl stores intervals at
+     `chromStart - max_jitter` but queries at `chromStart + jitter`, so for `max_jitter>0`
+     datasets a stored interval can start before the query window. The numba/rust kernels
+     diverge (debug_assert panic / clip behavior). Filed as
+     [mcvickerlab/GenVarLoader#242](https://github.com/mcvickerlab/GenVarLoader/issues/242).
+- **Deferred fusion:** the annotated+spliced *intersection* read path still runs on the
+  unfused dispatched rust core (Phase 3 explicitly deferred its fusion to Phase 5).
+
+---
+
+## 3. Decisions (locked with the user)
+
+| # | Decision | Choice |
+|---|----------|--------|
+| D1 | Rayon batch parallelism | **In scope** for Phase 5 (the roadmap's "next lever"). |
+| D2 | Fate of numba-oracle parity suites after deletion | **Golden-snapshot** them to frozen fixtures (preserve independent differential coverage in perpetuity), *after* fixing the numba bugs so the frozen oracle is correct. |
+| D3 | PyO3 shim collapse aggressiveness | **Also fuse the deferred annotated+spliced path**, not just remove dispatch indirection. |
+| D4 | Haplotype trailing-fill numba bug | **Fix it** (clamp), so the golden oracle is correct. |
+| D5 | #242-family exclusion | **Fix it too**, so the golden oracle is fully exclusion-free (touches the write/store path; needs a correct-behavior investigation). |
+| D6 | Final benchmark threading convention | **Single-thread verdict** (rayon=1 vs `NUMBA_NUM_THREADS=1`), comparable to all prior baselines; rayon multi-thread speedup reported separately as an additive bonus. |
+| D7 | Bug fixes (D4, D5) PR strategy | **Separate PR(s), land first**, per the established numba-oracle-bug-policy (file issue + isolated fix + un-exclude from parity). |
+
+---
+
+## 4. Workstreams
+
+### Stage A — Pre-consolidation correctness (separate PRs, land first)
+
+These make numba a trustworthy, exclusion-free oracle **before** it is frozen as golden
+fixtures and then deleted. Each uses systematic-debugging to establish the correct
+behavior, and lands as its own PR into `rust-migration` (per D7).
+
+**W1 — Fix the haplotype trailing-fill numba bug (D4).**
+- File a GVL issue referencing the `_genotypes.py:508` trailing-fill divergence.
+- Fix: `writable_ref = max(0, min(unfilled_length, len(ref) - ref_idx))` at
+  `_genotypes.py:508`; mirror the clamp at `_tracks.py:396`.
+- Verify rust already produces the correct (clamped/padded) output; confirm
+  rust == numba after the fix across the previously-excluded overshoot sub-domain.
+- Un-exclude that sub-domain: drop Guard 1 (the overshoot pre-check) in
+  `tests/parity/test_reconstruct_haplotypes_parity.py`; remove the double-init sentinel
+  guard where it only existed to mask this divergence.
+- **Acceptance:** the overshoot sub-domain is parity-covered (not excluded), full tree
+  green on both backends.
+
+**W2 — Fix the #242-family divergence (D5).**
+- Investigation (systematic-debugging): determine the correct `intervals_to_tracks`
+  behavior when a stored interval starts before the query window (`max_jitter>0`),
+  reconciling the `chromStart - max_jitter` store vs `chromStart + jitter` query offset.
+  This may touch the write/store path and/or the query coordinate math, not only the
+  kernel.
+- Apply the fix to **both** backends so they agree and both are correct; reference/close
+  #242.
+- Un-exclude the #242-family sub-domain: remove the `assume(False)` / xfail guards in the
+  affected parity + dataset suites (`test_reconstruct_haplotypes_parity.py`,
+  `test_dataset_parity.py`, `test_shift_and_realign_tracks_parity.py`,
+  `strategies.py`/`_fixtures.py` generators), lifting fixtures off the forced
+  `max_jitter=0` where they were pinned only to dodge #242.
+- **Acceptance:** `max_jitter>0` parity restored; #242 closed; full tree green on both
+  backends.
+
+### Stage B — Fusion (parity-gated against numba, before deletion)
+
+**W3 — Fuse the deferred annotated+spliced intersection path (D3).**
+- Add a fused rust kernel that collapses the remaining FFI crossings on the
+  annotated+spliced read path (the intersection still on the unfused dispatched core),
+  matching the fusion pattern of `reconstruct_annotated_haplotypes_fused` /
+  `reconstruct_haplotypes_spliced_fused`.
+- Gate on byte-identical parity against the composed numba oracle **while numba still
+  exists**.
+- **Acceptance:** annotated+spliced path is fused and byte-identical; parity suite extended
+  to cover it.
+
+### Stage C — Final numba-vs-rust benchmark (the gate; numba still present)
+
+**W4 — Capture the single-thread parity verdict (D6).**
+- Harness: existing `tests/benchmarks/test_e2e.py` (pytest-benchmark pedantic min) +
+  `tests/benchmarks/profiling/profile.py` wall-clock, `NUMBA_NUM_THREADS=1`, rayon
+  threads=1, release build, corpus `chr22_geuv.gvl` (format 2.0), Carter HPC.
+- Run the numba-vs-rust A/B in **one back-to-back session** across all modes:
+  tracks-only, tracks-seqs, haplotypes, annotated, variants, variant-windows.
+- This is the canonical "final numba vs rust" wall-clock comparison; it must run while both
+  backends exist (after deletion there is no numba to A/B).
+- **Gate:** rust at **parity or better** (single-thread) on `__getitem__`. Per-path
+  node-noise caveat applies (use within-session ratios; the durable signal is the
+  established instruction-count reductions + parity).
+
+### Stage D — Consolidation (the single big Phase 5 PR)
+
+**W5 — Golden-snapshot the parity suites (D2).**
+- Before deleting numba, generate frozen golden fixtures from the now-correct numba oracle
+  for each of the ~17 parity suites (including the W3 fused path and the W1/W2
+  un-excluded sub-domains).
+- Convert the suites from "run-both-assert-byte-identical" to golden-file regression tests
+  that need no live numba. Store fixtures compactly (compressed `.npz`/`.npy` keyed by the
+  hypothesis-generated input, or a deterministic seeded sample set — chosen in the plan to
+  keep the repo size bounded).
+- **Acceptance:** golden suites pass against rust with numba uninstalled/uncalled.
+
+**W6 — Delete numba + collapse to thin shim.**
+- Delete the ~21 `register()` numba refs, all njit bodies, the `python/genvarloader/_dispatch.py`
+  registry + `GVL_BACKEND`, and every `import numba` in the core modules.
+- Replace `get(name)(...)` dispatch call sites (`_intervals.py`, `_reference.py`,
+  `_reconstruct.py`, `_tracks.py`, `_flat_variants.py`, `_rag_variants.py`,
+  `_genotypes.py`) with direct rust calls — Python becomes indexing sugar + torch +
+  validation/error messages only.
+- Remove `numba` from the project's runtime dependency set (verify nothing else in the
+  package imports it).
+- **Acceptance:** core numba kernel count = 0; `python -c "import genvarloader"` does not
+  import numba or llvmlite (asserted by a test); full tree green.
+
+**W7 — Add rayon batch parallelism (D1).**
+- Parallelize the read-path batch drivers with rayon over the per-(query, hap) work items
+  (disjoint output slices — proven safe / serial-equivalent in Phase 3). Rust-only;
+  thread count controlled by an env/config knob, default chosen in the plan.
+- **Acceptance:** byte-identical to the serial result (golden suites still pass);
+  multi-thread speedup measured.
+
+### Stage E — Measure & merge
+
+**W8 — Rust-only RSS + rayon speedup.**
+- After deletion, measure rust-only peak RSS on `__getitem__` (memray) vs the recorded
+  numba baseline (3.53 GB) — expect the ~3.2 GB JIT removal.
+- Measure rayon multi-thread speedup (rayon N vs rayon 1) as the additive bonus (D6).
+
+**W9 — PR `rust-migration → main`.**
+- If the Stage C verdict is parity-or-better and RSS is parity-or-better, open the merge
+  PR (no squash — preserve commit history). Update `docs/roadmaps/rust-migration.md`:
+  mark Phase 5 ✅, record the final single-thread A/B table, the rust-only RSS, the rayon
+  speedup, and the PR link. Update `skills/genvarloader/SKILL.md` if any public symbol
+  changed (e.g. removal of `GVL_BACKEND`).
+
+---
+
+## 5. Sequencing & PR strategy
+
+```
+W1 (haps trailing-fill fix)   ──┐  separate PRs into rust-migration
+W2 (#242 fix)                 ──┘  (land first; un-exclude parity)
+        │
+W3 (annotated+spliced fusion) ───  PR into rust-migration (parity-gated vs numba)
+        │
+W4 (final numba-vs-rust A/B)  ───  benchmark only (both backends present) → GATE
+        │
+W5..W8 (golden snapshot, delete numba, rayon, RSS) ── single Phase 5 consolidation PR
+        │
+W9 (rust-migration → main)    ───  the big merge, if gate passes
+```
+
+Rationale for ordering: the numba bugs must be fixed (W1, W2) and the deferred path fused
+(W3) **while numba still exists** as the oracle; the parity verdict (W4) must be captured
+**before** deletion; only then is it safe to freeze golden fixtures (W5) and delete numba
+(W6). Rayon (W7) is rust-only and lands after deletion. RSS (W8) is only meaningful after
+deletion.
+
+---
+
+## 6. Out of scope
+
+- **Phase 6 (absorb genoray):** variant IO stays on Python genoray.
+- **Multi-thread numba (prange) A/B:** the verdict is single-thread per D6.
+- Any further single-thread kernel micro-optimization (rounds 1–3 are complete; headroom
+  is maximized per the roadmap).
+
+---
+
+## 7. Risks & mitigations
+
+- **#242 is broader than a kernel clamp (W2).** It touches store-vs-query coordinate math;
+  the correct behavior must be established by investigation before coding. Mitigation:
+  systematic-debugging, fix both backends together, land as its own PR with the
+  un-exclusion as the acceptance gate. If it proves larger than expected, it can be split
+  out without blocking W1/W3.
+- **Golden-fixture repo bloat (W5).** Frozen oracle outputs could be large. Mitigation:
+  compress and/or use a bounded deterministic seeded sample rather than the full
+  hypothesis space; decide the exact scheme in the plan.
+- **Node-noise on the benchmark verdict (W4).** Carter is a shared node (absolute ms/batch
+  drifts ≥2× across sessions). Mitigation: single back-to-back session, within-session
+  ratios, pedantic min; lean on the durable instruction-count + parity evidence already in
+  the roadmap.
+- **Rayon non-determinism (W7).** Mitigation: disjoint output slices (already established);
+  gate on byte-identical equality to the serial golden result.
+
+---
+
+## 8. Acceptance criteria (Phase 5 ✅)
+
+1. Haplotype trailing-fill and #242 divergences fixed; both previously-excluded sub-domains
+   parity-covered (W1, W2).
+2. Annotated+spliced path fused, byte-identical (W3).
+3. Final single-thread numba-vs-rust `__getitem__` A/B captured; rust at parity-or-better
+   (W4).
+4. Parity suites converted to golden fixtures; pass with numba absent (W5).
+5. Core numba kernel count = 0; `import genvarloader` pulls neither numba nor llvmlite;
+   `_dispatch`/`GVL_BACKEND` gone; PyO3 surface is a thin shim (W6).
+6. Rayon batch parallelism byte-identical to serial; speedup measured (W7).
+7. Rust-only peak RSS at parity-or-better vs the 3.53 GB numba baseline (W8).
+8. `cargo test` green standalone; full Python tree green; lint/format/typecheck clean;
+   abi3 wheel builds.
+9. `rust-migration → main` PR opened (no squash); roadmap Phase 5 ✅ + final numbers + PR
+   link recorded; skill updated if public API changed (W9).
diff --git a/docs/superpowers/specs/2026-06-27-rust-migration-phase-5-wrapup-design.md b/docs/superpowers/specs/2026-06-27-rust-migration-phase-5-wrapup-design.md
new file mode 100644
index 00000000..0e98bf05
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-27-rust-migration-phase-5-wrapup-design.md
@@ -0,0 +1,129 @@
+# Design: Wrap up Phase 5 of the Rust migration (sans genoray)
+
+**Date:** 2026-06-27
+**Branch:** `phase-5-w6-wrapup` (off `rust-migration`)
+**Roadmap:** `docs/roadmaps/rust-migration.md` (Phase 5, 🚧 — W1–W5 done, W6–W9 remain)
+**Status going in:** Phases 0–4 ✅. W5 (PR #260) golden-snapshotted the numba-oracle parity
+suites, deleted all gvl-own numba kernels (count = 0), and added rayon batch parallelism
+gated byte-identical to the serial golden result.
+
+## Goal
+
+Finish Phase 5's open finalization threads so the Rust migration is shippable, **excluding
+Phase 6 (absorb genoray)** which stays out of scope. Land everything as **one PR into
+`rust-migration`** (NOT master). The `rust-migration → master` merge is left to the
+maintainer to trigger (no-squash, per [[no-squash-merges]]).
+
+**Explicitly NOT in scope:** the "single big `__getitem__` kernel" architectural collapse.
+Instead of building it, Unit A *audits* whether it is still warranted and records the verdict
+in the roadmap.
+
+## Context discovered during brainstorming
+
+- **No dispatch layer remains.** `python/genvarloader/_dispatch.py` is deleted (only a stale
+  `.pyc` lingers); zero `GVL_BACKEND` / `import numba` / `nb.njit` references in source. W5
+  already collapsed the rust/numba switch — Python calls Rust directly via
+  `from ..genvarloader import (...)` (the compiled `genvarloader.genvarloader` pymodule).
+- **~28 FFI entries** registered in `src/lib.rs`, including the fused one-FFI-crossing
+  `__getitem__` kernels from Phase 3/W3 (`reconstruct_haplotypes_fused`,
+  `reconstruct_annotated_haplotypes_fused`, `reconstruct_haplotypes_spliced_fused`,
+  `reconstruct_annotated_haplotypes_spliced_fused`, `intervals_and_realign_track_fused`).
+- **seqpro-core is already a released dep.** `Cargo.toml` has `seqpro-core = "0.1"` and
+  `Cargo.lock` resolves `seqpro-core 0.1.0` from the crates.io registry with a checksum — no
+  path dep, no `[patch]`. The Phase 1 "editable path-dep, flip before shipping" note is stale.
+
+The upshot: "collapse the PyO3 surface to a thin shim" is **largely already realized** at the
+indirection level. What is left to determine is how much Python *orchestration glue* still
+sits between `__getitem__` and the fused calls — that is what Unit A measures.
+
+## Units of work
+
+The units are mostly independent. Unit D (perf) is the long pole. Units B/C are quick
+verifications. Unit A is investigation + roadmap text with no code change.
+
+### Unit A — PyO3 surface / thin-shim audit (reframed Phase 5 item)
+
+Inventory the live **read path** (`Dataset.__getitem__` → reconstructor in
+`_dataset/_reconstruct.py` / `_haps.py` / `_query.py` → fused FFI kernel) and the **write
+path**, and classify every remaining piece of Python between the public API and the FFI call
+into one of three buckets:
+
+1. **Intentional shim** — indexing sugar, torch integration, validation / error messages.
+   Stays in Python by design (this is the migration's end state).
+2. **Genuinely-remaining collapsible glue** — per-batch coercions, allocations, or Python
+   object churn on the hot path that a future "bigger kernel" would absorb.
+3. **Already-collapsed** — confirmed to be one FFI crossing with no material Python work.
+
+**Output:** a precise "what's left for the thin shim" list written into the roadmap (Phase 5
+section + notes log). Given W5 removed dispatch and Phase 3/W3 fused each path to one
+crossing, the expectation is the bucket-2 list is short or empty. **No code changes in this
+unit.**
+
+### Unit B — `cargo test` standalone verification
+
+Confirm the crate builds and tests purely via `cargo test` (rlib path, no pixi / maturin /
+Python-extension layer). The lib is `crate-type = ["cdylib", "rlib"]`; the
+`extension-module` pyo3 feature is non-default, so `cargo test` links a real libpython. If it
+is broken, record the minimal fix or the documented invocation. Record the result under the
+Phase 5 checkpoint ("crate is fully cargo-testable standalone").
+
+### Unit C — seqpro-core released-dep verification
+
+Already resolves `seqpro-core 0.1.0` from crates.io (verified in `Cargo.lock`). Confirm a
+clean build against the published crate with no lingering path / `[patch]` override, and
+**correct the stale Phase 1 roadmap note** ("editable path-dep, flip to git/crates.io before
+shipping") to reflect that it is already released.
+
+### Unit D — W6 perf re-baseline (long pole)
+
+On Carter (AMD EPYC 7543, linux-64), corpus `chr22_geuv.gvl` (format 2.0, 165 regions × 5
+samples, chr22), using the established de-noised harness (`tests/benchmarks/test_e2e.py`
+pedantic-min, iterations=10/rounds=50/warmup=5, + `tests/benchmarks/profiling/profile.py`
+wall-clock for the variants paths). Release build (`maturin develop --release`).
+
+- **Primary new signal:** rust **serial vs rayon multi-thread** — a clean *same-session* A/B
+  via the `parallel` toggle W5 added to the read kernels. Measure **serial + a thread sweep
+  (2 / 4 / 8 / default-all-cores)** across the read paths (tracks-only, tracks-seqs,
+  haplotypes, annotated, variants, variant-windows) to capture the rayon speedup **curve** and
+  the gvl-attributable **peak-RSS** deltas.
+- **Constraint — no live numba A/B.** numba was deleted in W5, so we compare against the
+  **W4-recorded** same-session numba numbers (`docs/roadmaps/phase-5-w4-final-ab.md`) and the
+  Phase 0 / Phase 4 baselines. We do **not** re-checkout a numba commit: W4 already locked the
+  single-thread numba A/B, and [[gvl-rust-perf-gate-shared-node-noise]] makes cross-session
+  absolute wall-clock unreliable. The durable signals are byte-identical parity (already
+  gated) + same-session serial-vs-rayon improve-or-hold + deterministic counts.
+- **Output:** record the rayon speedup curve + RSS deltas under the Phase 5 checkpoint
+  ("full perf re-baseline recorded here").
+
+### Phase 5 status disposition
+
+Set by Unit A's verdict:
+
+- If the audit shows the shim is already thin (likely) **and** the checkpoint criteria are met
+  (numba count = 0 ✓; perf re-baseline ✓; cargo-testable standalone ✓), mark **Phase 5 ✅** and
+  re-file any residual collapse as a separate, clearly-labelled optimization track (it was
+  never part of the Phase 5 checkpoint gate).
+- If real bucket-2 glue remains, keep **Phase 5 🚧** with the audited list as the explicit
+  remainder, and note that this branch advanced W6 + the verifications.
+
+## Gate (per CLAUDE.md)
+
+1. `pixi run -e dev maturin develop --release` **first** (pytest does not rebuild Rust).
+2. Full tree: `pixi run -e dev pytest tests -q` green (numba backend is gone, so a single
+   rust-only run — no A/B matrix).
+3. `cargo test --release` green.
+4. `pixi run -e dev ruff check python/ tests/` + `ruff format` + `typecheck` + `cargo clippy`
+   clean.
+5. abi3 wheel builds.
+6. Roadmap updated: tick completed items, set Phase 5 marker, add a notes-log entry, record
+   the Unit D measurements under the checkpoint, correct the stale seqpro-core note.
+
+## Deliverable
+
+One PR into `rust-migration` covering Units A–D + the roadmap finalization. The maintainer
+performs the `rust-migration → master` merge separately.
+
+## Open questions
+
+None blocking. Thread-sweep granularity for Unit D (2/4/8/all) confirmed during brainstorming;
+adjustable if the corpus is too small for higher thread counts to show signal.
diff --git a/pixi.lock b/pixi.lock
index a7ca9be4..158e8a89 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -173,7 +173,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
@@ -193,6 +192,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -353,8 +353,8 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -563,7 +563,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/ee/efbd56687be60ef9af0c9c0ebe106964c07400eade5b0af8902a1d8cd58c/torch-2.10.0-3-cp310-cp310-manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
@@ -595,6 +594,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/72/25/973bd6128381951b23cdcd8a9870c6dcfc5606cb864df8eabd82e529f9c1/torchinfo-1.8.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -773,7 +773,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -1003,7 +1003,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/18/29/71729b4671f21e1eaa5d6573031ab810ad2936c8175f03f97f3ff164c802/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
@@ -1051,6 +1050,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/72/25/973bd6128381951b23cdcd8a9870c6dcfc5606cb864df8eabd82e529f9c1/torchinfo-1.8.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -1259,7 +1259,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2d/0b/ceb7694d864abc0a047649aec263878acb9f792e1fec3e676f22dc9015e3/jupyter_client-8.8.0-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/2f/97/9214bd9b860e680a281232e218d10b718a7280b593f4ab56240a558dc975/pgenlib-0.94.0-cp312-cp312-macosx_10_13_universal2.whl
       - pypi: https://files.pythonhosted.org/packages/31/a3/5b1562db76a5a488274b2332a97199b32d0442aca0ed193697fd47786316/uvicorn-0.46.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl
@@ -1270,6 +1269,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/47/d4/dbacced3953544b9a93088cc10ef2b596d348c983d5c67a404fa41ec51ba/fonttools-4.62.1-cp312-cp312-macosx_10_13_universal2.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
@@ -1538,7 +1538,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/18/29/71729b4671f21e1eaa5d6573031ab810ad2936c8175f03f97f3ff164c802/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
@@ -1595,6 +1594,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/72/25/973bd6128381951b23cdcd8a9870c6dcfc5606cb864df8eabd82e529f9c1/torchinfo-1.8.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl
@@ -1819,7 +1819,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2d/0b/ceb7694d864abc0a047649aec263878acb9f792e1fec3e676f22dc9015e3/jupyter_client-8.8.0-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/2f/97/9214bd9b860e680a281232e218d10b718a7280b593f4ab56240a558dc975/pgenlib-0.94.0-cp312-cp312-macosx_10_13_universal2.whl
       - pypi: https://files.pythonhosted.org/packages/31/a3/5b1562db76a5a488274b2332a97199b32d0442aca0ed193697fd47786316/uvicorn-0.46.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl
@@ -1829,6 +1828,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/47/d4/dbacced3953544b9a93088cc10ef2b596d348c983d5c67a404fa41ec51ba/fonttools-4.62.1-cp312-cp312-macosx_10_13_universal2.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
@@ -1985,7 +1985,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/20/e7/bed0024a0f4ab0c8a9c64d4445f39b30c99bd1acd228291959e3de664247/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
@@ -2010,6 +2009,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
@@ -2102,9 +2102,9 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/38/3d/2d244233ac4f76e38533cfcb2991c9eb4c7bf688ae0a036d30725b8faafe/importlib_metadata-9.0.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -2442,7 +2442,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
@@ -2462,6 +2461,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -2686,7 +2686,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -2902,7 +2902,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
@@ -2922,6 +2921,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -3082,8 +3082,8 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -3307,7 +3307,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/18/dc/1843828349729a86f8d9f79b19bd6e7eaa358a5682f13a0af667dae0c1d0/cyvcf2-0.32.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/28/53/21f7b97e82772caa61541348427f42435120b32961c92d16f9c8ce9757d6/cslug-1.0.0-py3-none-any.whl
@@ -3328,6 +3327,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl
@@ -3478,9 +3478,9 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/34/0b/b9d1911cfefa61399821dfb37f486d83e0f42630a8d12f7194270c417002/llvmlite-0.47.0-cp311-cp311-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/5a/b0/a4ffc4ae74d2d822200dcc46898987d8eb6032d1e2b219cae39da6f5cbcc/pandas-3.0.3-cp311-cp311-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/5b/bc/246f452431c592a2a424050e8bb9ccf494fb47613fd97c912f4d573a5e3b/phantom_types-3.0.2-py3-none-any.whl
@@ -3701,7 +3701,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/15/ef/7d57ceb0651af74194e97ed6583e148d352f03d696090221b8059cdfc90b/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/28/53/21f7b97e82772caa61541348427f42435120b32961c92d16f9c8ce9757d6/cslug-1.0.0-py3-none-any.whl
@@ -3723,6 +3722,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl
@@ -3876,9 +3876,9 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/2f/97/9214bd9b860e680a281232e218d10b718a7280b593f4ab56240a558dc975/pgenlib-0.94.0-cp312-cp312-macosx_10_13_universal2.whl
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/57/bc/76f8f8c5cf9adee47fdb7bbb03be8900f76f902d451d7477cf12b845e1de/numba-0.65.1-cp312-cp312-macosx_12_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/5b/bc/246f452431c592a2a424050e8bb9ccf494fb47613fd97c912f4d573a5e3b/phantom_types-3.0.2-py3-none-any.whl
@@ -4100,7 +4100,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/15/ef/7d57ceb0651af74194e97ed6583e148d352f03d696090221b8059cdfc90b/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/23/18/4cedda786e7da429e7489549a9e5461530d4133130e541f25fb94f015776/cyclopts-4.11.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/28/53/21f7b97e82772caa61541348427f42435120b32961c92d16f9c8ce9757d6/cslug-1.0.0-py3-none-any.whl
@@ -4121,6 +4120,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6c/3c/3f62dee257eb3d6b2c1ef2a09d36d9793c7111156a73b5654d2c2305e5ce/idna-3.14-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl
@@ -4275,10 +4275,10 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/3e/fe/1624eb5024e897bf4074bfc31f9e5e823160aed1ac14e7720e849a3d1109/selectolax-0.4.8-cp313-cp313-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/3f/06/9ae96a3e5dcfd119377ba33d4c42a7d89da1efabd5cb3e366b156c45ff4d/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/5b/bc/246f452431c592a2a424050e8bb9ccf494fb47613fd97c912f4d573a5e3b/phantom_types-3.0.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/5f/dd/0c6a5a36ec132665f85e5e33f0480b58cf5aa8af8fbe1d5971410d789558/ncls-0.0.70.tar.gz
@@ -4614,7 +4614,6 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/16/ee/efbd56687be60ef9af0c9c0ebe106964c07400eade5b0af8902a1d8cd58c/torch-2.10.0-3-cp310-cp310-manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/21/48/92dddc8df65b576c9d30752650c89301b5222d4ac10187724796cedfd723/pysam-0.24.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl
@@ -4646,6 +4645,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/6e/ae/76fb528c6112a3df5a581a18f1a2ceee5983d54977d7f2b6bc883637fe4c/polars_config_meta-0.3.4-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/72/25/973bd6128381951b23cdcd8a9870c6dcfc5606cb864df8eabd82e529f9c1/torchinfo-1.8.0-py3-none-any.whl
+      - pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
       - pypi: https://files.pythonhosted.org/packages/77/39/4d8414260c3d83f22029a39e51553c173611b378d62ca391e5ca68e65cfa/awkward-2.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl
@@ -4887,7 +4887,7 @@ environments:
       - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/2c/2d/6ea7cad2c2f0625c4120bef5353ab7cf749141bf1d070011cebb72f68189/pandera-0.31.1-py3-none-any.whl
-      - pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
+      - pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/4e/ca/03624e017e5ee2d7ce8a08d89f81c1e535eb3c30d7b2dc4a435ea3fbbeae/mkdocs_glightbox-0.5.2-py3-none-any.whl
       - pypi: https://files.pythonhosted.org/packages/56/c6/65f646c7ff09bd257f660434adb45c4dfcbbcebcc030562fecf6f5bf887d/pydantic_core-2.46.4-cp310-cp310-macosx_11_0_arm64.whl
       - pypi: https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl
@@ -6175,6 +6175,9 @@ packages:
   license: Apache-2.0 WITH LLVM-exception
   license_family: Apache
   purls: []
+  run_exports:
+    weak:
+    - libllvm14 >=14.0.6,<14.1.0a0
   size: 31484415
   timestamp: 1690557554081
 - conda: https://conda.anaconda.org/conda-forge/linux-64/libllvm22-22.1.5-hf7376ad_1.conda
@@ -6653,6 +6656,7 @@ packages:
   license_family: BSD
   purls:
   - pkg:pypi/llvmlite?source=hash-mapping
+  run_exports: {}
   size: 3328102
   timestamp: 1706921747584
 - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda
@@ -6970,6 +6974,7 @@ packages:
   license_family: BSD
   purls:
   - pkg:pypi/numba?source=hash-mapping
+  run_exports: {}
   size: 4313101
   timestamp: 1711475336305
 - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda
@@ -10382,6 +10387,9 @@ packages:
   license: Apache-2.0 WITH LLVM-exception
   license_family: Apache
   purls: []
+  run_exports:
+    weak:
+    - libllvm14 >=14.0.6,<14.1.0a0
   size: 20571387
   timestamp: 1690559110016
 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.3-h8088a28_0.conda
@@ -10556,6 +10564,7 @@ packages:
   license_family: BSD
   purls:
   - pkg:pypi/llvmlite?source=hash-mapping
+  run_exports: {}
   size: 306724
   timestamp: 1706921994701
 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/lz4-c-1.10.0-h286801f_1.conda
@@ -10797,6 +10806,7 @@ packages:
   license_family: BSD
   purls:
   - pkg:pypi/numba?source=hash-mapping
+  run_exports: {}
   size: 4292616
   timestamp: 1711475805806
 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-1.26.4-py310hd45542a_0.conda
@@ -11489,10 +11499,9 @@ packages:
 - pypi: .
   name: genvarloader
   requires_dist:
-  - seqpro>=0.18
+  - seqpro>=0.20
   - genoray>=2.12.3,<3
   - numpy
-  - numba>=0.59.1
   - loguru
   - natsort
   - polars>=1.37.1
@@ -12379,25 +12388,6 @@ packages:
   requires_dist:
   - numpy>=1.21.3
   requires_python: '>=3.10'
-- pypi: https://files.pythonhosted.org/packages/1d/6c/330593fe4990a574afae001614ca6465b1352047fc9e623c8d675504fa44/seqpro-0.18.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
-  name: seqpro
-  version: 0.18.0
-  sha256: 6616e416009a44c971f8873b187b0b748203077201da1185feb3dcbc296260e8
-  requires_dist:
-  - numba>=0.58.1
-  - numpy>=1.26.0
-  - polars>=1.21.0,<2
-  - pyranges>=0.1.3,<0.2
-  - pandera>=0.31.1
-  - pandas
-  - pyarrow
-  - natsort
-  - narwhals>=2.20.0
-  - setuptools>=70
-  - awkward>=2.5.0
-  - polars-config-meta[polars]>=0.3.2
-  - attrs
-  requires_python: '>=3.10'
 - pypi: https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
   name: nvidia-cufft-cu12
   version: 11.3.3.83
@@ -12657,25 +12647,6 @@ packages:
   requires_dist:
   - typing-extensions ; python_full_version < '3.12'
   requires_python: '>=3.9'
-- pypi: https://files.pythonhosted.org/packages/2f/25/1e51f4a6a387956f6ce601eedde4d3955816ec8491bc61a2794d59da9053/seqpro-0.18.0-cp39-abi3-macosx_11_0_arm64.whl
-  name: seqpro
-  version: 0.18.0
-  sha256: d0b99c5e400933ae33f4369e921d30a74bf7fc30491fc45e2c95d99eb24c13f6
-  requires_dist:
-  - numba>=0.58.1
-  - numpy>=1.26.0
-  - polars>=1.21.0,<2
-  - pyranges>=0.1.3,<0.2
-  - pandera>=0.31.1
-  - pandas
-  - pyarrow
-  - natsort
-  - narwhals>=2.20.0
-  - setuptools>=70
-  - awkward>=2.5.0
-  - polars-config-meta[polars]>=0.3.2
-  - attrs
-  requires_python: '>=3.10'
 - pypi: https://files.pythonhosted.org/packages/2f/86/a6f3ff1fd795f49545a7c74b2c92f62729135d73e7e4055bf74da5a26c82/aiohttp-3.13.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl
   name: aiohttp
   version: 3.13.5
@@ -13151,6 +13122,25 @@ packages:
   version: 12.6.80
   sha256: 6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132
   requires_python: '>=3'
+- pypi: https://files.pythonhosted.org/packages/4b/82/14fed4543ed4ddb4fa582f04bd50e9c2dacad4f6c2aa38de4cf8b32ea252/seqpro-0.20.0-cp39-abi3-macosx_11_0_arm64.whl
+  name: seqpro
+  version: 0.20.0
+  sha256: 47d4e459c8dc078768a57a8f2b9b58526bb084eab111c7e6c2e3eb68cba30c1e
+  requires_dist:
+  - numba>=0.58.1
+  - numpy>=1.26.0
+  - polars>=1.21.0,<2
+  - pyranges>=0.1.3,<0.2
+  - pandera>=0.31.1
+  - pandas
+  - pyarrow
+  - natsort
+  - narwhals>=2.20.0
+  - setuptools>=70
+  - awkward>=2.5.0
+  - polars-config-meta[polars]>=0.3.2
+  - attrs
+  requires_python: '>=3.10'
 - pypi: https://files.pythonhosted.org/packages/4b/ac/b605473de2bb404e742f2cc3583d12aedb2352a70e49ae8fce455b50c5aa/multidict-6.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl
   name: multidict
   version: 6.7.1
@@ -14025,6 +14015,25 @@ packages:
   - pytz ; extra == 'test'
   - pandas ; extra == 'test'
   requires_python: '>=3.9'
+- pypi: https://files.pythonhosted.org/packages/74/df/b1f009cb86e2d721ad8a1e9f64acb0df49743e15b62dad54276e863bc960/seqpro-0.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+  name: seqpro
+  version: 0.20.0
+  sha256: d4f826e7eace851058adc6dd7e9f358dfc264b735109c6701f32c91877e64737
+  requires_dist:
+  - numba>=0.58.1
+  - numpy>=1.26.0
+  - polars>=1.21.0,<2
+  - pyranges>=0.1.3,<0.2
+  - pandera>=0.31.1
+  - pandas
+  - pyarrow
+  - natsort
+  - narwhals>=2.20.0
+  - setuptools>=70
+  - awkward>=2.5.0
+  - polars-config-meta[polars]>=0.3.2
+  - attrs
+  requires_python: '>=3.10'
 - pypi: https://files.pythonhosted.org/packages/74/ff/9d30128a88df6c795097b6f73218d4a5afcd0e2d74cf2dedd99b28d42cdc/cyvcf2-0.31.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
   name: cyvcf2
   version: 0.31.4
diff --git a/pixi.toml b/pixi.toml
index 83f7f852..3e54e402 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -83,12 +83,17 @@ basenji2-pytorch = ">=0.1.2"
 [feature.py310.dependencies]
 python = "3.10.*"
 numpy = "1.26.*"
+# numba kept as a CONDA pin only because seqpro (a hard dep) eagerly imports
+# numba, and only the conda build ships a working libllvmlite.so in this env —
+# the PyPI numba/llvmlite wheel fails to load here. genvarloader's OWN code is
+# numba-free (see tests/parity/test_import_no_numba.py); this pin is purely to
+# keep seqpro's transitive numba working. Drop once seqpro stops importing numba.
 numba = "==0.59.1"
 
 [feature.py310.pypi-dependencies]
 pyarrow = ">=21"
 hirola = "==0.3"
-seqpro = "==0.18.0"
+seqpro = "==0.20.0"
 genoray = "==2.12.3"
 polars = "==1.37.1"
 loguru = "*"
@@ -142,9 +147,14 @@ test-join-audit = { cmd = "pytest tests -p tests._join_audit_plugin", depends-on
 typecheck = { cmd = "pyrefly check" }
 bench = { cmd = "pytest tests/benchmarks --codspeed -p no:cov" }
 bench-local = { cmd = "pytest tests/benchmarks --benchmark-only -p no:cov" }
-profile-haps = { cmd = "py-spy record -o tests/benchmarks/profiling/haps.speedscope.json -f speedscope -- python tests/benchmarks/profiling/profile.py --mode haplotypes" }
-profile-tracks = { cmd = "py-spy record -o tests/benchmarks/profiling/tracks.speedscope.json -f speedscope -- python tests/benchmarks/profiling/profile.py --mode tracks" }
-profile-variants = { cmd = "py-spy record -o tests/benchmarks/profiling/variants.speedscope.json -f speedscope -- python tests/benchmarks/profiling/profile.py --mode variants" }
+# perf on the Python process (NOT py-spy --native, which slows deep-stack paths ~10x).
+# No sudo on Carter (perf_event_paranoid=2 allows user-space sampling of own process);
+# resolves genvarloader.abi3.so Rust symbols. View with:
+#   perf report --stdio --no-children -i tests/benchmarks/profiling/<mode>.perf.data
+# $CONDA_PREFIX/bin/python = the active pixi env interpreter (perf must exec the right one).
+profile-haps = { cmd = "perf record -F 999 -o tests/benchmarks/profiling/haps.perf.data -- $CONDA_PREFIX/bin/python tests/benchmarks/profiling/profile.py --mode haplotypes --n-batches 12000" }
+profile-tracks = { cmd = "perf record -F 999 -o tests/benchmarks/profiling/tracks.perf.data -- $CONDA_PREFIX/bin/python tests/benchmarks/profiling/profile.py --mode tracks --n-batches 12000" }
+profile-variants = { cmd = "perf record -F 999 -o tests/benchmarks/profiling/variants.perf.data -- $CONDA_PREFIX/bin/python tests/benchmarks/profiling/profile.py --mode variants --n-batches 12000" }
 memray-haps = { cmd = "memray run -fo tests/benchmarks/profiling/haps.memray.bin tests/benchmarks/profiling/profile.py --mode haplotypes" }
 memray-tracks = { cmd = "memray run -fo tests/benchmarks/profiling/tracks.memray.bin tests/benchmarks/profiling/profile.py --mode tracks" }
 memray-variants = { cmd = "memray run -fo tests/benchmarks/profiling/variants.memray.bin tests/benchmarks/profiling/profile.py --mode variants" }
diff --git a/pyproject.toml b/pyproject.toml
index e39ad6fd..ac046e4d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,10 +10,9 @@ readme = "README.md"
 license = { file = "LICENSE.txt" }
 requires-python = ">=3.10,<3.14" # >= 3.14 blocked by pyarrow/genoray
 dependencies = [
-    "seqpro>=0.18",
+    "seqpro>=0.20",
     "genoray>=2.12.3,<3",
     "numpy",
-    "numba>=0.59.1",
     "loguru",
     "natsort",
     "polars>=1.37.1",
@@ -112,8 +111,8 @@ bad-override = "warn"
 # Mostly the same ArrayDataset / RaggedDataset return-shape drift plus a few
 # polymorphic-return sites that PR5/PR6 will narrow. Keep visible as WARN.
 bad-return = "warn"
-# numba ITYPE default + a default arg mismatch in a small kernel; revisit
-# in PR8 once the surrounding code stabilizes.
+# Default arg mismatch at a few call sites; revisit in PR8 once the
+# surrounding code stabilizes.
 bad-function-definition = "warn"
 # Six call sites with overload friction (seqpro.cast_seqs, Dataset.open,
 # numpy.reshape, genoray.get_record_info). Surface but don't block.
@@ -148,7 +147,7 @@ filterwarnings = [
 ]
 markers = [
     "slow: mark test as slow (deselect with '-m \"not slow\"')",
-    "parity: byte-identical numba-vs-rust differential tests (Rust migration)",
+    "parity: rust-vs-frozen-golden differential tests (Rust migration)",
 ]
 
 [tool.coverage.run]
@@ -168,8 +167,6 @@ exclude_lines = [
     "if TYPE_CHECKING:",
     "raise NotImplementedError",
     "\\.\\.\\.",
-    "@nb.njit",
-    "@numba.njit",
     "raise ImportError\\(\"PyTorch is not available",
 ]
 
diff --git a/python/genvarloader/__init__.py b/python/genvarloader/__init__.py
index 545edf23..c665c73c 100644
--- a/python/genvarloader/__init__.py
+++ b/python/genvarloader/__init__.py
@@ -1,9 +1,9 @@
-# ruff: noqa: E402  cap_numba_threads() must run before any numba kernel imports
+# ruff: noqa: E402  cap_threads() must run before the first rust parallel call
 import importlib.metadata
 
-from ._threads import cap_numba_threads
+from ._threads import cap_threads
 
-cap_numba_threads()
+cap_threads()
 
 from seqpro.bed import read as read_bedlike
 from seqpro.bed import with_len as with_length
@@ -26,6 +26,7 @@
 )
 from ._dataset._rag_variants import RaggedVariants
 from ._dataset._reference import RefDataset, Reference
+from ._dataset._migrate import migrate
 from ._dataset._svar_link import migrate_svar_link
 from ._dataset._write import get_splice_bed, update, write
 from ._dummy import get_dummy_dataset
@@ -71,6 +72,7 @@
     "data_registry",
     "get_dummy_dataset",
     "get_splice_bed",
+    "migrate",
     "migrate_svar_link",
     "read_bedlike",
     "sites_vcf_to_table",
diff --git a/python/genvarloader/_dataset/_flat_flanks.py b/python/genvarloader/_dataset/_flat_flanks.py
index fdb3e957..a6211465 100644
--- a/python/genvarloader/_dataset/_flat_flanks.py
+++ b/python/genvarloader/_dataset/_flat_flanks.py
@@ -6,10 +6,12 @@
 
 from __future__ import annotations
 
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 
+from .._ragged import Ragged
+from .._utils import lengths_to_offsets
+from ..genvarloader import get_reference as _get_reference_ffi
 from ._flat_variants import _FlatWindow
 
 
@@ -80,7 +82,6 @@ def compute_flank_tokens(
     return tokens.reshape(-1), np.asarray(row_offsets, np.int64)
 
 
-@nb.njit(nogil=True, cache=True)  # pragma: no cover - njit
 def _assemble_alt_windows(f5, f3, alt_data, alt_seq_off, flank_len):
     """Concatenate flank5 (fixed L) + alt (variable) + flank3 (fixed L) per variant
     into a flat byte buffer. f5/f3 are (n_var, L) row-major flat (n_var*L,)."""
@@ -219,3 +220,137 @@ def compute_windows(
     )
     alt_w = _FlatWindow(lut[alt_bytes], alt_off, row_off, (None,))
     return ref_w, alt_w
+
+
+class _RefShim:
+    """Minimal reference-object shim wrapping raw (reference, ref_offsets) arrays.
+
+    Implements the ``.fetch(contigs, starts, ends)`` interface used by
+    ``compute_flank_tokens``, ``compute_ref_window``, and ``compute_alt_window``,
+    backed by the ``get_reference`` FFI call so behavior is byte-identical to a
+    ``Reference`` object (same padded-slice logic, same OOB padding).
+    """
+
+    def __init__(
+        self,
+        reference: NDArray[np.uint8],
+        ref_offsets: NDArray[np.int64],
+        pad_char: int,
+    ) -> None:
+        self._ref = np.ascontiguousarray(reference, np.uint8)
+        self._off = np.ascontiguousarray(ref_offsets, np.int64)
+        self._pad = int(pad_char)
+
+    def fetch(
+        self,
+        contigs: NDArray[np.integer],
+        starts: NDArray[np.integer],
+        ends: NDArray[np.integer],
+    ) -> "Ragged":
+        contigs = np.ascontiguousarray(contigs, np.int32)
+        starts = np.ascontiguousarray(starts, np.int32)
+        ends = np.ascontiguousarray(ends, np.int32)
+        n = len(contigs)
+        lengths = np.asarray(ends - starts, np.int64)
+        out_offsets = lengths_to_offsets(lengths)
+        regions = np.stack([contigs, starts, ends], axis=1).astype(np.int32)
+        data = _get_reference_ffi(
+            regions, out_offsets, self._ref, self._off, self._pad, False, None
+        )
+        return Ragged.from_offsets(data.view("S1"), (n, None), out_offsets)
+
+
+def _assemble_variant_buffers_numba(
+    mode: int,
+    v_idxs: NDArray[np.int32],
+    row_offsets: NDArray[np.int64],
+    alt_global: NDArray[np.uint8],
+    alt_off_global: NDArray[np.int64],
+    ref_global: "NDArray[np.uint8] | None",
+    ref_off_global: "NDArray[np.int64] | None",
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: int,
+    alt_mode: int,
+    flank_len: int,
+    lut: "NDArray | None",
+    v_contigs: NDArray[np.int32],
+    v_starts: NDArray[np.int32],
+    ilens: NDArray[np.int32],
+    reference: NDArray[np.uint8],
+    ref_offsets: NDArray[np.int64],
+    pad_char: int,
+) -> "dict[str, tuple[NDArray, NDArray[np.int64]]]":
+    """Numba/numpy oracle for assemble_variant_buffers: composes existing helpers.
+
+    Mirrors the Rust ``assemble_variants_mode`` / ``assemble_windows_mode`` logic,
+    producing the same ``{name: (data, seq_offsets)}`` dict contract. Used as the
+    parity reference in ``assert_kernel_parity_dict``. Does NOT re-implement any
+    sub-kernel logic — delegates entirely to the registered helpers.
+    """
+    from ._flat_variants import _gather_alleles
+
+    v_idxs = np.ascontiguousarray(v_idxs, np.int32)
+    row_offsets = np.ascontiguousarray(row_offsets, np.int64)
+    alt_global = np.ascontiguousarray(alt_global, np.uint8)
+    alt_off_global = np.ascontiguousarray(alt_off_global, np.int64)
+
+    out: dict[str, tuple[NDArray, NDArray[np.int64]]] = {}
+
+    if mode == 0:  # variants mode
+        alt_data, alt_seq_off = _gather_alleles(v_idxs, alt_global, alt_off_global)
+        out["alt"] = (alt_data, alt_seq_off)
+
+        if want_ref_bytes and ref_global is not None and ref_off_global is not None:
+            rg = np.ascontiguousarray(ref_global, np.uint8)
+            ro = np.ascontiguousarray(ref_off_global, np.int64)
+            ref_data, ref_seq_off = _gather_alleles(v_idxs, rg, ro)
+            out["ref"] = (ref_data, ref_seq_off)
+
+        if want_flank:
+            # v_starts / ilens are GLOBAL per-variant arrays; gather by v_idxs.
+            starts_v = np.asarray(v_starts, np.int32)[v_idxs]
+            ilens_v = np.asarray(ilens, np.int32)[v_idxs]
+            ref_shim = _RefShim(reference, ref_offsets, pad_char)
+            tok, off = compute_flank_tokens(
+                ref_shim, v_contigs, starts_v, ilens_v, flank_len, lut, row_offsets
+            )
+            out["flank_tokens"] = (tok, off)
+
+    else:  # windows mode
+        alt_data, alt_seq_off = _gather_alleles(v_idxs, alt_global, alt_off_global)
+        # v_starts / ilens are GLOBAL; gather by v_idxs before passing to helpers.
+        starts_v = np.asarray(v_starts, np.int32)[v_idxs]
+        ilens_v = np.asarray(ilens, np.int32)[v_idxs]
+        ref_shim = _RefShim(reference, ref_offsets, pad_char)
+
+        if ref_mode == 1:  # flanked ref window: [start-L, end+L)
+            rw = compute_ref_window(
+                ref_shim, v_contigs, starts_v, ilens_v, flank_len, lut, row_offsets
+            )
+            out["ref_window"] = (rw.data, rw.seq_offsets)
+        elif ref_mode == 2:  # bare tokenized ref allele (no flanks)
+            rg = np.ascontiguousarray(ref_global, np.uint8)
+            ro = np.ascontiguousarray(ref_off_global, np.int64)
+            ref_data, ref_seq_off = _gather_alleles(v_idxs, rg, ro)
+            rw = tokenize_alleles(ref_data, ref_seq_off, lut, row_offsets)
+            out["ref"] = (rw.data, rw.seq_offsets)
+
+        if alt_mode == 1:  # flanked alt window: flank5 . alt . flank3
+            aw = compute_alt_window(
+                ref_shim,
+                v_contigs,
+                starts_v,
+                ilens_v,
+                alt_data,
+                alt_seq_off,
+                flank_len,
+                lut,
+                row_offsets,
+            )
+            out["alt_window"] = (aw.data, aw.seq_offsets)
+        elif alt_mode == 2:  # bare tokenized alt allele (no flanks)
+            aw = tokenize_alleles(alt_data, alt_seq_off, lut, row_offsets)
+            out["alt"] = (aw.data, aw.seq_offsets)
+
+    return out
diff --git a/python/genvarloader/_dataset/_flat_variants.py b/python/genvarloader/_dataset/_flat_variants.py
index 22fe5b5d..0979d6de 100644
--- a/python/genvarloader/_dataset/_flat_variants.py
+++ b/python/genvarloader/_dataset/_flat_variants.py
@@ -6,10 +6,29 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Literal
 
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 
+from ..genvarloader import compact_keep_f32 as _compact_keep_f32_rust
+from ..genvarloader import compact_keep_i32 as _compact_keep_i32_rust
+from ..genvarloader import fill_empty_fixed_f32 as _fill_empty_fixed_f32_rust
+from ..genvarloader import fill_empty_fixed_i32 as _fill_empty_fixed_i32_rust
+from ..genvarloader import fill_empty_scalar_f32 as _fill_empty_scalar_f32_rust
+from ..genvarloader import fill_empty_scalar_i32 as _fill_empty_scalar_i32_rust
+from ..genvarloader import (
+    assemble_variant_buffers_i32 as _assemble_variant_buffers_i32_rust,
+)
+from ..genvarloader import (
+    assemble_variant_buffers_u8 as _assemble_variant_buffers_u8_rust,
+)
+from ..genvarloader import fill_empty_seq_i32 as _fill_empty_seq_i32_rust
+from ..genvarloader import fill_empty_seq_u8 as _fill_empty_seq_u8_rust
+from ..genvarloader import gather_alleles as _gather_alleles_rust
+from ..genvarloader import gather_rows_f32 as _gather_rows_f32_rust
+from ..genvarloader import gather_rows_i32 as _gather_rows_i32_rust
+from ..genvarloader import rc_alleles as _rc_alleles_rust_kernel
+from ._genotypes import _as_starts_stops
+
 if TYPE_CHECKING:
     from ._haps import Haps
 
@@ -99,26 +118,18 @@ def to_ragged(self):
     def reverse_masked(self, mask: NDArray[np.bool_]) -> "_FlatAlleles":
         """DNA reverse-complement the mask-selected rows' alleles, in place.
 
-        ``mask`` is one entry per region (length ``b``); it is broadcast across
-        ploidy then across each (b*p) row's variant count, exactly matching
-        ``RaggedVariants.rc_`` (``np.repeat(to_rc, ploidy)`` then
-        ``np.repeat(per_bp, np.diff(group_off))``).
+        ``mask`` is one entry per region (length ``b``); broadcast across ploidy
+        to a per-(b*p) row mask, then expanded per-allele inside the dispatched
+        ``rc_alleles`` kernel (rust default, seqpro reference).
         """
-        from seqpro.rag import Ragged
-
-        from .._ragged import reverse_complement_masked
-
         m = np.ascontiguousarray(mask, np.bool_).reshape(-1)
-        # per-(b*p) mask: broadcast each region's flag across ploidy
-        per_bp = np.repeat(m, self.ploidy)
-        # per-allele mask: repeat each row's flag across its variant count
-        per_allele = np.repeat(per_bp, np.diff(self.var_offsets))
-        view = Ragged.from_offsets(
-            self.byte_data.view("S1"),
-            (per_allele.size, None),
+        per_bp = np.repeat(m, self.ploidy)  # per-(b*p) row mask
+        _rc_alleles_rust(
+            self.byte_data,
             np.asarray(self.seq_offsets, np.int64),
+            np.asarray(self.var_offsets, np.int64),
+            per_bp,
         )
-        reverse_complement_masked(view, per_allele)  # mutates byte_data in place
         return self
 
     def reshape(self, shape: int | tuple[int, ...]) -> "_FlatAlleles":
@@ -429,233 +440,338 @@ def fill_empty_groups(
         return out
 
 
-@nb.njit(nogil=True, cache=True)
-def _gather_v_idxs(
-    geno_offset_idx, geno_offsets, geno_v_idxs
-):  # pragma: no cover - njit
-    """Gather per-row variant indices: for each row's offset slice into the
-    sparse arrays, copy its values out into flat ``(data, offsets)``.
+def _gather_alleles(v_idxs, allele_bytes, allele_offsets):
+    return _gather_alleles_rust(
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(allele_bytes, np.uint8),
+        np.ascontiguousarray(allele_offsets, np.int64),
+    )
 
-    ``geno_offsets`` must be 1-D contiguous (length n_rows + 1).  For the
-    non-contiguous (2, n_rows) starts/stops form use :func:`_gather_v_idxs_ss`.
-    """
-    n_rows = geno_offset_idx.shape[0]
-    out_offsets = np.empty(n_rows + 1, np.int64)
-    out_offsets[0] = 0
-    for i in range(n_rows):
-        goi = geno_offset_idx[i]
-        out_offsets[i + 1] = out_offsets[i] + (
-            geno_offsets[goi + 1] - geno_offsets[goi]
-        )
-    total = out_offsets[n_rows]
-    v_idxs = np.empty(total, geno_v_idxs.dtype)
-    dst = 0
-    for i in range(n_rows):
-        goi = geno_offset_idx[i]
-        s = geno_offsets[goi]
-        e = geno_offsets[goi + 1]
-        for k in range(s, e):
-            v_idxs[dst] = geno_v_idxs[k]
-            dst += 1
-    return v_idxs, out_offsets
-
-
-@nb.njit(nogil=True, cache=True)
-def _gather_v_idxs_ss(
-    geno_offset_idx, geno_starts, geno_stops, geno_v_idxs
-):  # pragma: no cover - njit
-    """Like :func:`_gather_v_idxs` but for non-contiguous (starts, stops) offsets.
-
-    ``geno_starts`` and ``geno_stops`` are the two rows of a ``(2, n)`` offset
-    array (``geno_starts = geno_offsets[0]``, ``geno_stops = geno_offsets[1]``).
-    """
+
+def _gather_rows_numpy(geno_offset_idx, off2d, data):
+    """Dtype-preserving row gather for arbitrary dtypes (numpy fallback)."""
+    geno_starts = off2d[0]
+    geno_stops = off2d[1]
     n_rows = geno_offset_idx.shape[0]
     out_offsets = np.empty(n_rows + 1, np.int64)
     out_offsets[0] = 0
     for i in range(n_rows):
-        goi = geno_offset_idx[i]
+        goi = int(geno_offset_idx[i])
         out_offsets[i + 1] = out_offsets[i] + (geno_stops[goi] - geno_starts[goi])
-    total = out_offsets[n_rows]
-    v_idxs = np.empty(total, geno_v_idxs.dtype)
+    total = int(out_offsets[n_rows])
+    out_data = np.empty(total, data.dtype)
     dst = 0
     for i in range(n_rows):
-        goi = geno_offset_idx[i]
-        s = geno_starts[goi]
-        e = geno_stops[goi]
-        for k in range(s, e):
-            v_idxs[dst] = geno_v_idxs[k]
-            dst += 1
-    return v_idxs, out_offsets
-
-
-@nb.njit(nogil=True, cache=True)
-def _gather_alleles(v_idxs, allele_bytes, allele_offsets):  # pragma: no cover - njit
-    """Gather variable-length allele bytestrings for ``v_idxs`` from the global
-    allele byte buffer into flat ``(data, seq_offsets)``."""
-    n = v_idxs.shape[0]
-    seq_offsets = np.empty(n + 1, np.int64)
-    seq_offsets[0] = 0
-    for i in range(n):
-        v = v_idxs[i]
-        seq_offsets[i + 1] = seq_offsets[i] + (
-            allele_offsets[v + 1] - allele_offsets[v]
-        )
-    data = np.empty(seq_offsets[n], np.uint8)
-    dst = 0
-    for i in range(n):
-        v = v_idxs[i]
-        s = allele_offsets[v]
-        e = allele_offsets[v + 1]
-        for k in range(s, e):
-            data[dst] = allele_bytes[k]
-            dst += 1
-    return data, seq_offsets
-
-
-@nb.njit(nogil=True, cache=True)
-def _compact_keep(v_idxs, row_offsets, keep):  # pragma: no cover - njit
-    """Drop variants where ``keep`` is False, rebuilding row offsets. The first
-    param is per-variant values to compact -- either ``v_idxs`` itself or a
-    parallel array (e.g. gathered dosage values) sharing the same row layout."""
+        goi = int(geno_offset_idx[i])
+        s = int(geno_starts[goi])
+        e = int(geno_stops[goi])
+        out_data[dst : dst + (e - s)] = data[s:e]
+        dst += e - s
+    return out_data, out_offsets
+
+
+def _compact_keep_numpy(v_idxs, row_offsets, keep):
+    """Dtype-preserving compact-keep for arbitrary dtypes (numpy fallback)."""
     n_rows = row_offsets.shape[0] - 1
     new_offsets = np.empty(n_rows + 1, np.int64)
     new_offsets[0] = 0
-    n_keep = 0
     for i in range(n_rows):
-        for j in range(row_offsets[i], row_offsets[i + 1]):
-            if keep[j]:
-                n_keep += 1
-        new_offsets[i + 1] = n_keep
+        cnt = int(np.count_nonzero(keep[row_offsets[i] : row_offsets[i + 1]]))
+        new_offsets[i + 1] = new_offsets[i] + cnt
+    n_keep = int(new_offsets[n_rows])
     new_v = np.empty(n_keep, v_idxs.dtype)
-    dst = 0
-    for j in range(v_idxs.shape[0]):
-        if keep[j]:
-            new_v[dst] = v_idxs[j]
-            dst += 1
+    new_v[:] = v_idxs[keep]
     return new_v, new_offsets
 
 
+def _compact_keep(v_idxs, row_offsets, keep):
+    """Dispatch compact-keep by dtype, preserving the input dtype without down-cast.
+
+    Routes int32 → compact_keep_i32 (Rust), float32 → compact_keep_f32 (Rust).
+    All other dtypes (e.g. int16, int64 custom FORMAT fields, issue #231) fall
+    back to the dtype-preserving numpy kernel so values are never silently
+    coerced.
+    """
+    values = np.ascontiguousarray(v_idxs)
+    row_offsets = np.ascontiguousarray(row_offsets, np.int64)
+    keep = np.ascontiguousarray(keep, np.bool_)
+    if values.dtype == np.int32:
+        return _compact_keep_i32_rust(values, row_offsets, keep)
+    if values.dtype == np.float32:
+        return _compact_keep_f32_rust(values, row_offsets, keep)
+    # Arbitrary dtypes (custom FORMAT fields, e.g. int16, int64): dtype-preserving
+    # numpy fallback — never down-cast.
+    return _compact_keep_numpy(values, row_offsets, keep)
+
+
 def _gather_rows(
     geno_offset_idx: NDArray[np.intp],
     offsets: NDArray[np.int64],
     data: NDArray,
 ) -> tuple[NDArray, NDArray[np.int64]]:
-    """Dispatch to the correct gather kernel based on offset array shape.
+    """Dispatch per-row gather (numba/rust), preserving data dtype.
 
-    ``offsets`` may be:
-    - 1-D ``(n + 1,)``: contiguous offsets — use :func:`_gather_v_idxs`.
-    - 2-D ``(2, n)``: non-contiguous starts/stops — use :func:`_gather_v_idxs_ss`.
+    Routes int32 and float32 to typed Rust cores; all other dtypes fall back to
+    the dtype-preserving numpy kernel so values are never silently down-cast
+    (e.g. custom per-call FORMAT fields, issue #231).
     """
-    if offsets.ndim == 1:
-        return _gather_v_idxs(geno_offset_idx, offsets, data)
-    else:
-        return _gather_v_idxs_ss(geno_offset_idx, offsets[0], offsets[1], data)
-
-
-@nb.njit(nogil=True, cache=True)
-def _fill_empty_scalar(data, offsets, fill):  # pragma: no cover - njit
-    """Insert one ``fill`` element into each empty row; copy non-empty rows
-    through. Returns ``(new_data, new_offsets)``."""
+    goi = np.ascontiguousarray(geno_offset_idx, np.int64)
+    off2d = _as_starts_stops(offsets)
+    data = np.ascontiguousarray(data)
+    if data.dtype == np.int32:
+        return _gather_rows_i32_rust(goi, off2d, data)
+    if data.dtype == np.float32:
+        return _gather_rows_f32_rust(goi, off2d, data)
+    # Arbitrary custom-FORMAT-field dtypes (#231): no typed Rust core — use the
+    # dtype-preserving numpy kernel directly so values are never down-cast.
+    return _gather_rows_numpy(goi, off2d, data)
+
+
+def _fill_empty_scalar_numpy(data, offsets, fill):
+    """Dtype-preserving fill-empty-scalar for arbitrary dtypes (numpy fallback)."""
     n_rows = offsets.shape[0] - 1
+    lengths = np.diff(offsets)
+    new_lengths = np.where(lengths > 0, lengths, 1)
     new_offsets = np.empty(n_rows + 1, np.int64)
     new_offsets[0] = 0
-    for i in range(n_rows):
-        ln = offsets[i + 1] - offsets[i]
-        new_offsets[i + 1] = new_offsets[i] + (ln if ln > 0 else 1)
+    new_offsets[1:] = np.cumsum(new_lengths)
     new_data = np.empty(new_offsets[n_rows], data.dtype)
     for i in range(n_rows):
-        s = offsets[i]
-        e = offsets[i + 1]
-        d = new_offsets[i]
+        s, e = int(offsets[i]), int(offsets[i + 1])
+        d = int(new_offsets[i])
         if e == s:
             new_data[d] = fill
         else:
-            for k in range(s, e):
-                new_data[d] = data[k]
-                d += 1
+            new_data[d : d + (e - s)] = data[s:e]
     return new_data, new_offsets
 
 
-@nb.njit(nogil=True, cache=True)
-def _fill_empty_seq(data, var_offsets, seq_offsets, dummy):  # pragma: no cover - njit
-    """Two-level analogue of ``_fill_empty_scalar`` for allele bytestrings.
-    Empty variant-rows receive one dummy allele of ``dummy`` bytes. Returns
-    ``(new_data, new_var_offsets, new_seq_offsets)``."""
+def _fill_empty_scalar(data, offsets, fill):
+    """Dtype-preserving dispatch for fill-empty-scalar.
+
+    Routes int32 and float32 to typed Rust cores; all other dtypes (e.g.
+    custom FORMAT fields, issue #231) fall back to the dtype-preserving numpy
+    kernel so values are never silently down-cast.
+    """
+    data = np.ascontiguousarray(data)
+    offsets = np.ascontiguousarray(offsets, np.int64)
+    if data.dtype == np.int32:
+        return _fill_empty_scalar_i32_rust(data, offsets, int(fill))
+    if data.dtype == np.float32:
+        return _fill_empty_scalar_f32_rust(data, offsets, float(fill))
+    # Arbitrary dtype (custom FORMAT fields): preserve dtype via numpy fallback.
+    return _fill_empty_scalar_numpy(data, offsets, fill)
+
+
+def _fill_empty_seq_numpy(data, var_offsets, seq_offsets, dummy):
+    """Dtype-preserving fill-empty-seq for arbitrary dtypes (numpy fallback)."""
     n_rows = var_offsets.shape[0] - 1
     L = dummy.shape[0]
+    nv_lengths = np.diff(var_offsets)
+    new_var_lengths = np.where(nv_lengths > 0, nv_lengths, 1)
     new_var = np.empty(n_rows + 1, np.int64)
     new_var[0] = 0
-    for i in range(n_rows):
-        nv = var_offsets[i + 1] - var_offsets[i]
-        new_var[i + 1] = new_var[i] + (nv if nv > 0 else 1)
-    total_vars = new_var[n_rows]
+    new_var[1:] = np.cumsum(new_var_lengths)
+    total_vars = int(new_var[n_rows])
     new_seq = np.empty(total_vars + 1, np.int64)
     new_seq[0] = 0
     vptr = 0
     for i in range(n_rows):
-        vs = var_offsets[i]
-        ve = var_offsets[i + 1]
+        vs, ve = int(var_offsets[i]), int(var_offsets[i + 1])
         if ve == vs:
             new_seq[vptr + 1] = new_seq[vptr] + L
             vptr += 1
         else:
             for v in range(vs, ve):
-                vlen = seq_offsets[v + 1] - seq_offsets[v]
+                vlen = int(seq_offsets[v + 1]) - int(seq_offsets[v])
                 new_seq[vptr + 1] = new_seq[vptr] + vlen
                 vptr += 1
-    new_data = np.empty(new_seq[total_vars], data.dtype)
+    total_bytes = int(new_seq[total_vars])
+    new_data = np.empty(total_bytes, data.dtype)
     vptr = 0
     dptr = 0
     for i in range(n_rows):
-        vs = var_offsets[i]
-        ve = var_offsets[i + 1]
+        vs, ve = int(var_offsets[i]), int(var_offsets[i + 1])
         if ve == vs:
-            for k in range(L):
-                new_data[dptr] = dummy[k]
-                dptr += 1
+            new_data[dptr : dptr + L] = dummy
+            dptr += L
             vptr += 1
         else:
             for v in range(vs, ve):
-                bs = seq_offsets[v]
-                be = seq_offsets[v + 1]
-                for k in range(bs, be):
-                    new_data[dptr] = data[k]
-                    dptr += 1
+                bs, be = int(seq_offsets[v]), int(seq_offsets[v + 1])
+                new_data[dptr : dptr + (be - bs)] = data[bs:be]
+                dptr += be - bs
                 vptr += 1
     return new_data, new_var, new_seq
 
 
-@nb.njit(nogil=True, cache=True)
-def _fill_empty_fixed(data, offsets, inner, fill):  # pragma: no cover - njit
-    """Fixed-inner-stride analogue of ``_fill_empty_scalar`` for ``flank_tokens``.
+def _fill_empty_seq(data, var_offsets, seq_offsets, dummy):
+    """Dtype-preserving dispatch for fill-empty-seq (two-level dummy-fill).
 
-    ``data`` holds ``n_var * inner`` tokens (variant-major); ``offsets`` are
-    *variant-level* (``b*p + 1``). Each empty row receives one dummy variant of
-    ``inner`` tokens all equal to ``fill``; non-empty rows pass through.
-    Returns ``(new_data, new_offsets)``."""
+    Routes uint8 (allele bytes) and int32 (token windows) to typed Rust cores.
+    All other dtypes fall back to the dtype-preserving numpy kernel so values
+    are never silently down-cast.
+    """
+    data = np.ascontiguousarray(data)
+    var_offsets = np.ascontiguousarray(var_offsets, np.int64)
+    seq_offsets = np.ascontiguousarray(seq_offsets, np.int64)
+    dummy = np.ascontiguousarray(dummy, data.dtype)
+    if data.dtype == np.uint8:
+        return _fill_empty_seq_u8_rust(data, var_offsets, seq_offsets, dummy)
+    if data.dtype == np.int32:
+        return _fill_empty_seq_i32_rust(data, var_offsets, seq_offsets, dummy)
+    # Arbitrary dtype: preserve via numpy fallback.
+    return _fill_empty_seq_numpy(data, var_offsets, seq_offsets, dummy)
+
+
+def _fill_empty_fixed_numpy(data, offsets, inner, fill):
+    """Dtype-preserving fill-empty-fixed for arbitrary dtypes (numpy fallback)."""
     n_rows = offsets.shape[0] - 1
+    lengths = np.diff(offsets)
+    new_lengths = np.where(lengths > 0, lengths, 1)
     new_offsets = np.empty(n_rows + 1, np.int64)
     new_offsets[0] = 0
-    for i in range(n_rows):
-        nv = offsets[i + 1] - offsets[i]
-        new_offsets[i + 1] = new_offsets[i] + (nv if nv > 0 else 1)
-    total_vars = new_offsets[n_rows]
+    new_offsets[1:] = np.cumsum(new_lengths)
+    total_vars = int(new_offsets[n_rows])
     new_data = np.empty(total_vars * inner, data.dtype)
     dptr = 0
     for i in range(n_rows):
-        vs = offsets[i]
-        ve = offsets[i + 1]
+        vs, ve = int(offsets[i]), int(offsets[i + 1])
         if ve == vs:
-            for _ in range(inner):
-                new_data[dptr] = fill
-                dptr += 1
+            new_data[dptr : dptr + inner] = fill
+            dptr += inner
         else:
-            for k in range(vs * inner, ve * inner):
-                new_data[dptr] = data[k]
-                dptr += 1
+            n = int(ve - vs) * inner
+            new_data[dptr : dptr + n] = data[vs * inner : ve * inner]
+            dptr += n
     return new_data, new_offsets
 
 
+def _fill_empty_fixed(data, offsets, inner, fill):
+    """Dtype-preserving dispatch for fill-empty-fixed.
+
+    Routes int32 and float32 to typed Rust cores; all other dtypes (e.g.
+    custom FORMAT fields, issue #231) fall back to the dtype-preserving numpy
+    kernel so values are never silently down-cast.
+    """
+    data = np.ascontiguousarray(data)
+    offsets = np.ascontiguousarray(offsets, np.int64)
+    if data.dtype == np.int32:
+        return _fill_empty_fixed_i32_rust(data, offsets, int(inner), int(fill))
+    if data.dtype == np.float32:
+        return _fill_empty_fixed_f32_rust(data, offsets, int(inner), float(fill))
+    # Arbitrary dtype (custom FORMAT fields): preserve dtype via numpy fallback.
+    return _fill_empty_fixed_numpy(data, offsets, inner, fill)
+
+
+def _assemble_variant_buffers_numba_entry(*args, **kwargs):
+    """Lazy wrapper for _assemble_variant_buffers_numba to avoid circular import.
+
+    ``_flat_flanks`` imports ``_FlatWindow`` from ``_flat_variants`` at module
+    level, so ``_flat_variants`` cannot import from ``_flat_flanks`` at module
+    level. This thin wrapper defers the import to call time.
+    """
+    from ._flat_flanks import _assemble_variant_buffers_numba
+
+    return _assemble_variant_buffers_numba(*args, **kwargs)
+
+
+def _assemble_variant_buffers_rust(
+    mode,
+    v_idxs,
+    row_offsets,
+    alt_global,
+    alt_off_global,
+    ref_global,
+    ref_off_global,
+    want_ref_bytes,
+    want_flank,
+    ref_mode,
+    alt_mode,
+    flank_len,
+    lut,
+    v_contigs,
+    v_starts,
+    ilens,
+    reference,
+    ref_offsets,
+    pad_char,
+):
+    """Dtype-selecting shim: routes to assemble_variant_buffers_u8/i32 by lut dtype.
+
+    If ``lut`` is None (variants mode with no flank tokens), defaults to the u8
+    monomorphization (token buffers are empty so dtype is irrelevant).
+    """
+    if lut is None:
+        fn = _assemble_variant_buffers_u8_rust
+        lut_arr = None
+    else:
+        lut_arr = np.asarray(lut)
+        if lut_arr.dtype == np.uint8:
+            fn = _assemble_variant_buffers_u8_rust
+            lut_arr = np.ascontiguousarray(lut_arr, np.uint8)
+        else:
+            fn = _assemble_variant_buffers_i32_rust
+            lut_arr = np.ascontiguousarray(lut_arr, np.int32)
+    return fn(
+        int(mode),
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(row_offsets, np.int64),
+        np.ascontiguousarray(alt_global, np.uint8),
+        np.ascontiguousarray(alt_off_global, np.int64),
+        None if ref_global is None else np.ascontiguousarray(ref_global, np.uint8),
+        None
+        if ref_off_global is None
+        else np.ascontiguousarray(ref_off_global, np.int64),
+        bool(want_ref_bytes),
+        bool(want_flank),
+        int(ref_mode),
+        int(alt_mode),
+        int(flank_len),
+        lut_arr,
+        np.ascontiguousarray(v_contigs, np.int32),
+        np.ascontiguousarray(v_starts, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+        np.ascontiguousarray(reference, np.uint8),
+        np.ascontiguousarray(ref_offsets, np.int64),
+        int(pad_char),
+    )
+
+
+def _rc_alleles_reference(byte_data, seq_offsets, var_offsets, to_rc_row):
+    """Reference backend: seqpro reverse_complement_masked on a flat allele view.
+
+    `to_rc_row` is the per-(b*p) row mask (already ploidy-broadcast); expand to
+    per-allele via `var_offsets`, then RC each masked allele in place. Mutates
+    `byte_data` in place; byte-identical to `rc_alleles_inplace`.
+    """
+    from seqpro.rag import Ragged
+
+    from .._ragged import reverse_complement_masked
+
+    seq_off = np.ascontiguousarray(seq_offsets, np.int64)
+    var_off = np.ascontiguousarray(var_offsets, np.int64)
+    row_mask = np.ascontiguousarray(to_rc_row, np.bool_).reshape(-1)
+    if not row_mask.any():
+        return
+    per_allele = np.repeat(row_mask, np.diff(var_off))
+    n_alleles = len(seq_off) - 1
+    view = Ragged.from_offsets(byte_data.view("S1"), (n_alleles, None), seq_off)
+    reverse_complement_masked(view, per_allele)  # mutates byte_data in place
+
+
+def _rc_alleles_rust(byte_data, seq_offsets, var_offsets, to_rc_row):
+    assert byte_data.dtype == np.uint8 and byte_data.flags.c_contiguous, (
+        "rc_alleles requires a contiguous uint8 byte_data for in-place RC"
+    )
+    _rc_alleles_rust_kernel(
+        byte_data,
+        np.ascontiguousarray(seq_offsets, np.int64),
+        np.ascontiguousarray(var_offsets, np.int64),
+        np.ascontiguousarray(to_rc_row, np.bool_),
+    )
+
+
 def get_variants_flat(
     haps: "Haps", idx: NDArray[np.integer], regions=None
 ) -> "_FlatVariants | _FlatVariantWindows":
@@ -730,25 +846,15 @@ def get_variants_flat(
 
     shape: tuple[int | None, ...] = (b, eff_ploidy, None)
 
-    fields: dict[str, Any] = {}
+    opt = haps.window_opt
 
-    # alt: ALWAYS (required)
-    alt_bytes = np.asarray(haps.variants.alt.data).view(np.uint8)
-    alt_off = np.asarray(haps.variants.alt.offsets, np.int64)
-    alt_data, alt_seq_off = _gather_alleles(v_idxs, alt_bytes, alt_off)
-    fields["alt"] = _FlatAlleles(alt_data, alt_seq_off, row_offsets, shape)
+    # --- Build scalar (non-allele) fields shared between both return paths ---
+    fields: dict[str, Any] = {}
 
-    # start: ALWAYS (added unconditionally by _get_variants)
+    # start: ALWAYS
     start_data = np.asarray(haps.variants.start)[v_idxs]
     fields["start"] = _Flat.from_offsets(start_data, shape, row_offsets)
 
-    # ref: if "ref" in var_fields
-    if "ref" in haps.var_fields:
-        ref_bytes = np.asarray(haps.variants.ref.data).view(np.uint8)
-        ref_off = np.asarray(haps.variants.ref.offsets, np.int64)
-        ref_data, ref_seq_off = _gather_alleles(v_idxs, ref_bytes, ref_off)
-        fields["ref"] = _FlatAlleles(ref_data, ref_seq_off, row_offsets, shape)
-
     # ilen: if "ilen" in var_fields
     if "ilen" in haps.var_fields:
         ilen_data = np.asarray(haps.variants.ilen)[v_idxs]
@@ -776,116 +882,163 @@ def get_variants_flat(
         info_data = np.asarray(haps.variants.info[k])[v_idxs]
         fields[k] = _Flat.from_offsets(info_data, shape, row_offsets)
 
-    flat = _FlatVariants(fields)
+    # --- Step 1: Compute shared kernel inputs ---
+    stat = haps.ffi_static
+    needs_fetch = (
+        regions is not None
+        and haps.token_lut is not None
+        and (
+            (issubclass(haps.kind, _FlatVariantWindows) and opt is not None)
+            or bool(haps.flank_length)
+        )
+    )
+    if needs_fetch:
+        regions_arr = np.asarray(regions)
+        group_contigs = np.repeat(regions_arr[:, 0], eff_ploidy)
+        v_contigs = np.repeat(group_contigs, np.diff(row_offsets)).astype(np.int32)
+    else:
+        v_contigs = np.zeros(len(v_idxs), np.int32)
 
-    # variant-windows kind: emit per-allele window/allele token buffers (a
-    # different output type) and return early.
-    opt = haps.window_opt
+    ref_present = "ref" in haps.var_fields and haps.variants.ref is not None
+    ref_global = ref_off_global = None
+    if ref_present or (
+        issubclass(haps.kind, _FlatVariantWindows)
+        and opt is not None
+        and (opt.ref == "allele")
+    ):
+        ref_global = np.asarray(haps.variants.ref.data).view(np.uint8)
+        ref_off_global = np.asarray(haps.variants.ref.offsets, np.int64)
+
+    # --- Step 2: variant-windows kind: emit per-allele token buffers (early return) ---
     if (
         regions is not None
         and issubclass(haps.kind, _FlatVariantWindows)
         and opt is not None
     ):
-        from ._flat_flanks import (
-            compute_alt_window,
-            compute_ref_window,
-            compute_windows,
-            tokenize_alleles,
-        )
-
         L = opt.flank_length
-        lut = haps.token_lut
-        starts_v = np.asarray(haps.variants.start)[v_idxs]
-        ilens_v = np.asarray(haps.variants.ilen)[v_idxs]
-        regions = np.asarray(regions)
-        group_contigs = np.repeat(regions[:, 0], eff_ploidy)
-        v_contigs = np.repeat(group_contigs, np.diff(row_offsets))
+        ref_mode = 1 if opt.ref == "window" else 2
+        alt_mode = 1 if opt.alt == "window" else 2
+        bufs = _assemble_variant_buffers_rust(
+            1,  # windows mode
+            v_idxs,
+            row_offsets,
+            stat.alt_alleles,
+            stat.alt_offsets,
+            ref_global,
+            ref_off_global,
+            False,  # want_ref_bytes (windows mode emits tokens, not raw bytes)
+            False,  # want_flank
+            ref_mode,
+            alt_mode,
+            L,
+            haps.token_lut,
+            v_contigs,
+            stat.v_starts,
+            stat.ilens,
+            stat.ref,
+            stat.ref_offsets,
+            haps.reference.pad_char,
+        )
         wshape = (b, eff_ploidy, None, None)
         wfields = {k: v for k, v in fields.items() if k not in ("alt", "ref")}
         win = _FlatVariantWindows(wfields)
-
-        if opt.ref == "window" and opt.alt == "window":
-            # Hot path: single fused fetch produces both windows.
-            rw, aw = compute_windows(
-                haps.reference,
-                v_contigs,
-                starts_v,
-                ilens_v,
-                alt_data,
-                alt_seq_off,
-                L,
-                lut,
-                row_offsets,
-            )
-            rw.shape = wshape
-            aw.shape = wshape
-            win.ref_window = rw
-            win.alt_window = aw
-        else:
-            if opt.ref == "window":
-                rw = compute_ref_window(
-                    haps.reference, v_contigs, starts_v, ilens_v, L, lut, row_offsets
-                )
-                rw.shape = wshape
-                win.ref_window = rw
-            else:  # "allele": bare tokenized ref allele
-                ref_bytes = np.asarray(haps.variants.ref.data).view(np.uint8)
-                ref_off = np.asarray(haps.variants.ref.offsets, np.int64)
-                ref_data, ref_seq_off = _gather_alleles(v_idxs, ref_bytes, ref_off)
-                rw = tokenize_alleles(ref_data, ref_seq_off, lut, row_offsets)
-                rw.shape = wshape
-                win.ref = rw
-
-            if opt.alt == "window":
-                aw = compute_alt_window(
-                    haps.reference,
-                    v_contigs,
-                    starts_v,
-                    ilens_v,
-                    alt_data,
-                    alt_seq_off,
-                    L,
-                    lut,
-                    row_offsets,
-                )
-                aw.shape = wshape
-                win.alt_window = aw
-            else:  # "allele": bare tokenized alt allele
-                aw = tokenize_alleles(alt_data, alt_seq_off, lut, row_offsets)
-                aw.shape = wshape
-                win.alt = aw
-
+        for name, (data, seq_off) in bufs.items():
+            fw = _FlatWindow(data, np.asarray(seq_off, np.int64), row_offsets, wshape)
+            setattr(win, name, fw)
         if haps.dummy_variant is not None:
             win = win.fill_empty_groups(
                 haps.dummy_variant, unk=haps.unknown_token, flank_length=L
             )
-
         return win
 
-    # ride-along flank tokens on the plain variants output.
-    if haps.flank_length and haps.token_lut is not None and regions is not None:
-        from ._flat_flanks import compute_flank_tokens
+    # --- Step 3: plain-variants path: route allele bytes + flank tokens through kernel ---
+    want_flank = bool(
+        haps.flank_length and haps.token_lut is not None and regions is not None
+    )
+    L = haps.flank_length or 0
+    bufs = _assemble_variant_buffers_rust(
+        0,  # variants mode
+        v_idxs,
+        row_offsets,
+        stat.alt_alleles,
+        stat.alt_offsets,
+        ref_global,
+        ref_off_global,
+        ref_present,  # want_ref_bytes
+        want_flank,
+        0,  # ref_mode (unused in variants mode)
+        0,  # alt_mode (unused)
+        L,
+        haps.token_lut,
+        v_contigs,
+        stat.v_starts,
+        stat.ilens,
+        stat.ref if stat.ref is not None else np.zeros(0, np.uint8),
+        stat.ref_offsets if stat.ref_offsets is not None else np.zeros(1, np.int64),
+        haps.reference.pad_char if haps.reference is not None else 0,
+    )
 
-        L = haps.flank_length
-        starts_v = np.asarray(haps.variants.start)[v_idxs]
-        ilens_v = np.asarray(haps.variants.ilen)[v_idxs]
-        regions = np.asarray(regions)
-        group_contigs = np.repeat(regions[:, 0], eff_ploidy)  # (b*eff_ploidy,)
-        v_contigs = np.repeat(group_contigs, np.diff(row_offsets))  # (n_var,)
+    # Build fields in ORIGINAL insertion order (alt FIRST, then start, ref, rest).
+    # Prepend alt; reconstruct from scalar fields inserting ref after start.
+    final_fields: dict[str, Any] = {}
+    alt_data, alt_seq_off = bufs["alt"]
+    final_fields["alt"] = _FlatAlleles(
+        np.asarray(alt_data, np.uint8),
+        np.asarray(alt_seq_off, np.int64),
+        row_offsets,
+        shape,
+    )
+    for k, v in fields.items():
+        if k == "start":
+            final_fields["start"] = v
+            # Insert ref immediately after start (original order: alt, start, ref, ilen, ...)
+            if "ref" in bufs:
+                ref_data, ref_seq_off = bufs["ref"]
+                final_fields["ref"] = _FlatAlleles(
+                    np.asarray(ref_data, np.uint8),
+                    np.asarray(ref_seq_off, np.int64),
+                    row_offsets,
+                    shape,
+                )
+        else:
+            final_fields[k] = v
 
-        tok, off = compute_flank_tokens(
-            haps.reference,
-            v_contigs,
-            starts_v,
-            ilens_v,
-            L,
-            haps.token_lut,
-            row_offsets,
+    flat = _FlatVariants(final_fields)
+
+    if "flank_tokens" in bufs:
+        tok, off = bufs["flank_tokens"]
+        flat.flank_tokens = _Flat.from_offsets(
+            tok, (b, eff_ploidy, None, 2 * L), np.asarray(off, np.int64)
         )
-        flat.flank_tokens = _Flat.from_offsets(tok, (b, eff_ploidy, None, 2 * L), off)
 
     # dummy-variant empty-group fill (scalars, alleles, and flank_tokens).
     if haps.dummy_variant is not None:
         flat = flat.fill_empty_groups(haps.dummy_variant, unk=haps.unknown_token)
 
     return flat
+
+
+def _gather_v_idxs_ss_numba(geno_offset_idx, geno_starts, geno_stops, geno_v_idxs):
+    """Gather variant-index rows using starts/stops 2D form.
+
+    Pure Python fallback (no numba). Name retained for test backward-compatibility.
+    Returns (v_idxs, offsets) where offsets has shape (n_rows+1,).
+    """
+    n_rows = geno_offset_idx.shape[0]
+    out_offsets = np.empty(n_rows + 1, np.int64)
+    out_offsets[0] = 0
+    for i in range(n_rows):
+        goi = int(geno_offset_idx[i])
+        out_offsets[i + 1] = out_offsets[i] + (
+            int(geno_stops[goi]) - int(geno_starts[goi])
+        )
+    total = int(out_offsets[n_rows])
+    out_data = np.empty(total, geno_v_idxs.dtype)
+    dst = 0
+    for i in range(n_rows):
+        goi = int(geno_offset_idx[i])
+        s = int(geno_starts[goi])
+        e = int(geno_stops[goi])
+        out_data[dst : dst + (e - s)] = geno_v_idxs[s:e]
+        dst += e - s
+    return out_data, out_offsets
diff --git a/python/genvarloader/_dataset/_genotypes.py b/python/genvarloader/_dataset/_genotypes.py
index 02fcba8d..5ef58364 100644
--- a/python/genvarloader/_dataset/_genotypes.py
+++ b/python/genvarloader/_dataset/_genotypes.py
@@ -1,10 +1,24 @@
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 from seqpro.rag import OFFSET_TYPE
 
+from ..genvarloader import choose_exonic_variants as _choose_exonic_variants_rust
+from ..genvarloader import get_diffs_sparse as _get_diffs_sparse_rust
+from ..genvarloader import (
+    reconstruct_haplotypes_from_sparse as _reconstruct_haplotypes_from_sparse_rust,
+)
+from .._threads import should_parallelize
+
+
+def _as_starts_stops(offsets: NDArray[np.integer]) -> NDArray[np.int64]:
+    """Normalize 1-D (n+1,) or 2-D (2, n) offsets to a contiguous (2, n) int64
+    starts/stops array. Both backends consume this single form."""
+    o = np.asarray(offsets)
+    if o.ndim == 1:
+        return np.ascontiguousarray(np.stack([o[:-1], o[1:]]), dtype=np.int64)
+    return np.ascontiguousarray(o, dtype=np.int64)
+
 
-@nb.njit(parallel=True, nogil=True, cache=True)
 def get_diffs_sparse(
     geno_offset_idx: NDArray[np.integer],
     geno_v_idxs: NDArray[np.integer],
@@ -15,101 +29,26 @@ def get_diffs_sparse(
     q_starts: NDArray[np.integer] | None = None,
     q_ends: NDArray[np.integer] | None = None,
     v_starts: NDArray[np.integer] | None = None,
-):
-    """Get difference in length wrt reference genome for given genotypes.
-
-    If starts, ends, & positions are given, they take priority over keep and keep_offsets.
-
-    Parameters
-    ----------
-    geno_offset_idx : NDArray[np.intp]
-        Shape = (n_regions, ploidy) Indices for each region into offsets.
-    geno_v_idxs : NDArray[np.int32]
-        Shape = (variants*samples*ploidy) Sparse genotypes i.e. variant indices for ALT genotypes.
-    geno_offsets : NDArray[np.int32]
-        Shape = (regions*samples*ploidy + 1) Offsets into sparse genotypes.
-    ilens : NDArray[np.int32]
-        Shape = (total_variants) Size of all unique variants.
-    keep : Optional[NDArray[np.bool_]]
-        Shape = (variants*samples*ploidy) Keep mask for genotypes.
-    keep_offsets : Optional[NDArray[np.int64]]
-        Shape = (regions*samples*ploidy + 1) Offsets into keep.
-    q_starts : Optional[NDArray[np.int32]]
-        Shape = (regions) Start of query regions.
-    q_ends : Optional[NDArray[np.int32]]
-        Shape = (regions) End of query regions.
-    v_starts : Optional[NDArray[np.int32]]
-        Shape = (total_variants) Positions of unique variants.
-    """
-    n_queries, ploidy = geno_offset_idx.shape
-    diffs = np.empty((n_queries, ploidy), np.int32)
-    for query in nb.prange(n_queries):
-        for hap in nb.prange(ploidy):
-            o_idx = geno_offset_idx[query, hap]
-            if geno_offsets.ndim == 1:
-                o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
-            else:
-                o_s, o_e = geno_offsets[:, o_idx]
-            n_variants = o_e - o_s
-            if n_variants == 0:
-                diffs[query, hap] = 0
-            elif q_starts is not None and q_ends is not None and v_starts is not None:
-                diffs[query, hap] = 0
-                ref_idx = q_starts[query]
-                for v in range(o_s, o_e):
-                    if keep is not None and keep_offsets is not None:
-                        k_s = keep_offsets[query * ploidy + hap]
-                        v_keep = keep[k_s + (v - o_s)]
-                        if not v_keep:
-                            continue
-
-                    v_idx: int = geno_v_idxs[v]
-                    v_start = v_starts[v_idx]
-                    v_ilen = ilens[v_idx]
-                    # +1 assumes atomized variants
-                    v_end = v_start - min(0, v_ilen) + 1
-
-                    if v_end <= q_starts[query]:
-                        # variant doesn't span region
-                        continue
-
-                    if v_start >= q_ends[query]:
-                        # variants are sorted by position so this variant and everything
-                        # after will be outside the region
-                        break
+) -> NDArray[np.int32]:
+    """Per-(query, hap) reference-length diffs; dispatches to Rust."""
+    goi = np.ascontiguousarray(geno_offset_idx, np.int64)
+    # output is (n_queries, ploidy) int32 — each cell is 4 bytes
+    total_out_bytes = int(goi.shape[0]) * int(goi.shape[1]) * 4
+    parallel = should_parallelize(total_out_bytes)
+    return _get_diffs_sparse_rust(
+        goi,
+        np.ascontiguousarray(geno_v_idxs, np.int32),
+        _as_starts_stops(geno_offsets),
+        np.ascontiguousarray(ilens, np.int32),
+        None if keep is None else np.ascontiguousarray(keep, np.bool_),
+        None if keep_offsets is None else np.ascontiguousarray(keep_offsets, np.int64),
+        None if q_starts is None else np.ascontiguousarray(q_starts, np.int32),
+        None if q_ends is None else np.ascontiguousarray(q_ends, np.int32),
+        None if v_starts is None else np.ascontiguousarray(v_starts, np.int32),
+        parallel,
+    )
 
-                    # skip overlapping variants within the region (mirrors reconstruction logic)
-                    if v_start >= q_starts[query] and v_start < ref_idx:
-                        continue
 
-                    # advance ref_idx to end of this variant
-                    ref_idx = max(ref_idx, v_end)
-
-                    # deletion may start before region
-                    #     0 1 2 3 4 5 6
-                    # DEL s - - r e - - : +max(0, 3 - 0) -> -3 + 3 = 0
-                    # DEL r - s - e - - : +max(0, 0 - 2) -> -1 + 0 = -1
-                    # where r is region start, s is variant start, e is variant end (exclusive)
-                    # count the "-" to get ilen
-                    # but also atomic deletions include 1 bp of ref so add it back (- 1)
-                    if v_ilen < 0:
-                        v_ilen += max(0, q_starts[query] - v_start - 1)
-                    # deletion may end after region
-                    v_ilen += max(0, v_end - q_ends[query])
-
-                    diffs[query, hap] += v_ilen
-            elif keep is not None and keep_offsets is not None:
-                v_idxs = geno_v_idxs[o_s:o_e]
-                k_idx = query * ploidy + hap
-                qh_keep = keep[keep_offsets[k_idx] : keep_offsets[k_idx + 1]]
-                v_idxs = v_idxs[qh_keep]
-                diffs[query, hap] = ilens[v_idxs].sum()
-            else:
-                diffs[query, hap] = ilens[geno_v_idxs[o_s:o_e]].sum()
-    return diffs
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
 def reconstruct_haplotypes_from_sparse(
     out: NDArray[np.uint8],
     out_offsets: NDArray[np.integer],
@@ -130,165 +69,85 @@ def reconstruct_haplotypes_from_sparse(
     annot_v_idxs: NDArray[np.integer] | None = None,
     annot_ref_pos: NDArray[np.integer] | None = None,
 ):
-    """Reconstruct haplotypes from reference sequence and variants.
-
-    Batched parallel driver: dispatches to :func:`reconstruct_haplotype_from_sparse`
-    (singular) for each ``(query, hap)`` pair.
+    """Reconstruct haplotypes from reference sequence and variants (dispatch wrapper).
 
-    Parameters
-    ----------
-    out : NDArray[np.uint8]
-        Ragged array of shape = (batch, ploidy, ~length) to write haplotypes into.
-    out_offsets : NDArray[np.int64]
-        Shape = (batch*ploidy + 1) Offsets into out.
-    regions : NDArray[np.int32]
-        Shape = (batch, 3) Regions to reconstruct haplotypes.
-    shifts : NDArray[np.uint32]
-        Shape = (batch, ploidy) Shifts for each region.
-    geno_offset_idx: NDArray[np.intp]
-        Shape = (batch, ploidy) Indices for each region into offsets.
-    geno_offsets : NDArray[np.uint32]
-        Shape = (batch*ploidy + 1) Offsets into genos.
-    geno_v_idxs : NDArray[np.int32]
-        Shape = (total_variants) Sparse genotypes of variants i.e. variant indices for ALT genotypes.
-    v_starts : NDArray[np.int32]
-        Shape = (unique_variants) Positions of variants.
-    ilens : NDArray[np.int32]
-        Shape = (unique_variants) Sizes of variants.
-    alt_alleles : NDArray[np.uint8]
-        Shape = (total_alt_length) ALT alleles.
-    alt_offsets : NDArray[np.uintp]
-        Shape = (unique_variants + 1) Offsets of ALT alleles.
-    ref : NDArray[np.uint8]
-        Shape = (ref_length) Reference sequence.
-    ref_offsets : NDArray[np.uint64]
-        Shape = (n_contigs) Offsets of reference sequences.
-    pad_char : int
-        Padding character.
-    keep : NDArray[np.bool_] | None
-        Shape = (variants) Keep mask for genotypes.
-    keep_offsets : NDArray[np.int64] | None
-        Shape = (batch*ploidy + 1) Offsets into keep.
-    annot_v_idxs : NDArray[np.int32] | None
-        Ragged buffer for shape (batch, ploidy, ~length). Variant indices for annotations.
-    annot_ref_pos : NDArray[np.int32] | None
-        Ragged buffer for shape (batch, ploidy, ~length). Reference positions for annotations.
+    Dispatches to the Rust backend. Normalizes array dtypes and layouts before dispatch.
     """
-    batch_size, ploidy = geno_offset_idx.shape
-    for query in nb.prange(batch_size):
-        q = regions[query]
-        c_idx: int = q[0]
-        c_s = ref_offsets[c_idx]
-        c_e = ref_offsets[c_idx + 1]
-        ref_start: int = q[1]
-        _reference = ref[c_s:c_e]
-
-        for hap in nb.prange(ploidy):
-            # index for full sparse genos
-            o_idx = geno_offset_idx[query, hap]
-            if geno_offsets.ndim == 1:
-                o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
-            else:
-                o_s, o_e = geno_offsets[:, o_idx]
-            qh_v_idxs = geno_v_idxs[o_s:o_e]
-
-            # local index for subset of variants that are implied by offset_idxs
-            k_idx = query * ploidy + hap
-            if keep is not None and keep_offsets is not None:
-                qh_keep = keep[keep_offsets[k_idx] : keep_offsets[k_idx + 1]]
-            else:
-                qh_keep = None
-
-            # aligned to out sequence
-            out_s, out_e = out_offsets[k_idx], out_offsets[k_idx + 1]
-            qh_out = out[out_s:out_e]
-            qh_shift = shifts[query, hap]
+    total_out_bytes = int(np.asarray(out_offsets)[-1])
+    parallel = should_parallelize(total_out_bytes)
+    _reconstruct_haplotypes_from_sparse_rust(
+        out,
+        np.ascontiguousarray(out_offsets, np.int64),
+        np.ascontiguousarray(regions, np.int32),
+        np.ascontiguousarray(shifts, np.int32),
+        np.ascontiguousarray(geno_offset_idx, np.int64),
+        _as_starts_stops(geno_offsets),
+        np.ascontiguousarray(geno_v_idxs, np.int32),
+        np.ascontiguousarray(v_starts, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+        np.ascontiguousarray(alt_alleles, np.uint8),
+        np.ascontiguousarray(alt_offsets, np.int64),
+        np.ascontiguousarray(ref, np.uint8),
+        np.ascontiguousarray(ref_offsets, np.int64),
+        np.uint8(pad_char),
+        None if keep is None else np.ascontiguousarray(keep, np.bool_),
+        None if keep_offsets is None else np.ascontiguousarray(keep_offsets, np.int64),
+        annot_v_idxs,
+        annot_ref_pos,
+        parallel,
+    )
 
-            qh_annot_v_idxs = (
-                annot_v_idxs[out_s:out_e] if annot_v_idxs is not None else None
-            )
-            qh_annot_ref_pos = (
-                annot_ref_pos[out_s:out_e] if annot_ref_pos is not None else None
-            )
 
-            reconstruct_haplotype_from_sparse(
-                v_idxs=qh_v_idxs,
-                v_starts=v_starts,
-                ilens=ilens,
-                shift=qh_shift,
-                alt_alleles=alt_alleles,
-                alt_offsets=alt_offsets,
-                ref=_reference,
-                ref_start=ref_start,
-                out=qh_out,
-                pad_char=pad_char,
-                keep=qh_keep,
-                annot_v_idxs=qh_annot_v_idxs,
-                annot_ref_pos=qh_annot_ref_pos,
-            )
+def choose_exonic_variants(
+    starts: NDArray[np.integer],
+    ends: NDArray[np.integer],
+    geno_offset_idx: NDArray[np.integer],
+    geno_v_idxs: NDArray[np.integer],
+    geno_offsets: NDArray[np.integer],
+    v_starts: NDArray[np.integer],
+    ilens: NDArray[np.integer],
+) -> tuple[NDArray[np.bool_], NDArray[OFFSET_TYPE]]:
+    """Exonic keep-mask; dispatches to Rust. keep_offsets dtype == OFFSET_TYPE."""
+    keep, keep_offsets = _choose_exonic_variants_rust(
+        np.ascontiguousarray(starts, np.int32),
+        np.ascontiguousarray(ends, np.int32),
+        np.ascontiguousarray(geno_offset_idx, np.int64),
+        np.ascontiguousarray(geno_v_idxs, np.int32),
+        _as_starts_stops(geno_offsets),
+        np.ascontiguousarray(v_starts, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+    )
+    return keep, keep_offsets.astype(OFFSET_TYPE, copy=False)
 
 
-@nb.njit(nogil=True, cache=True)
 def reconstruct_haplotype_from_sparse(
-    v_idxs: NDArray[np.integer],
-    v_starts: NDArray[np.integer],
-    ilens: NDArray[np.integer],
+    v_idxs,
+    v_starts,
+    ilens,
     shift: int,
-    alt_alleles: NDArray[np.uint8],  # full set
-    alt_offsets: NDArray[np.integer],  # full set
-    ref: NDArray[np.uint8],  # full contig
-    ref_start: int,  # may be negative
-    out: NDArray[np.uint8],
+    alt_alleles,
+    alt_offsets,
+    ref,
+    ref_start: int,
+    out,
     pad_char: int,
-    keep: NDArray[np.bool_] | None = None,
-    annot_v_idxs: NDArray[np.integer] | None = None,
-    annot_ref_pos: NDArray[np.integer] | None = None,
+    keep=None,
+    annot_v_idxs=None,
+    annot_ref_pos=None,
 ):
     """Reconstruct a single haplotype from reference sequence and variants.
 
-    Single-haplotype inner kernel. Use :func:`reconstruct_haplotypes_from_sparse`
-    (plural) to reconstruct a batch in parallel.
-
-    Parameters
-    ----------
-    v_idxs : NDArray[np.integer]
-        Shape = (variants) Index of alt variants.
-    v_starts : NDArray[np.int32]
-        Shape = Offsets into variant indices.
-    ilens : NDArray[np.int32]
-        Shape = (total_variants) Positions of variants.
-    shift : int
-        Total amount to shift by.
-    alt_alleles : NDArray[np.uint8]
-        Shape = (total_alt_length) ALT alleles.
-    alt_offsets : NDArray[np.uintp]
-        Shape = (total_variants + 1) Offsets of ALT alleles.
-    ref : NDArray[np.uint8]
-        Shape = (ref_length) Reference sequence for the whole contig. ref_length >= out_length
-    ref_start : int
-        Start position of reference sequence, may be negative.
-    out : NDArray[np.uint8]
-        Shape = (out_length) Output array.
-    pad_char : int
-        Padding character.
-    keep: Optional[NDArray[np.bool_]]
-        Shape = (variants) Keep mask for genotypes.
-    annot_v_idxs: Optional[NDArray[np.int32]]
-        Shape = (out_length) Variant indices for annotations.
-    annot_ref_pos: Optional[NDArray[np.int32]]
-        Shape = (out_length) Reference positions for annotations
+    Pure Python fallback (no numba). Used directly by parity/unit tests.
+    Use :func:`reconstruct_haplotypes_from_sparse` (plural) to reconstruct a batch.
     """
+    import numpy as np
+
     length = len(out)
     n_variants = len(v_idxs)
-
-    # where to get next reference subsequence
     ref_idx = ref_start
-    # where to put next subsequence
     out_idx = 0
-    # how much we've shifted
     shifted = 0
 
-    # if ref_idx is negative, we need to pad the beginning of the haplotype
     if ref_idx < 0:
         pad_len = -ref_idx
         shifted = min(shift, pad_len)
@@ -305,66 +164,39 @@ def reconstruct_haplotype_from_sparse(
         if keep is not None and not keep[v]:
             continue
 
-        variant: int = v_idxs[v]
-        v_pos = v_starts[variant]
-        v_diff = ilens[variant]
-        allele = alt_alleles[alt_offsets[variant] : alt_offsets[variant + 1]]
+        variant = int(v_idxs[v])
+        v_pos = int(v_starts[variant])
+        v_diff = int(ilens[variant])
+        allele = alt_alleles[int(alt_offsets[variant]) : int(alt_offsets[variant + 1])]
         v_len = len(allele)
-        # +1 assumes atomized variants, exactly 1 nt shared between REF and ALT
         v_ref_end = v_pos - min(0, v_diff) + 1
 
-        # if variant is a DEL spanning start of query
         if v_pos < ref_start and v_diff < 0 and v_ref_end >= ref_start:
             ref_idx = v_ref_end
             continue
 
-        # overlapping variants
-        # v_rel_pos < ref_idx only if we see an ALT at a given position a second
-        # time or more. We'll do what bcftools consensus does and only use the
-        # first ALT variant we find.
         if v_pos < ref_idx:
             continue
 
-        # handle shift
         if shifted < shift:
             ref_shift_dist = v_pos - ref_idx
-            # not enough distance to finish the shift even with the variant
             if shifted + ref_shift_dist + v_len < shift:
-                # skip the variant
                 continue
-            # enough distance between ref_idx and start of variant to finish shift
             elif shifted + ref_shift_dist >= shift:
                 ref_idx += shift - shifted
                 shifted = shift
-                # can still use the variant and whatever ref is left between
-                # ref_idx and the variant
-            # ref + all or some of variant is enough to finish shift
             else:
-                # how much left to shift - amount of ref we can use
                 allele_start_idx = shift - shifted - ref_shift_dist
                 shifted = shift
-                #! without if statement, parallel=True can cause a SystemError!
-                # * parallel jit cannot handle changes in array dimension.
-                # * without this, allele can change from a 1D array to a 0D
-                # * array.
-                # enough dist with variant to complete shift
                 if allele_start_idx == v_len:
-                    # move ref to end of variant
                     ref_idx = v_ref_end
-                    # skip the variant
                     continue
-                # consume ref up to beginning of variant
-                # ref_idx will be moved to end of variant after using the variant
                 ref_idx = v_pos
-                # adjust variant to start at allele_start_idx
                 allele = allele[allele_start_idx:]
                 v_len = len(allele)
 
-        # add reference sequence
         ref_len = v_pos - ref_idx
         if out_idx + ref_len >= length:
-            # ref will get written by final clause
-            # handles case where extraneous variants downstream of the haplotype were provided
             break
         out[out_idx : out_idx + ref_len] = ref[ref_idx : ref_idx + ref_len]
         if annot_v_idxs is not None:
@@ -375,7 +207,6 @@ def reconstruct_haplotype_from_sparse(
             )
         out_idx += ref_len
 
-        # apply variant
         writable_length = min(v_len, length - out_idx)
         out[out_idx : out_idx + writable_length] = allele[:writable_length]
         if annot_v_idxs is not None:
@@ -384,23 +215,19 @@ def reconstruct_haplotype_from_sparse(
             annot_ref_pos[out_idx : out_idx + writable_length] = v_pos
         out_idx += writable_length
 
-        # advance ref_idx to end of variant
         ref_idx = v_ref_end
 
         if out_idx >= length:
             break
 
     if shifted < shift:
-        # need to shift the rest of the track
         ref_idx += shift - shifted
         ref_idx = min(ref_idx, len(ref))
         shifted = shift
 
-    # fill rest with reference sequence and right-pad with Ns
     unfilled_length = length - out_idx
     if unfilled_length > 0:
-        # fill with reference sequence
-        writable_ref = min(unfilled_length, len(ref) - ref_idx)
+        writable_ref = max(0, min(unfilled_length, len(ref) - ref_idx))
         out_end_idx = out_idx + writable_ref
         ref_end_idx = ref_idx + writable_ref
         out[out_idx:out_end_idx] = ref[ref_idx:ref_end_idx]
@@ -409,172 +236,11 @@ def reconstruct_haplotype_from_sparse(
         if annot_ref_pos is not None:
             annot_ref_pos[out_idx:out_end_idx] = np.arange(ref_idx, ref_end_idx)
 
-        # right-pad
         if out_end_idx < length:
             out[out_end_idx:] = pad_char
             if annot_v_idxs is not None:
                 annot_v_idxs[out_end_idx:] = -1
             if annot_ref_pos is not None:
-                annot_ref_pos[out_end_idx:] = np.iinfo(np.int32).max
+                import numpy as np
 
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def choose_exonic_variants(
-    starts: NDArray[np.integer],
-    ends: NDArray[np.integer],
-    geno_offset_idx: NDArray[np.integer],
-    geno_v_idxs: NDArray[np.integer],
-    geno_offsets: NDArray[np.integer],
-    v_starts: NDArray[np.integer],
-    ilens: NDArray[np.integer],
-) -> tuple[NDArray[np.bool_], NDArray[OFFSET_TYPE]]:
-    """Mark variants to keep for each haplotype.
-
-    Parameters
-    ----------
-    starts : NDArray[np.int32]
-        Shape = (n_regions) Start positions for each region.
-    ends : NDArray[np.int32]
-        Shape = (n_regions) Ends for each region.
-    geno_offset_idx : NDArray[np.intp]
-        Shape = (n_regions, ploidy) Indices for each region into offsets.
-    offsets : NDArray[np.int64]
-        Shape = (total_variants + 1) Offsets into sparse genotypes.
-    sparse_genos : NDArray[np.int32]
-        Shape = (total_variants) Sparse genotypes i.e. variant indices for ALT genotypes.
-    positions : NDArray[np.int32]
-        Shape = (total_variants) Positions of variants.
-    sizes : NDArray[np.int32]
-        Shape = (total_variants) Sizes of variants.
-    deterministic : bool
-        Whether to deterministically assign variants to groups
-    """
-    n_regions, ploidy = geno_offset_idx.shape
-
-    lengths = np.empty((n_regions, ploidy), np.int64)
-    for query in nb.prange(n_regions):
-        for hap in range(ploidy):
-            o_idx = geno_offset_idx[query, hap]
-            if geno_offsets.ndim == 1:
-                o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
-            else:
-                o_s, o_e = geno_offsets[:, o_idx]
-            lengths[query, hap] = o_e - o_s
-    keep_offsets = np.empty(n_regions * ploidy + 1, OFFSET_TYPE)
-    keep_offsets[0] = 0
-    keep_offsets[1:] = lengths.cumsum()
-
-    n_variants = keep_offsets[-1]
-    keep = np.empty(n_variants, np.bool_)
-
-    for query in nb.prange(n_regions):
-        ref_start: int = starts[query]
-        ref_end: int = ends[query]
-        for hap in nb.prange(ploidy):
-            o_idx = geno_offset_idx[query, hap]
-            # Mirror filter_af's (2, n_slices) indexing (sibling kernel below).
-            if geno_offsets.ndim == 1:
-                o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
-            else:
-                o_s, o_e = geno_offsets[:, o_idx]
-            qh_genos = geno_v_idxs[o_s:o_e]
-
-            k_idx = query * ploidy + hap
-            k_s, k_e = keep_offsets[k_idx], keep_offsets[k_idx + 1]
-            qh_keep = keep[k_s:k_e]
-
-            _choose_exonic_variants(
-                query_start=ref_start,
-                query_end=ref_end,
-                variant_idxs=qh_genos,
-                positions=v_starts,
-                sizes=ilens,
-                keep=qh_keep,
-            )
-
-    return keep, keep_offsets
-
-
-@nb.njit(nogil=True, cache=True)
-def _choose_exonic_variants(
-    query_start: int,
-    query_end: int,
-    variant_idxs: NDArray[np.integer],  # (v)
-    positions: NDArray[np.integer],  # (total variants)
-    sizes: NDArray[np.integer],  # (total variants)
-    keep: NDArray[np.bool_],  # (v)
-):
-    """Create a mask for variants that are fully contained within the query interval, which is
-    assumed to correspond to the exon boundaries."""
-    # no variants
-    if len(variant_idxs) == 0:
-        return
-
-    for v in range(len(variant_idxs)):
-        v_idx: int = variant_idxs[v]
-        v_pos = positions[v_idx]
-        # +1 for atomized
-        v_ref_end = v_pos - min(0, sizes[v_idx]) + 1
-
-        if v_pos >= query_start and v_ref_end <= query_end:
-            keep[v] = True
-        else:
-            keep[v] = False
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def filter_af(
-    geno_offset_idx: NDArray[np.integer],
-    geno_offsets: NDArray[np.integer],
-    geno_v_idxs: NDArray[np.integer],
-    afs: NDArray[np.number],
-    min_af: float | None,
-    max_af: float | None,
-) -> tuple[NDArray[np.bool_], NDArray[OFFSET_TYPE]]:
-    """Filter variants based on allele frequency, marking them to keep or not."""
-
-    batch_size, ploidy = geno_offset_idx.shape
-
-    if geno_offsets.ndim == 1:
-        keep_offsets = geno_offsets.astype(OFFSET_TYPE)
-        n_variants = geno_offsets[-1]
-    else:
-        # (2, n_slices)
-        n_vars_per_slice = geno_offsets[1] - geno_offsets[0]
-        n_slices = len(n_vars_per_slice)
-        keep_offsets = np.empty(n_slices + 1, OFFSET_TYPE)
-        keep_offsets[0] = 0
-        acc = OFFSET_TYPE(0)
-        for i in range(n_slices):
-            acc += n_vars_per_slice[i]
-            keep_offsets[i + 1] = acc
-        n_variants = n_vars_per_slice.sum()
-
-    keep = np.full(n_variants, True, np.bool_)
-
-    if min_af is None and max_af is None:
-        return keep, keep_offsets
-
-    for query in nb.prange(batch_size):
-        for hap in range(ploidy):
-            # index for full sparse genos
-            o_idx = geno_offset_idx[query, hap]
-            if geno_offsets.ndim == 1:
-                o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
-            else:
-                o_s, o_e = geno_offsets[:, o_idx]
-
-            k_idx = query * ploidy + hap
-            k_s, k_e = keep_offsets[k_idx], keep_offsets[k_idx + 1]
-
-            for v, k in zip(range(o_s, o_e), range(k_s, k_e)):
-                v_idx = geno_v_idxs[v]
-                v_af = afs[v_idx]
-
-                if min_af is not None:
-                    keep[k] &= v_af >= min_af
-
-                if max_af is not None:
-                    keep[k] &= v_af <= max_af
-
-    return keep, keep_offsets
+                annot_ref_pos[out_end_idx:] = np.iinfo(np.int32).max
diff --git a/python/genvarloader/_dataset/_haps.py b/python/genvarloader/_dataset/_haps.py
index a7f29a3e..8d746260 100644
--- a/python/genvarloader/_dataset/_haps.py
+++ b/python/genvarloader/_dataset/_haps.py
@@ -35,11 +35,19 @@
 from ._flat_variants import _FlatVariantWindows, VarWindowOpt
 from .._utils import lengths_to_offsets
 from .._variants._records import RaggedAlleles
+from ..genvarloader import (
+    reconstruct_annotated_haplotypes_fused as reconstruct_annotated_haplotypes_fused,
+    reconstruct_annotated_haplotypes_spliced_fused as reconstruct_annotated_haplotypes_spliced_fused,
+    reconstruct_haplotypes_fused as reconstruct_haplotypes_fused,
+    reconstruct_haplotypes_spliced_fused as reconstruct_haplotypes_spliced_fused,
+)
 from ._genotypes import (
+    _as_starts_stops,
     choose_exonic_variants,
     get_diffs_sparse,
-    reconstruct_haplotypes_from_sparse,
 )
+from .._threads import should_parallelize
+from ._utils import _ffi_array
 from ._protocol import Reconstructor
 from ._rag_variants import RaggedVariants
 from ._reference import Reference
@@ -228,6 +236,20 @@ def _svar_format_fields(svar_dir: Path) -> dict[str, np.dtype]:
     return {name: np.dtype(dt) for name, dt in fields.items()}
 
 
+@dataclass(slots=True)
+class _HapsFfiStatic:
+    """FFI-ready, contiguous, correctly-typed sub-linear arrays consumed by the
+    fused kernels. Grows only with the variant/reference count (sub-linear in
+    samples), so it is cached for the lifetime of the Haps reconstructor."""
+
+    v_starts: NDArray[np.int32]
+    ilens: NDArray[np.int32]
+    alt_alleles: NDArray[np.uint8]
+    alt_offsets: NDArray[np.int64]
+    ref: "NDArray[np.uint8] | None"
+    ref_offsets: "NDArray[np.int64] | None"
+
+
 @dataclass(slots=True)
 class Haps(Reconstructor[_H]):
     path: Path
@@ -253,6 +275,7 @@ class Haps(Reconstructor[_H]):
     memmapped on the genotype offsets. Parallel to ``dosages``. See issue #231."""
     dummy_variant: "DummyVariant | None" = None
     available_var_fields: list[str] = field(init=False)
+    _ffi_static: "_HapsFfiStatic | None" = field(default=None, init=False)
     flank_length: int | None = None
     """Number of reference flank bases on each side for flank/window tokenization. ``0``/``None`` disables."""
     token_lut: NDArray | None = None
@@ -301,6 +324,27 @@ def __post_init__(self):
                 + "Doing this automatically is not yet supported."
             )
 
+    @property
+    def ffi_static(self) -> _HapsFfiStatic:
+        """Lazily-computed, cached FFI-ready sub-linear arrays (see _HapsFfiStatic)."""
+        if self._ffi_static is None:
+            ref = self.reference
+            self._ffi_static = _HapsFfiStatic(
+                v_starts=np.ascontiguousarray(self.variants.start, np.int32),
+                ilens=np.ascontiguousarray(self.variants.ilen, np.int32),
+                alt_alleles=np.ascontiguousarray(
+                    self.variants.alt.data.view(np.uint8), np.uint8
+                ),
+                alt_offsets=np.ascontiguousarray(self.variants.alt.offsets, np.int64),
+                ref=None
+                if ref is None
+                else np.ascontiguousarray(ref.reference, np.uint8),
+                ref_offsets=None
+                if ref is None
+                else np.ascontiguousarray(ref.offsets, np.int64),
+            )
+        return self._ffi_static
+
     def _has_dosage_file_on_disk(self) -> bool:
         """True iff the linked SVAR contains a dosages.npy.
 
@@ -539,6 +583,7 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> _H:
         if issubclass(self.kind, (RaggedVariants, _FlatVariantWindows)):
             if splice_plan is not None:
@@ -567,6 +612,7 @@ def __call__(
                 rng=rng,
                 deterministic=deterministic,
                 splice_plan=splice_plan,
+                to_rc=to_rc,
             )
             return haps
 
@@ -578,6 +624,7 @@ def get_haps_and_shifts(
         rng: np.random.Generator,
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> tuple[
         _H,
         NDArray[np.intp],
@@ -598,9 +645,11 @@ def get_haps_and_shifts(
 
         # (b p l), (b p l), (b p l)
         if issubclass(self.kind, RaggedSeqs):
-            out = self._reconstruct_haplotypes(req)
+            out = self._reconstruct_haplotypes(req, to_rc=to_rc)
         elif issubclass(self.kind, RaggedAnnotatedHaps):
-            haps, annot_v_idx, annot_pos = self._reconstruct_annotated_haplotypes(req)
+            haps, annot_v_idx, annot_pos = self._reconstruct_annotated_haplotypes(
+                req, to_rc=to_rc
+            )
             out = _FlatAnnotatedHaps(haps, annot_v_idx, annot_pos)
         elif issubclass(self.kind, RaggedVariants):
             if splice_plan is not None:
@@ -757,33 +806,61 @@ def _allele_bytes_sum(
         csum = np.concatenate([[0], np.cumsum(v_lens, dtype=np.int64)])
         return csum[group_offsets[1:]] - csum[group_offsets[:-1]]
 
-    def _reconstruct_haplotypes(self, req: ReconstructionRequest) -> Ragged[np.bytes_]:
+    def _reconstruct_haplotypes(
+        self,
+        req: ReconstructionRequest,
+        to_rc: "NDArray[np.bool_] | None" = None,
+    ) -> Ragged[np.bytes_]:
         """Reconstruct haplotype byte sequences from sparse genotypes."""
         assert self.reference is not None
 
         if req.splice_plan is None:
-            out_data = np.empty(req.out_offsets[-1], np.uint8)
-            out_offsets = np.asarray(req.out_offsets, np.int64)
             shape = (*req.shifts.shape, None)
-            reconstruct_haplotypes_from_sparse(
-                geno_offset_idx=req.geno_offset_idx,
-                out=out_data,
-                out_offsets=out_offsets,
-                regions=req.regions,
-                shifts=req.shifts,
-                geno_offsets=self.genotypes.offsets,
-                geno_v_idxs=self.genotypes.data,
-                v_starts=self.variants.start,
-                ilens=self.variants.ilen,
-                alt_alleles=self.variants.alt.data.view(np.uint8),
-                alt_offsets=self.variants.alt.offsets,
-                ref=self.reference.reference,
-                ref_offsets=self.reference.offsets,
-                pad_char=self.reference.pad_char,
-                keep=req.keep,
-                keep_offsets=req.keep_offsets,
-                annot_v_idxs=None,
-                annot_ref_pos=None,
+            # --- fused path (Rust): one FFI crossing, no Python-side np.empty ---
+            # Detect ragged vs fixed-length output from req.out_offsets.
+            # Ragged: out_lengths == hap_lengths (per-hap variable length).
+            # Fixed:  out_lengths is all the same constant value.
+            _out_per = (req.out_offsets[1:] - req.out_offsets[:-1]).reshape(
+                req.shifts.shape
+            )
+            if np.array_equal(
+                _out_per.astype(np.int64), req.hap_lengths.astype(np.int64)
+            ):
+                _fused_output_length = np.int64(-1)  # ragged mode
+            else:
+                _fused_output_length = np.int64(
+                    int(req.out_offsets[1] - req.out_offsets[0])
+                )
+            # Expand per-query to_rc → per-(query, hap) for the fused kernel.
+            # req.shifts.shape == (b, ploidy); np.repeat broadcasts (b,) → (b*p,).
+            _ploidy = req.shifts.shape[1] if req.shifts.ndim > 1 else 1
+            _to_rc_hap = (
+                None
+                if to_rc is None
+                else np.ascontiguousarray(np.repeat(to_rc, _ploidy), np.bool_)
+            )
+            out_data, out_offsets = reconstruct_haplotypes_fused(
+                regions=np.ascontiguousarray(req.regions, np.int32),
+                shifts=np.ascontiguousarray(req.shifts, np.int32),
+                geno_offset_idx=np.ascontiguousarray(req.geno_offset_idx, np.int64),
+                geno_offsets=_as_starts_stops(self.genotypes.offsets),
+                geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+                v_starts=self.ffi_static.v_starts,
+                ilens=self.ffi_static.ilens,
+                alt_alleles=self.ffi_static.alt_alleles,
+                alt_offsets=self.ffi_static.alt_offsets,
+                ref_=self.ffi_static.ref,
+                ref_offsets=self.ffi_static.ref_offsets,
+                pad_char=np.uint8(self.reference.pad_char),
+                output_length=_fused_output_length,
+                keep=None
+                if req.keep is None
+                else np.ascontiguousarray(req.keep, np.bool_),
+                keep_offsets=None
+                if req.keep_offsets is None
+                else np.ascontiguousarray(req.keep_offsets, np.int64),
+                to_rc=_to_rc_hap,
+                parallel=should_parallelize(int(req.out_offsets[-1])),
             )
             return cast(
                 "Ragged[np.bytes_]",
@@ -796,31 +873,42 @@ def _reconstruct_haplotypes(self, req: ReconstructionRequest) -> Ragged[np.bytes
         )
         splice_plan = req.splice_plan
 
-        total = int(splice_plan.permuted_out_offsets[-1])
-        out_buf = np.empty(total, np.uint8)
+        per_elem_shape = (splice_plan.permuted_lengths.shape[0], None)
 
-        reconstruct_haplotypes_from_sparse(
-            geno_offset_idx=flat_geno_idx.reshape(-1, 1),
-            out=out_buf,
-            out_offsets=splice_plan.permuted_out_offsets,
-            regions=permuted_regions,
-            shifts=flat_shifts.reshape(-1, 1),
-            geno_offsets=self.genotypes.offsets,
-            geno_v_idxs=self.genotypes.data,
-            v_starts=self.variants.start,
-            ilens=self.variants.ilen,
-            alt_alleles=self.variants.alt.data.view(np.uint8),
-            alt_offsets=self.variants.alt.offsets,
-            ref=self.reference.reference,
-            ref_offsets=self.reference.offsets,
-            pad_char=self.reference.pad_char,
-            keep=keep_perm,
-            keep_offsets=keep_offsets_perm,
-            annot_v_idxs=None,
-            annot_ref_pos=None,
+        # Fused path (Rust): one FFI crossing, Python already holds out_offsets.
+        # to_rc is already in permuted per-element order (passed from
+        # _getitem_spliced as to_rc_per_elem = to_rc_flat[plan.permutation]).
+        _to_rc_spliced = (
+            None if to_rc is None else np.ascontiguousarray(to_rc, np.bool_)
+        )
+        out_buf = reconstruct_haplotypes_spliced_fused(
+            permuted_regions=np.ascontiguousarray(permuted_regions, np.int32),
+            flat_shifts=np.ascontiguousarray(flat_shifts.reshape(-1, 1), np.int32),
+            flat_geno_offset_idx=np.ascontiguousarray(
+                flat_geno_idx.reshape(-1, 1), np.int64
+            ),
+            out_offsets=np.ascontiguousarray(
+                splice_plan.permuted_out_offsets, np.int64
+            ),
+            geno_offsets=_as_starts_stops(self.genotypes.offsets),
+            geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+            v_starts=self.ffi_static.v_starts,
+            ilens=self.ffi_static.ilens,
+            alt_alleles=self.ffi_static.alt_alleles,
+            alt_offsets=self.ffi_static.alt_offsets,
+            ref_=self.ffi_static.ref,
+            ref_offsets=self.ffi_static.ref_offsets,
+            pad_char=np.uint8(self.reference.pad_char),
+            keep=None
+            if keep_perm is None
+            else np.ascontiguousarray(keep_perm, np.bool_),
+            keep_offsets=None
+            if keep_offsets_perm is None
+            else np.ascontiguousarray(keep_offsets_perm, np.int64),
+            to_rc=_to_rc_spliced,
+            parallel=should_parallelize(int(splice_plan.permuted_out_offsets[-1])),
         )
 
-        per_elem_shape = (splice_plan.permuted_lengths.shape[0], None)
         return cast(
             "Ragged[np.bytes_]",
             _Flat.from_offsets(
@@ -829,7 +917,9 @@ def _reconstruct_haplotypes(self, req: ReconstructionRequest) -> Ragged[np.bytes
         )
 
     def _reconstruct_annotated_haplotypes(
-        self, req: ReconstructionRequest
+        self,
+        req: ReconstructionRequest,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> tuple[Ragged[np.bytes_], Ragged[V_IDX_TYPE], Ragged[np.int32]]:
         """Reconstruct haplotypes plus per-nucleotide annotations.
 
@@ -840,32 +930,55 @@ def _reconstruct_annotated_haplotypes(
         assert self.reference is not None
 
         if req.splice_plan is None:
-            out_data = np.empty(req.out_offsets[-1], np.uint8)
-            annot_v_data = np.empty(req.out_offsets[-1], V_IDX_TYPE)
-            annot_pos_data = np.empty(req.out_offsets[-1], np.int32)
-            out_offsets = np.asarray(req.out_offsets, np.int64)
             shape = (*req.shifts.shape, None)
-
-            # annot offsets match haps offsets, so we share them.
-            reconstruct_haplotypes_from_sparse(
-                geno_offset_idx=req.geno_offset_idx,
-                out=out_data,
-                out_offsets=out_offsets,
-                regions=req.regions,
-                shifts=req.shifts,
-                geno_offsets=self.genotypes.offsets,
-                geno_v_idxs=self.genotypes.data,
-                v_starts=self.variants.start,
-                ilens=self.variants.ilen,
-                alt_alleles=self.variants.alt.data.view(np.uint8),
-                alt_offsets=self.variants.alt.offsets,
-                ref=self.reference.reference,
-                ref_offsets=self.reference.offsets,
-                pad_char=self.reference.pad_char,
-                keep=req.keep,
-                keep_offsets=req.keep_offsets,
-                annot_v_idxs=annot_v_data,
-                annot_ref_pos=annot_pos_data,
+            # --- fused path (Rust): one FFI crossing, no Python-side np.empty ---
+            # Detect ragged vs fixed-length output from req.out_offsets.
+            # Ragged: out_lengths == hap_lengths (per-hap variable length).
+            # Fixed:  out_lengths is all the same constant value.
+            _out_per = (req.out_offsets[1:] - req.out_offsets[:-1]).reshape(
+                req.shifts.shape
+            )
+            if np.array_equal(
+                _out_per.astype(np.int64), req.hap_lengths.astype(np.int64)
+            ):
+                _fused_output_length = np.int64(-1)  # ragged mode
+            else:
+                _fused_output_length = np.int64(
+                    int(req.out_offsets[1] - req.out_offsets[0])
+                )
+            # Expand per-query to_rc → per-(query, hap) for the fused kernel.
+            _ploidy = req.shifts.shape[1] if req.shifts.ndim > 1 else 1
+            _to_rc_hap = (
+                None
+                if to_rc is None
+                else np.ascontiguousarray(np.repeat(to_rc, _ploidy), np.bool_)
+            )
+            out_data, annot_v_data, annot_pos_data, out_offsets = (
+                reconstruct_annotated_haplotypes_fused(
+                    regions=np.ascontiguousarray(req.regions, np.int32),
+                    shifts=np.ascontiguousarray(req.shifts, np.int32),
+                    geno_offset_idx=np.ascontiguousarray(req.geno_offset_idx, np.int64),
+                    geno_offsets=_as_starts_stops(self.genotypes.offsets),
+                    geno_v_idxs=_ffi_array(
+                        self.genotypes.data, np.int32, "geno_v_idxs"
+                    ),
+                    v_starts=self.ffi_static.v_starts,
+                    ilens=self.ffi_static.ilens,
+                    alt_alleles=self.ffi_static.alt_alleles,
+                    alt_offsets=self.ffi_static.alt_offsets,
+                    ref_=self.ffi_static.ref,
+                    ref_offsets=self.ffi_static.ref_offsets,
+                    pad_char=np.uint8(self.reference.pad_char),
+                    output_length=_fused_output_length,
+                    keep=None
+                    if req.keep is None
+                    else np.ascontiguousarray(req.keep, np.bool_),
+                    keep_offsets=None
+                    if req.keep_offsets is None
+                    else np.ascontiguousarray(req.keep_offsets, np.int64),
+                    to_rc=_to_rc_hap,
+                    parallel=should_parallelize(int(req.out_offsets[-1])),
+                )
             )
             return (
                 cast(
@@ -887,35 +1000,45 @@ def _reconstruct_annotated_haplotypes(
             self._permute_request_for_splice(req)
         )
         splice_plan = req.splice_plan
+        per_elem_shape = (splice_plan.permuted_lengths.shape[0], None)
+        off = splice_plan.permuted_out_offsets
 
-        total = int(splice_plan.permuted_out_offsets[-1])
-        out_buf = np.empty(total, np.uint8)
-        annot_v_buf = np.empty(total, V_IDX_TYPE)
-        annot_pos_buf = np.empty(total, np.int32)
-
-        reconstruct_haplotypes_from_sparse(
-            geno_offset_idx=flat_geno_idx.reshape(-1, 1),
-            out=out_buf,
-            out_offsets=splice_plan.permuted_out_offsets,
-            regions=permuted_regions,
-            shifts=flat_shifts.reshape(-1, 1),
-            geno_offsets=self.genotypes.offsets,
-            geno_v_idxs=self.genotypes.data,
-            v_starts=self.variants.start,
-            ilens=self.variants.ilen,
-            alt_alleles=self.variants.alt.data.view(np.uint8),
-            alt_offsets=self.variants.alt.offsets,
-            ref=self.reference.reference,
-            ref_offsets=self.reference.offsets,
-            pad_char=self.reference.pad_char,
-            keep=keep_perm,
-            keep_offsets=keep_offsets_perm,
-            annot_v_idxs=annot_v_buf,
-            annot_ref_pos=annot_pos_buf,
+        # Fused path (Rust): one FFI crossing. RC is folded in-kernel (sequence bytes
+        # reverse-complemented, annotation rows reversed), so there is NO Python
+        # reverse_masked post-pass. to_rc is already in permuted per-element order
+        # (from _getitem_spliced), and _getitem_spliced treats the rust output as
+        # already-RC'd (its post-pass is numba-only).
+        _to_rc_spliced = (
+            None if to_rc is None else np.ascontiguousarray(to_rc, np.bool_)
+        )
+        out_buf, annot_v_buf, annot_pos_buf = (
+            reconstruct_annotated_haplotypes_spliced_fused(
+                permuted_regions=np.ascontiguousarray(permuted_regions, np.int32),
+                flat_shifts=np.ascontiguousarray(flat_shifts.reshape(-1, 1), np.int32),
+                flat_geno_offset_idx=np.ascontiguousarray(
+                    flat_geno_idx.reshape(-1, 1), np.int64
+                ),
+                out_offsets=np.ascontiguousarray(off, np.int64),
+                geno_offsets=_as_starts_stops(self.genotypes.offsets),
+                geno_v_idxs=_ffi_array(self.genotypes.data, np.int32, "geno_v_idxs"),
+                v_starts=self.ffi_static.v_starts,
+                ilens=self.ffi_static.ilens,
+                alt_alleles=self.ffi_static.alt_alleles,
+                alt_offsets=self.ffi_static.alt_offsets,
+                ref_=self.ffi_static.ref,
+                ref_offsets=self.ffi_static.ref_offsets,
+                pad_char=np.uint8(self.reference.pad_char),
+                keep=None
+                if keep_perm is None
+                else np.ascontiguousarray(keep_perm, np.bool_),
+                keep_offsets=None
+                if keep_offsets_perm is None
+                else np.ascontiguousarray(keep_offsets_perm, np.int64),
+                to_rc=_to_rc_spliced,
+                parallel=should_parallelize(int(off[-1])),
+            )
         )
 
-        per_elem_shape = (splice_plan.permuted_lengths.shape[0], None)
-        off = splice_plan.permuted_out_offsets
         haps_rag = cast(
             "Ragged[np.bytes_]",
             _Flat.from_offsets(out_buf, per_elem_shape, off).view("S1"),
diff --git a/python/genvarloader/_dataset/_intervals.py b/python/genvarloader/_dataset/_intervals.py
index cca51cf0..c51def0f 100644
--- a/python/genvarloader/_dataset/_intervals.py
+++ b/python/genvarloader/_dataset/_intervals.py
@@ -1,97 +1,13 @@
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 
-from .._dispatch import get, register
 from ..genvarloader import intervals_to_tracks as _intervals_to_tracks_rust
+from ..genvarloader import tracks_to_intervals as _tracks_to_intervals_rust
+from .._threads import should_parallelize
 
 __all__ = []
 
 
-@nb.njit(parallel=True, nogil=True, cache=True)
-def _intervals_to_tracks_numba(
-    offset_idxs: NDArray[np.integer],
-    starts: NDArray[np.int32],
-    itv_starts: NDArray[np.int32],
-    itv_ends: NDArray[np.int32],
-    itv_values: NDArray[np.float32],
-    itv_offsets: NDArray[np.int64],
-    out: NDArray[np.float32],
-    out_offsets: NDArray[np.int64],
-):
-    """Convert intervals to tracks at base-pair resolution.
-    Assumptions:
-    - intervals are sorted by start
-    - intervals do not overlap
-
-    Parameters
-    ----------
-    offset_idxs : NDArray[np.intp]
-        Shape = (batch) Indexes into offsets.
-    starts : NDArray[np.int32]
-        Shape = (batch) Starts for each query.
-    itv_starts : NDArray[np.int32]
-        Shape = (n_intervals) Starts for each interval.
-    itv_ends : NDArray[np.int32]
-        Shape = (n_intervals) Ends for each interval.
-    itv_values : NDArray[np.float32]
-        Shape = (n_intervals) Values for each interval.
-    itv_offsets : NDArray[np.uint32]
-        Shape = (n_slices + 1) Offsets into intervals and values.
-        For a GVL Dataset, n_interval_sets = n_samples * n_regions with that layout.
-    out : NDArray[np.float32]
-        Shape = (batch*length) Output tracks.
-    out_offsets : NDArray[np.int64]
-        Shape = (batch + 1) Offsets into output tracks.
-
-    Returns
-    -------
-    data : NDArray[np.float32]
-        Ragged shape = (batch*length) Values for ragged array of tracks.
-    offsets : NDArray[np.int32]
-        Shape = (batch + 1) Offsets for ragged array of tracks.
-    """
-    n_queries = len(starts)
-    out[:] = 0.0
-    for query in nb.prange(n_queries):
-        idx = offset_idxs[query]
-        itv_s, itv_e = itv_offsets[idx], itv_offsets[idx + 1]
-        n_intervals = itv_e - itv_s
-        if n_intervals == 0:
-            continue
-
-        out_s, out_e = out_offsets[query], out_offsets[query + 1]
-        length = out_e - out_s
-        _out = out[out_s:out_e]
-
-        query_start = starts[query]
-
-        # if parallelized, a data race will occur if there are any overlapping intervals
-        for interval in range(itv_s, itv_e):
-            start = itv_starts[interval] - query_start
-            end = itv_ends[interval] - query_start
-            value = itv_values[interval]
-            if start >= length:
-                #! assumes intervals are sorted by start
-                # cannot break if parallelized
-                break
-            # Clip to the query window. Intervals may start before query_start
-            # (jitter-expanded storage vs. the per-read query origin; see #242)
-            # or end past it.
-            s = max(start, 0)
-            e = min(end, length)
-            if e > s:
-                _out[s:e] = value
-
-
-register(
-    "intervals_to_tracks",
-    numba=_intervals_to_tracks_numba,
-    rust=_intervals_to_tracks_rust,
-    default="rust",
-)
-
-
 def intervals_to_tracks(
     offset_idxs: NDArray[np.integer],
     starts: NDArray[np.int32],
@@ -104,10 +20,9 @@ def intervals_to_tracks(
 ) -> None:
     """Paint base-pair-resolution tracks from intervals, writing ``out`` in place.
 
-    Dispatches to the numba or Rust backend via :mod:`genvarloader._dispatch`
-    (default ``rust``). Read-only inputs are coerced to canonical dtypes so both
-    backends receive byte-identical bytes (see tests/parity); ``out`` is passed
-    through untouched so in-place writes land in the caller's buffer.
+    Dispatches to the Rust backend. Read-only inputs are coerced to canonical dtypes so
+    the backend receives byte-identical bytes; ``out`` is passed through untouched so
+    in-place writes land in the caller's buffer.
     """
     offset_idxs = np.ascontiguousarray(offset_idxs, dtype=np.int64)
     starts = np.ascontiguousarray(starts, dtype=np.int32)
@@ -116,7 +31,9 @@ def intervals_to_tracks(
     itv_values = np.ascontiguousarray(itv_values, dtype=np.float32)
     itv_offsets = np.ascontiguousarray(itv_offsets, dtype=np.int64)
     out_offsets = np.ascontiguousarray(out_offsets, dtype=np.int64)
-    get("intervals_to_tracks")(
+    # out is f32; total output bytes used to decide parallelism threshold.
+    total_out_bytes = int(out_offsets[-1]) * 4
+    _intervals_to_tracks_rust(
         offset_idxs,
         starts,
         itv_starts,
@@ -125,10 +42,10 @@ def intervals_to_tracks(
         itv_offsets,
         out,
         out_offsets,
+        should_parallelize(total_out_bytes),
     )
 
 
-@nb.njit(parallel=True, nogil=True, cache=True)
 def tracks_to_intervals(
     regions: NDArray[np.int32],
     tracks: NDArray[np.float32],
@@ -136,88 +53,31 @@ def tracks_to_intervals(
 ) -> tuple[
     NDArray[np.int32], NDArray[np.int32], NDArray[np.float32], NDArray[np.int64]
 ]:
-    """Convert tracks to intervals. Note that this will include 0-value intervals.
+    """RLE-encode a ragged f32 track buffer into (starts, ends, values, offsets) intervals.
+
+    Includes 0-value intervals (no filtering on value == 0.0). Dispatches to the Rust backend. Read-only inputs
+    are coerced to canonical dtypes so both backends receive byte-identical bytes.
 
     Parameters
     ----------
     regions : NDArray[np.int32]
-        Shape = (n_queries, 3) Regions for each query.
+        Shape = (n_queries, 3) Regions for each query (contig_idx, start, end).
     tracks : NDArray[np.float32]
-        Shape = (n_queries*query_length) Ragged array of tracks.
-    offsets : NDArray[np.int64]
-        Shape = (n_queries + 1) Offsets into ragged track data.
+        Shape = (total_track_len,) Ragged flat array of track values.
+    track_offsets : NDArray[np.int64]
+        Shape = (n_queries + 1,) Offsets into ragged track data.
 
     Returns
     -------
-    out : NDArray[np.void]
-        Shape = (n_intervals) Intervals.
-
-    Notes
-    -----
-    Implementation closely follows [CUDA RLE](https://erkaman.github.io/posts/cuda_rle.html).
+    all_starts : NDArray[np.int32]
+    all_ends : NDArray[np.int32]
+    all_values : NDArray[np.float32]
+    interval_offsets : NDArray[np.int64]
     """
-    n_queries = len(regions)
-
-    n_intervals = np.empty(n_queries, np.int32)
-    scanned_masks = np.empty_like(tracks, np.int64)
-    for query in nb.prange(n_queries):
-        o_s = track_offsets[query]
-        o_e = track_offsets[query + 1]
-        if o_s == o_e:
-            n_intervals[query] = 0
-            continue
-        track = tracks[o_s:o_e]
-        scanned_backward_mask = scanned_masks[o_s:o_e]
-        _scanned_mask(track, scanned_backward_mask)
-        n_intervals[query] = scanned_backward_mask[-1]
-
-    interval_offsets = np.empty(n_queries + 1, np.int64)
-    interval_offsets[0] = 0
-    interval_offsets[1:] = n_intervals.cumsum()
-
-    all_starts = np.empty(interval_offsets[-1], np.int32)
-    all_ends = np.empty(interval_offsets[-1], np.int32)
-    all_values = np.empty(interval_offsets[-1], np.float32)
-    for query in nb.prange(n_queries):
-        o_s = track_offsets[query]
-        o_e = track_offsets[query + 1]
-        if o_s == o_e:
-            continue
-        scanned_backward_mask = scanned_masks[o_s:o_e]
-        compacted_backward_mask = _compact_mask(scanned_backward_mask)
-        track = tracks[o_s:o_e]
-        values = track[compacted_backward_mask[:-1]]
-        s = interval_offsets[query]
-        start = regions[query, 1]
-        compacted_backward_mask += start
-        n = len(values)
-        all_starts[s : s + n] = compacted_backward_mask[:-1]
-        all_ends[s : s + n] = compacted_backward_mask[1:]
-        all_values[s : s + n] = values
-
-    return all_starts, all_ends, all_values, interval_offsets
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def _scanned_mask(track: NDArray[np.float32], out: NDArray[np.int64]):
-    backward_mask = np.empty(len(track), np.bool_)
-    backward_mask[0] = True
-    backward_mask[1:] = track[:-1] != track[1:]
-    out[:] = backward_mask.cumsum()
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def _compact_mask(
-    scanned_backward_mask: NDArray[np.int64],
-):
-    n_elems = len(scanned_backward_mask)
-    n_runs = scanned_backward_mask[-1]
-    compacted_backward_mask = np.empty(n_runs + 1, np.int32)
-    compacted_backward_mask[-1] = n_elems
-    for i in nb.prange(n_elems):
-        if i == 0:
-            compacted_backward_mask[i] = 0
-        # 0 < i < n_elems - 1
-        elif scanned_backward_mask[i] != scanned_backward_mask[i - 1]:
-            compacted_backward_mask[scanned_backward_mask[i] - 1] = i
-    return compacted_backward_mask
+    regions = np.ascontiguousarray(regions, dtype=np.int32)
+    tracks = np.ascontiguousarray(tracks, dtype=np.float32)
+    track_offsets = np.ascontiguousarray(track_offsets, dtype=np.int64)
+    total_bytes = int(track_offsets[-1]) * 4  # f32 = 4 bytes per element
+    return _tracks_to_intervals_rust(
+        regions, tracks, track_offsets, should_parallelize(total_bytes)
+    )
diff --git a/python/genvarloader/_dataset/_migrate.py b/python/genvarloader/_dataset/_migrate.py
new file mode 100644
index 00000000..756dc4b7
--- /dev/null
+++ b/python/genvarloader/_dataset/_migrate.py
@@ -0,0 +1,115 @@
+"""In-place, streaming, idempotent migration of a 1.x AoS dataset to 2.0 SoA.
+
+Per track under ``intervals/<track>/`` and ``annot_intervals/<track>/``:
+stream ``intervals.npy`` (INTERVAL_DTYPE) in record chunks into three contiguous
+``starts/ends/values.npy`` files. Only after every track's SoA is durable do we
+bump ``metadata.json`` (last durable write); then delete the AoS files.
+
+Crash-safety by ordering: an interruption before the metadata bump leaves the
+dataset still-1.x (old AoS intact, re-runnable); an interruption after the bump
+but before deletion leaves both layouts, and a re-run completes the cleanup.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Iterator
+from pathlib import Path
+
+import numpy as np
+from loguru import logger
+from pydantic_extra_types.semantic_version import SemanticVersion
+
+from .._ragged import INTERVAL_DTYPE
+from ._write import DATASET_FORMAT_VERSION
+
+_CHUNK = 1_000_000  # records per streamed block
+
+
+def _track_dirs(path: Path) -> Iterator[Path]:
+    for base in ("intervals", "annot_intervals"):
+        d = path / base
+        if d.is_dir():
+            for child in sorted(d.iterdir()):
+                if child.is_dir():
+                    yield child
+
+
+def _migrate_track(track_dir: Path) -> None:
+    """Stream one track's AoS intervals.npy into SoA starts/ends/values.npy.
+
+    No-op if intervals.npy is absent (already migrated or never AoS). Leaves the
+    AoS file in place; the caller deletes it only after metadata is bumped.
+    """
+    aos = track_dir / "intervals.npy"
+    if not aos.exists():
+        return
+    src = np.memmap(aos, dtype=INTERVAL_DTYPE, mode="r")
+    n = int(src.shape[0])
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="w+", shape=n)
+    ends = np.memmap(track_dir / "ends.npy", dtype=np.int32, mode="w+", shape=n)
+    values = np.memmap(track_dir / "values.npy", dtype=np.float32, mode="w+", shape=n)
+    for i in range(0, n, _CHUNK):
+        j = min(i + _CHUNK, n)
+        block = src[i:j]
+        starts[i:j] = block["start"]
+        ends[i:j] = block["end"]
+        values[i:j] = block["value"]
+    for m in (starts, ends, values):
+        m.flush()
+    logger.info(f"Migrated {n} intervals in {track_dir} to SoA.")
+    del src, starts, ends, values
+
+
+def migrate(path: str | Path) -> None:
+    """Migrate a GVL dataset's track intervals from format 1.x (array-of-structs)
+    to format 2.0 (struct-of-arrays), in place.
+
+    Streaming and crash-safe: peak extra disk is one track's interval store.
+    Genotypes, regions, and reference are untouched. Idempotent — a no-op (with
+    leftover-AoS cleanup) on a dataset that is already 2.0.
+
+    Parameters
+    ----------
+    path
+        Path to the GVL dataset directory.
+    """
+    path = Path(path)
+    meta_path = path / "metadata.json"
+    if not meta_path.exists():
+        raise FileNotFoundError(f"No metadata.json at {meta_path}")
+    raw = json.loads(meta_path.read_text())
+    fv = raw.get("format_version")
+    already_v2 = (
+        fv is not None
+        and SemanticVersion.parse(fv).major >= DATASET_FORMAT_VERSION.major
+    )
+    track_dirs = list(_track_dirs(path))
+
+    if already_v2:
+        # Idempotent cleanup: remove leftover AoS from an interrupted delete.
+        for d in track_dirs:
+            aos = d / "intervals.npy"
+            if aos.exists() and (d / "starts.npy").exists():
+                aos.unlink()
+        return
+
+    # 1. Convert every track to SoA (AoS left in place).
+    for d in track_dirs:
+        _migrate_track(d)
+
+    # 2. Durably bump metadata LAST (atomic replace).
+    raw["format_version"] = str(DATASET_FORMAT_VERSION)
+    tmp = meta_path.with_suffix(".json.tmp")
+    tmp.write_text(json.dumps(raw))
+    with open(tmp, "rb") as f:
+        os.fsync(f.fileno())
+    os.replace(tmp, meta_path)
+
+    # 3. Delete AoS files.
+    for d in track_dirs:
+        aos = d / "intervals.npy"
+        if aos.exists():
+            aos.unlink()
+    logger.info(f"Migrated dataset {path} to format {DATASET_FORMAT_VERSION}.")
diff --git a/python/genvarloader/_dataset/_open.py b/python/genvarloader/_dataset/_open.py
index 988909c3..c720a266 100644
--- a/python/genvarloader/_dataset/_open.py
+++ b/python/genvarloader/_dataset/_open.py
@@ -24,7 +24,7 @@
 from ._reference import Reference
 from ._utils import bed_to_regions
 from ._validate import validate_dataset
-from ._write import Metadata
+from ._write import Metadata, _check_dataset_format_version
 
 if TYPE_CHECKING:
     from ._impl import RaggedDataset
@@ -103,6 +103,7 @@ def _validate_path(self) -> None:
     def _load_metadata(self) -> Metadata:
         with _py_open(self.path / "metadata.json") as f:
             metadata = Metadata.model_validate_json(f.read())
+        _check_dataset_format_version(metadata, self.path)
         validate_dataset(metadata, self.path)
         return metadata
 
diff --git a/python/genvarloader/_dataset/_protocol.py b/python/genvarloader/_dataset/_protocol.py
index 0e26ea11..71984e0f 100644
--- a/python/genvarloader/_dataset/_protocol.py
+++ b/python/genvarloader/_dataset/_protocol.py
@@ -32,8 +32,13 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> T:
         """``flat`` only changes behavior for :class:`Haps` producing
         ``RaggedVariants`` (it returns a flat ``_FlatVariants`` instead); all
-        other reconstructors are already flat-native and accept-and-ignore it."""
+        other reconstructors are already flat-native and accept-and-ignore it.
+
+        ``to_rc`` is a per-row boolean mask (True = reverse-complement this row).
+        On the Rust backend, flat-seq kinds fold RC in-kernel; on numba the
+        caller's post-pass handles it and this param is ignored by each method."""
         ...
diff --git a/python/genvarloader/_dataset/_query.py b/python/genvarloader/_dataset/_query.py
index ff75b6c8..a8d65301 100644
--- a/python/genvarloader/_dataset/_query.py
+++ b/python/genvarloader/_dataset/_query.py
@@ -171,6 +171,10 @@ def _getitem_unspliced(
     regions[:, 1] += jitter_off
     regions[:, 2] = regions[:, 1] + lengths
 
+    to_rc: NDArray[np.bool_] | None = (
+        view.full_regions[r_idx, 3] == -1 if view.rc_neg else None
+    )
+
     recon = view.recon(
         idx=ds_idx,
         r_idx=r_idx,
@@ -180,14 +184,23 @@ def _getitem_unspliced(
         rng=view.rng,
         deterministic=view.deterministic,
         flat=view.flat_output,
+        to_rc=to_rc,
     )
 
     if not isinstance(recon, tuple):
         recon = (recon,)
 
-    if view.rc_neg:
-        to_rc: NDArray[np.bool_] = view.full_regions[r_idx, 3] == -1
-        recon = tuple(reverse_complement_ragged(r, to_rc) for r in recon)
+    if view.rc_neg and to_rc is not None:
+        # Rust: flat-seq kinds (bytes, tracks, annotated-haps) have RC
+        # folded into the kernel or handled Python-side inside the
+        # reconstructor.  Variant types have no in-kernel RC and are
+        # deferred here.  (_FlatVariantWindows RC is a no-op in
+        # reverse_complement_ragged; RaggedVariants is Target 7.)
+        _VARIANT_TYPES = (RaggedVariants, _FlatVariants, _FlatVariantWindows)
+        recon = tuple(
+            reverse_complement_ragged(r, to_rc) if isinstance(r, _VARIANT_TYPES) else r
+            for r in recon
+        )
 
     return recon, squeeze, out_reshape
 
@@ -237,6 +250,27 @@ def _getitem_spliced(
         n_samples=n_samples_sel,
     )
 
+    # Compute the permuted per-element to_rc mask (used for both the in-kernel
+    # pass and the post-pass guard below).
+    to_rc_per_elem: NDArray[np.bool_] | None = None
+    if view.rc_neg:
+        B = regions.shape[0]
+        n_k = int(plan.permutation.shape[0])
+        inner_factor, rem = divmod(n_k, B)
+        if rem != 0:
+            raise AssertionError(
+                "plan.permutation length is not a multiple of len(regions); "
+                "inner-fixed flatten factor inconsistent."
+            )
+        to_rc_unperm = regions[:, 3] == -1
+        if inner_factor == 1:
+            to_rc_flat = to_rc_unperm
+        else:
+            # (B, E) C-order: same value across the inner axis for a given
+            # query. np.repeat gives (B*E,) in (query, inner) C-order.
+            to_rc_flat = np.repeat(to_rc_unperm, inner_factor)
+        to_rc_per_elem = to_rc_flat[plan.permutation]
+
     recon = view.recon(
         idx=ds_idx,
         r_idx=r_idx,
@@ -247,6 +281,7 @@ def _getitem_spliced(
         deterministic=view.deterministic,
         splice_plan=plan,
         flat=view.flat_output,
+        to_rc=to_rc_per_elem,
     )
 
     if not isinstance(recon, tuple):
@@ -256,29 +291,6 @@ def _getitem_spliced(
         tuple[Ragged[np.bytes_ | np.float32] | RaggedAnnotatedHaps, ...], recon
     )
 
-    if view.rc_neg:
-        # Permute the per-region to_rc mask the same way the plan permuted
-        # the kernel queries. The plan acts on a flattened (B, *inner_fixed)
-        # k-index, so first replicate to_rc across the inner axes, then
-        # gather via plan.permutation.
-        B = regions.shape[0]
-        n_k = int(plan.permutation.shape[0])
-        inner_factor, rem = divmod(n_k, B)
-        if rem != 0:
-            raise AssertionError(
-                "plan.permutation length is not a multiple of len(regions); "
-                "inner-fixed flatten factor inconsistent."
-            )
-        to_rc_unperm = regions[:, 3] == -1
-        if inner_factor == 1:
-            to_rc_flat = to_rc_unperm
-        else:
-            # (B, E) C-order: same value across the inner axis for a given
-            # query. np.repeat gives (B*E,) in (query, inner) C-order.
-            to_rc_flat = np.repeat(to_rc_unperm, inner_factor)
-        to_rc_per_elem: NDArray[np.bool_] = to_rc_flat[plan.permutation]
-        recon = tuple(reverse_complement_ragged(r, to_rc_per_elem) for r in recon)
-
     # Rewrap each per-element Ragged with the plan's group_offsets to expose
     # one contiguous spliced element per (row, sample[, inner]) cell. Collapse
     # (n_rows, n_samples) into a single leading "pair" axis so the downstream
diff --git a/python/genvarloader/_dataset/_rag_variants.py b/python/genvarloader/_dataset/_rag_variants.py
index 7003f8e4..04169038 100644
--- a/python/genvarloader/_dataset/_rag_variants.py
+++ b/python/genvarloader/_dataset/_rag_variants.py
@@ -9,6 +9,7 @@
 from seqpro.rag import Ragged
 from seqpro.rag import concatenate as _rag_concatenate
 
+from ._flat_variants import _rc_alleles_rust
 from .._torch import TORCH_AVAILABLE, requires_torch
 
 if TORCH_AVAILABLE:
@@ -294,10 +295,6 @@ def end(self) -> Ragged:
         return self.start - np.clip(ilen, None, 0) + 1
 
     def rc_(self, to_rc: NDArray[np.bool_] | None = None) -> "RaggedVariants":
-        from .._ragged import _COMP
-
-        from seqpro.rag import reverse_complement as _sp_reverse_complement
-
         b = self.shape[0]
         if to_rc is None:
             to_rc = np.ones(b, np.bool_)
@@ -320,9 +317,8 @@ def rc_(self, to_rc: NDArray[np.bool_] | None = None) -> "RaggedVariants":
                 char_off = chars._layout.offsets[-1]  # char-level: (n_alleles+1,)
                 n_alleles = len(char_off) - 1
 
-                # Build a flat allele-level R=1 view on a copy of the data buffer.
+                # Copy the data buffer; rc_alleles mutates it in place.
                 data = chars.data.copy()
-                view = Ragged.from_offsets(data, (n_alleles, None), char_off)
 
                 # Expand to_rc (per-batch, size b) to per-allele (size n_alleles).
                 # Batch element i_b owns alleles var_off[i_b*p] .. var_off[(i_b+1)*p]-1.
@@ -330,7 +326,12 @@ def rc_(self, to_rc: NDArray[np.bool_] | None = None) -> "RaggedVariants":
                 alleles_per_batch = var_off[batch_starts + p] - var_off[batch_starts]
                 allele_mask = np.repeat(to_rc, alleles_per_batch)
 
-                _sp_reverse_complement(view, _COMP, mask=allele_mask, copy=False)
+                _rc_alleles_rust(
+                    data.view(np.uint8),
+                    np.asarray(char_off, np.int64),
+                    np.arange(n_alleles + 1, dtype=np.int64),
+                    allele_mask,
+                )
 
                 # Rebuild as opaque-string field with the same shape and offsets.
                 rebuilt = Ragged.from_offsets(
diff --git a/python/genvarloader/_dataset/_reconstruct.py b/python/genvarloader/_dataset/_reconstruct.py
index 28e73be2..0d6b80e5 100644
--- a/python/genvarloader/_dataset/_reconstruct.py
+++ b/python/genvarloader/_dataset/_reconstruct.py
@@ -23,16 +23,30 @@
 from .._flat import _Flat
 from .._ragged import RaggedAnnotatedHaps, RaggedIntervals, RaggedSeqs, RaggedTracks
 from .._utils import lengths_to_offsets
+from ._genotypes import _as_starts_stops
 from ._haps import _H, Haps, ReconstructionRequest, _NewH, _Variants
 from ._insertion_fill import Repeat5p
 from ._insertion_fill import lower as _lower_insertion_fills
 from ._flat_variants import _FlatVariantWindows
-from ._intervals import intervals_to_tracks
 from ._protocol import Reconstructor
 from ._rag_variants import RaggedVariants
 from ._ref import Ref
 from ._splice import SplicePlan
-from ._tracks import _T, Tracks, TrackType, _NewT, shift_and_realign_tracks_sparse
+from ._tracks import (
+    _T,
+    Tracks,
+    TrackType,
+    _NewT,
+)  # noqa: F401
+from ._utils import _ffi_array
+from .._threads import should_parallelize
+
+# Fused tracks entry (Task 14): intervals → scratch → realign, one FFI crossing.
+# Imported at module level so the spy in test_fused_tracks_parity can monkeypatch it.
+from ..genvarloader import (
+    intervals_and_realign_track_fused as intervals_and_realign_track_fused,
+)
+
 
 # Re-exports for back-compat (callers historically imported these from
 # ``_reconstruct``):
@@ -70,6 +84,7 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> tuple[Any, _T]:
         if splice_plan is not None:
             raise NotImplementedError(
@@ -84,6 +99,7 @@ def __call__(
             rng=rng,
             deterministic=deterministic,
             flat=flat,
+            to_rc=to_rc,
         )
         tracks = self.tracks(
             idx=idx,
@@ -94,6 +110,7 @@ def __call__(
             rng=rng,
             deterministic=deterministic,
             flat=flat,
+            to_rc=to_rc,
         )
         return seqs, tracks
 
@@ -121,6 +138,7 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> tuple[_H, _T]:
         if splice_plan is not None:
             raise NotImplementedError(
@@ -137,6 +155,7 @@ def __call__(
                 output_length=output_length,
                 rng=rng,
                 deterministic=deterministic,
+                to_rc=to_rc,
             )
         )
 
@@ -182,48 +201,72 @@ def __call__(
                     rng.integers(0, np.iinfo(np.uint64).max, dtype=np.uint64)
                 )
 
+            # Pre-compute (2, n) geno_offsets once for the fused Rust path
+            # (avoids re-computing _as_starts_stops n_tracks times).
+            _geno_offsets_2d = _as_starts_stops(self.haps.genotypes.offsets)
+
             for track_ofst, (name, tracktype) in enumerate(
                 self.tracks.active_tracks.items()
             ):
                 intervals = self.tracks.intervals[name]
 
-                # ragged (b l)
-                _tracks = np.empty(track_ofsts_per_t[-1], np.float32)
-
                 if tracktype is TrackType.SAMPLE:
                     o_idx = idx
                 else:
                     o_idx = r_idx
 
-                intervals_to_tracks(
-                    offset_idxs=o_idx,  # (b)
-                    starts=regions[:, 1],  # (b)
-                    itv_starts=intervals.starts.data,
-                    itv_ends=intervals.ends.data,
-                    itv_values=intervals.values.data,
-                    itv_offsets=intervals.starts.offsets,
-                    out=_tracks,  # (b*l)
-                    out_offsets=track_ofsts_per_t,  # (b+1)
-                )
-
                 _out = out[track_ofst * n_per_track : (track_ofst + 1) * n_per_track]
-                shift_and_realign_tracks_sparse(
-                    out=_out,  # (b*p*l)
-                    out_offsets=out_ofsts_per_t,  # (b*p+1)
-                    regions=regions,  # (b, 3)
-                    shifts=shifts,  # (b p)
-                    geno_offset_idx=geno_idx,  # (b p)
-                    geno_v_idxs=self.haps.genotypes.data,  # (r*s*p*v)
-                    geno_offsets=self.haps.genotypes.offsets,  # (r*s*p+1)
-                    v_starts=self.haps.variants.start,  # (tot_v)
-                    ilens=self.haps.variants.ilen,  # (tot_v)
-                    tracks=_tracks,  # ragged (b l)
-                    track_offsets=track_ofsts_per_t,  # (b+1)
-                    params=strat_params[track_ofst],
-                    keep=keep,  # (b*p*v)
-                    keep_offsets=keep_offsets,  # (b*p+1)
+
+                # Fused path (Rust): one FFI crossing, no Python-side
+                # intermediate buffer.  Replaces:
+                #   _tracks = np.empty(...)                (audit T2)
+                #   intervals_to_tracks(...)               (FFI crossing #3)
+                #   shift_and_realign_tracks_sparse(...)   (FFI crossing #4)
+                #
+                # _out is a contiguous f32 slice of the pre-allocated `out`
+                # buffer (np.empty, step=1).  No ascontiguousarray needed for
+                # `out`; the fused entry writes in-place into its buffer.
+                # Expand per-query to_rc to per-(query, hap) for the track kernel.
+                # out_ofsts_per_t is (b*p+1); ploidy = geno_idx.shape[-1].
+                _ploidy = geno_idx.shape[-1]
+                _to_rc_hap = (
+                    None
+                    if to_rc is None
+                    else np.ascontiguousarray(np.repeat(to_rc, _ploidy), np.bool_)
+                )
+                intervals_and_realign_track_fused(
+                    out=_out,
+                    out_offsets=np.ascontiguousarray(out_ofsts_per_t, np.int64),
+                    regions=np.ascontiguousarray(regions, np.int32),
+                    shifts=np.ascontiguousarray(shifts, np.int32),
+                    geno_offset_idx=np.ascontiguousarray(geno_idx, np.int64),
+                    geno_v_idxs=_ffi_array(
+                        self.haps.genotypes.data, np.int32, "geno_v_idxs"
+                    ),
+                    geno_offsets=_geno_offsets_2d,
+                    v_starts=self.haps.ffi_static.v_starts,
+                    ilens=self.haps.ffi_static.ilens,
+                    offset_idxs=np.ascontiguousarray(o_idx, np.int64),
+                    itv_starts=_ffi_array(
+                        intervals.starts.data, np.int32, "itv_starts"
+                    ),
+                    itv_ends=_ffi_array(intervals.ends.data, np.int32, "itv_ends"),
+                    itv_values=_ffi_array(
+                        intervals.values.data, np.float32, "itv_values"
+                    ),
+                    itv_offsets=_ffi_array(
+                        intervals.starts.offsets, np.int64, "itv_offsets"
+                    ),
+                    track_offsets=np.ascontiguousarray(track_ofsts_per_t, np.int64),
+                    params=np.ascontiguousarray(strat_params[track_ofst], np.float64),
                     strategy_id=int(strat_ids[track_ofst]),
-                    base_seed=base_seed,
+                    base_seed=int(base_seed),
+                    keep=None if keep is None else np.ascontiguousarray(keep, np.bool_),
+                    keep_offsets=None
+                    if keep_offsets is None
+                    else np.ascontiguousarray(keep_offsets, np.int64),
+                    to_rc=_to_rc_hap,
+                    parallel=should_parallelize(int(out_ofsts_per_t[-1]) * 4),
                 )
 
             out_shape = (
diff --git a/python/genvarloader/_dataset/_ref.py b/python/genvarloader/_dataset/_ref.py
index da96329f..c3043dd9 100644
--- a/python/genvarloader/_dataset/_ref.py
+++ b/python/genvarloader/_dataset/_ref.py
@@ -36,6 +36,7 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> Ragged[np.bytes_]:
         batch_size = len(idx)
 
@@ -52,13 +53,14 @@ def __call__(
             # (b+1)
             out_offsets = lengths_to_offsets(out_lengths)
 
-            # ragged (b ~l)
+            # ragged (b ~l) — on Rust backend, RC is folded into the kernel.
             ref = get_reference(
                 regions=regions,
                 out_offsets=out_offsets,
                 reference=self.reference.reference,
                 ref_offsets=self.reference.offsets,
                 pad_char=self.reference.pad_char,
+                to_rc=to_rc,
             )  # uint8 flat buffer
 
             return cast(
@@ -67,10 +69,12 @@ def __call__(
             )
 
         # Spliced path: delegate to the shared kernel-dispatch helper.
+        # to_rc is the permuted per-element mask from _getitem_spliced.
         return _fetch_spliced_ref(
             regions=regions,
             plan=splice_plan,
             reference=self.reference.reference,
             ref_offsets=self.reference.offsets,
             pad_char=self.reference.pad_char,
+            to_rc=to_rc,
         )
diff --git a/python/genvarloader/_dataset/_reference.py b/python/genvarloader/_dataset/_reference.py
index a488222f..4d95f794 100644
--- a/python/genvarloader/_dataset/_reference.py
+++ b/python/genvarloader/_dataset/_reference.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Generic, Literal, TypeVar, cast, overload
 
-import numba as nb
 import numpy as np
 import polars as pl
 from genoray._utils import ContigNormalizer
@@ -16,14 +15,15 @@
 
 from .._flat import _Flat
 from .._fasta_cache import ensure_cache
-from .._ragged import RaggedSeqs, reverse_complement_masked, to_padded
+from .._ragged import RaggedSeqs, to_padded
 from .._torch import TORCH_AVAILABLE, get_dataloader, no_torch_error
 from .._types import Idx, StrIdx
 from .._utils import is_dtype
 from ._indexing import is_str_arr, s2i
 from ._splice import SpliceMap, SplicePlan, build_splice_plan
-from ._utils import bed_to_regions, padded_slice
+from ._utils import bed_to_regions
 from .._threads import should_parallelize
+from ..genvarloader import get_reference as _get_reference_rust_ffi
 
 INT64_MAX = np.iinfo(np.int64).max
 
@@ -130,57 +130,21 @@ def fetch(
 
         lengths = ends - starts
         offsets = lengths_to_offsets(lengths)
-        seqs = np.empty(offsets[-1], np.uint8)
-        kernel = (
-            _fetch_impl_par if should_parallelize(int(offsets[-1])) else _fetch_impl_ser
+        regions = np.stack(
+            [
+                np.asarray(c_idxs, np.int32),
+                np.asarray(starts, np.int32),
+                np.asarray(ends, np.int32),
+            ],
+            axis=1,
         )
-        kernel(
-            c_idxs,
-            starts,
-            ends,
-            self.reference,
-            self.offsets,
-            self.pad_char,
-            seqs,
-            offsets,
+        seqs = get_reference(
+            regions, offsets, self.reference, self.offsets, int(self.pad_char)
         )
-
         seqs = Ragged.from_offsets(seqs.view("S1"), (len(contigs), None), offsets)
-
         return seqs
 
 
-@nb.njit(nogil=True, cache=True, inline="always")
-def _fetch_row(
-    i, c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets
-):
-    r_s, r_e = ref_offsets[c_idxs[i]], ref_offsets[c_idxs[i] + 1]
-    o_s, o_e = out_offsets[i], out_offsets[i + 1]
-    padded_slice(reference[r_s:r_e], starts[i], ends[i], pad_char, out[o_s:o_e])
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def _fetch_impl_par(
-    c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets
-):
-    for i in nb.prange(len(c_idxs)):
-        _fetch_row(
-            i, c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets
-        )
-    return out
-
-
-@nb.njit(nogil=True, cache=True)
-def _fetch_impl_ser(
-    c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets
-):
-    for i in range(len(c_idxs)):
-        _fetch_row(
-            i, c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets
-        )
-    return out
-
-
 T = TypeVar("T", NDArray[np.bytes_], RaggedSeqs)
 
 
@@ -461,22 +425,21 @@ def _getitem_spliced(self, idx: Idx) -> T:
         # Delegate kernel dispatch to the shared helper (eliminates duplication
         # with Ref.__call__'s splice branch). Returns a per-element _Flat (n_elements, None)
         # already in permuted write order.
+        to_rc_perm: "NDArray[np.bool_] | None" = None
+        if self.rc_neg:
+            to_rc_unperm = regions[:, 3] == -1
+            if to_rc_unperm.any():
+                to_rc_perm = to_rc_unperm[plan.permutation]
+
         per_elem = _fetch_spliced_ref(
             regions=regions,
             plan=plan,
             reference=self.reference.reference,
             ref_offsets=self.reference.offsets,
             pad_char=self.reference.pad_char,
+            to_rc=to_rc_perm,  # Rust: RC done in kernel
         )
 
-        if self.rc_neg:
-            to_rc_unperm = regions[:, 3] == -1
-            if to_rc_unperm.any():
-                from .._ragged import _COMP
-
-                to_rc_perm = to_rc_unperm[plan.permutation]
-                per_elem = per_elem.reverse_masked(to_rc_perm, comp=_COMP)
-
         # Rewrap with group_offsets at (n_rows, None) — skip the (n_rows, 1, None)
         # + squeeze(1) trick since RefDataset has no sample axis.
         ref = cast(
@@ -541,22 +504,24 @@ def _getitem_unspliced(self, idx: Idx) -> T:
         out_offsets = lengths_to_offsets(out_lengths)
 
         # ragged (b ~l)
+        # On the Rust backend, RC is folded into the kernel via to_rc.
+        # get_reference handles to_rc in kernel (Rust)
+        # below preserves the original behaviour.
+        _to_rc_arr = regions[:, 3] == -1
+        _to_rc: "NDArray[np.bool_] | None" = _to_rc_arr if _to_rc_arr.any() else None
         ref = get_reference(
             regions=regions,
             out_offsets=out_offsets,
             reference=self.reference.reference,
             ref_offsets=self.reference.offsets,
             pad_char=self.reference.pad_char,
+            to_rc=_to_rc,
         ).view("S1")
 
         ref = cast(
             Ragged[np.bytes_], Ragged.from_offsets(ref, (batch_size, None), out_offsets)
         )
 
-        to_rc = regions[:, 3] == -1
-        if to_rc.any():
-            ref = reverse_complement_masked(ref, to_rc)
-
         if out_reshape is not None:
             ref = ref.reshape(out_reshape)
 
@@ -565,7 +530,7 @@ def _getitem_unspliced(self, idx: Idx) -> T:
         elif self.output_length == "variable":
             out = to_padded(ref, pad_value=bytes([self.reference.pad_char]))
         else:
-            out = ref.to_numpy()
+            out = ref.to_numpy(validate=False)
 
         if squeeze:
             out = out.squeeze(0)
@@ -682,31 +647,18 @@ def to_dataloader(
         )
 
 
-@nb.njit(nogil=True, cache=True, inline="always")
-def _get_reference_row(i, regions, out_offsets, reference, ref_offsets, pad_char, out):
-    o_s, o_e = out_offsets[i], out_offsets[i + 1]
-    c_idx, start, end = regions[i, 0], regions[i, 1], regions[i, 2]
-    c_s = ref_offsets[c_idx]
-    c_e = ref_offsets[c_idx + 1]
-    padded_slice(reference[c_s:c_e], start, end, pad_char, out[o_s:o_e])
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def _get_reference_par(regions, out_offsets, reference, ref_offsets, pad_char, out):
-    for i in nb.prange(len(regions)):
-        _get_reference_row(
-            i, regions, out_offsets, reference, ref_offsets, pad_char, out
-        )
-    return out
-
-
-@nb.njit(nogil=True, cache=True)
-def _get_reference_ser(regions, out_offsets, reference, ref_offsets, pad_char, out):
-    for i in range(len(regions)):
-        _get_reference_row(
-            i, regions, out_offsets, reference, ref_offsets, pad_char, out
-        )
-    return out
+def _get_reference_rust(
+    regions, out_offsets, reference, ref_offsets, pad_char, parallel, to_rc=None
+):
+    return _get_reference_rust_ffi(
+        np.ascontiguousarray(regions, np.int32),
+        np.ascontiguousarray(out_offsets, np.int64),
+        np.ascontiguousarray(reference, np.uint8),
+        np.ascontiguousarray(ref_offsets, np.int64),
+        int(pad_char),
+        bool(parallel),
+        to_rc,
+    )
 
 
 def get_reference(
@@ -715,14 +667,18 @@ def get_reference(
     reference: NDArray[np.integer],
     ref_offsets: NDArray[np.integer],
     pad_char: int,
+    to_rc: "NDArray[np.bool_] | None" = None,
 ) -> NDArray[np.uint8]:
-    out = np.empty(out_offsets[-1], np.uint8)
-    kernel = (
-        _get_reference_par
-        if should_parallelize(int(out_offsets[-1]))
-        else _get_reference_ser
+    """Fetch reference-genome bytes for a batch of regions.
+
+    ``to_rc`` is a per-query boolean mask (True = reverse-complement that query).
+    The mask is consumed in-kernel by the Rust backend.
+    """
+    parallel = should_parallelize(int(out_offsets[-1]))
+    _to_rc = None if to_rc is None else np.ascontiguousarray(to_rc, np.bool_)
+    return _get_reference_rust(
+        regions, out_offsets, reference, ref_offsets, pad_char, parallel, _to_rc
     )
-    return kernel(regions, out_offsets, reference, ref_offsets, pad_char, out)
 
 
 def _fetch_spliced_ref(
@@ -731,12 +687,17 @@ def _fetch_spliced_ref(
     reference: NDArray[np.uint8],
     ref_offsets: NDArray[np.int64],
     pad_char: int,
+    to_rc: "NDArray[np.bool_] | None" = None,
 ) -> "_Flat[np.bytes_]":
     """Fetch reference bytes in splice-permuted order, returning a per-element
     flat ragged of shape ``(n_elements, None)``.
 
     This is the kernel-dispatch core shared by :class:`Ref.__call__`'s splice
     branch and :meth:`RefDataset._getitem_spliced`.
+
+    ``to_rc`` is the permuted per-element boolean mask (True = RC that element).
+    On the Rust backend it is passed into the ``get_reference`` kernel directly;
+    the Rust backend handles it in-kernel.
     """
     permuted_regions = regions[plan.permutation]
     raw = get_reference(
@@ -745,6 +706,7 @@ def _fetch_spliced_ref(
         reference=reference,
         ref_offsets=ref_offsets,
         pad_char=pad_char,
+        to_rc=to_rc,
     )  # uint8 flat buffer
     n_elements = plan.permuted_lengths.shape[0]
     return cast(
@@ -794,3 +756,30 @@ def __getitem__(self, idx: list[int]):
 
 else:
     TorchDataset = no_torch_error
+
+
+def _get_reference_row(i, regions, out_offsets, reference, ref_offsets, pad_char, out):
+    """Extract a single reference row with padding (pure Python fallback)."""
+    from ._utils import padded_slice
+
+    o_s, o_e = out_offsets[i], out_offsets[i + 1]
+    c_idx, start, end = int(regions[i, 0]), int(regions[i, 1]), int(regions[i, 2])
+    c_s = int(ref_offsets[c_idx])
+    c_e = int(ref_offsets[c_idx + 1])
+    padded_slice(reference[c_s:c_e], start, end, pad_char, out[o_s:o_e])
+
+
+def _get_reference_ser(regions, out_offsets, reference, ref_offsets, pad_char, out):
+    """Extract reference rows serially (pure Python fallback)."""
+    for i in range(len(regions)):
+        _get_reference_row(
+            i, regions, out_offsets, reference, ref_offsets, pad_char, out
+        )
+    return out
+
+
+def _get_reference_par(regions, out_offsets, reference, ref_offsets, pad_char, out):
+    """Extract reference rows (parallel flavor; falls back to serial in pure Python)."""
+    return _get_reference_ser(
+        regions, out_offsets, reference, ref_offsets, pad_char, out
+    )
diff --git a/python/genvarloader/_dataset/_tracks.py b/python/genvarloader/_dataset/_tracks.py
index 71b87e36..fc2dc11a 100644
--- a/python/genvarloader/_dataset/_tracks.py
+++ b/python/genvarloader/_dataset/_tracks.py
@@ -7,15 +7,15 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal, TypeVar, cast
 
-import numba as nb
 import numpy as np
 from einops import repeat
 from numpy.typing import NDArray
 from seqpro.rag import Ragged
 
 from .._flat import _Flat
-from .._ragged import INTERVAL_DTYPE, FlatIntervals, RaggedIntervals, RaggedTracks
+from .._ragged import FlatIntervals, RaggedIntervals, RaggedTracks
 from .._utils import lengths_to_offsets
+from ._genotypes import _as_starts_stops
 from ._indexing import DatasetIndexer
 from ._insertion_fill import InsertionFill, Repeat5p
 from ._intervals import intervals_to_tracks
@@ -34,112 +34,12 @@
 _INTERPOLATE = 4
 
 
-@nb.njit(nogil=True, cache=True, inline="always")
-def _xorshift64(x: np.uint64) -> np.uint64:
-    """Single round of xorshift64. Pure function — safe in parallel."""
-    x ^= x << np.uint64(13)
-    x ^= x >> np.uint64(7)
-    x ^= x << np.uint64(17)
-    return x
+from ..genvarloader import (  # noqa: E402
+    shift_and_realign_tracks_sparse as _shift_and_realign_tracks_sparse_rust,
+)
 
 
-@nb.njit(nogil=True, cache=True, inline="always")
-def _hash4(a: np.uint64, b: np.uint64, c: np.uint64, d: np.uint64) -> np.uint64:
-    """Hash four uint64 values into one. Used as a per-position deterministic seed."""
-    h = a
-    h = _xorshift64(h ^ b)
-    h = _xorshift64(h ^ c)
-    h = _xorshift64(h ^ d)
-    return h
-
-
-@nb.njit(nogil=True, cache=True, inline="always")
-def _apply_insertion_fill(
-    out: NDArray[np.floating],
-    out_idx: int,
-    writable_length: int,
-    v_len: int,
-    track: NDArray[np.floating],
-    v_rel_pos: int,
-    strategy_id: int,
-    params: NDArray[np.float64],
-    base_seed: np.uint64,
-    query: int,
-    hap: int,
-):
-    """Write `writable_length` values at out[out_idx:] according to strategy.
-
-    v_len is the total length of the insertion stretch (v_diff + 1); the kernel
-    may truncate the actual write to writable_length when running out of output.
-    """
-    track_len = len(track)
-
-    # The _REPEAT_5P branch is unreachable from the outer kernel (which short-circuits
-    # this strategy before calling). Kept for completeness and direct-helper-call safety.
-    if strategy_id == _REPEAT_5P:
-        val = track[v_rel_pos]
-        for i in range(writable_length):
-            out[out_idx + i] = val
-
-    elif strategy_id == _REPEAT_5P_NORM:
-        val = track[v_rel_pos] / v_len
-        for i in range(writable_length):
-            out[out_idx + i] = val
-
-    elif strategy_id == _CONSTANT:
-        val = params[0]
-        for i in range(writable_length):
-            out[out_idx + i] = val
-
-    elif strategy_id == _FLANK_SAMPLE:
-        width = np.int64(params[0])
-        pool_lo = max(0, v_rel_pos - width)
-        pool_hi = min(track_len - 1, v_rel_pos + width)
-        pool_size = pool_hi - pool_lo + 1
-        for i in range(writable_length):
-            seed = _hash4(
-                base_seed,
-                np.uint64(query),
-                np.uint64(hap),
-                np.uint64(out_idx + i),
-            )
-            offset = np.int64(seed % np.uint64(pool_size))
-            out[out_idx + i] = track[pool_lo + offset]
-
-    elif strategy_id == _INTERPOLATE:
-        order = np.int64(params[0])
-        # Number of anchor values per side: ceil((order+1)/2)
-        k = (order + 1 + 1) // 2  # ceil((order+1)/2)
-        # Anchors: 5' side at x = 0, -1, -2, ...; 3' side at x = v_len, v_len+1, ...
-        n_anchors = 2 * k
-        xs = np.empty(n_anchors, dtype=np.float64)
-        ys = np.empty(n_anchors, dtype=np.float64)
-        for j in range(k):
-            ref_idx = v_rel_pos - j
-            ref_idx = max(ref_idx, 0)
-            xs[j] = -float(j)
-            ys[j] = track[ref_idx]
-        for j in range(k):
-            ref_idx = v_rel_pos + 1 + j
-            ref_idx = min(ref_idx, track_len - 1)
-            xs[k + j] = float(v_len) + float(j)
-            ys[k + j] = track[ref_idx]
-        # Lagrange interpolation at each output position in [0, writable_length)
-        for i in range(writable_length):
-            x = float(i)
-            acc = 0.0
-            for a in range(n_anchors):
-                term = ys[a]
-                for b in range(n_anchors):
-                    if b == a:
-                        continue
-                    term *= (x - xs[b]) / (xs[a] - xs[b])
-                acc += term
-            out[out_idx + i] = acc
-
-
-@nb.njit(parallel=True, nogil=True, cache=True)
-def shift_and_realign_tracks_sparse(
+def _shift_and_realign_tracks_sparse_rust_wrapper(
     out: NDArray[np.floating],
     out_offsets: NDArray[np.integer],
     regions: NDArray[np.integer],
@@ -156,248 +56,31 @@ def shift_and_realign_tracks_sparse(
     keep_offsets: NDArray[np.integer] | None = None,
     strategy_id: int = 0,
     base_seed: np.uint64 = np.uint64(0),
-):
-    """Shift and realign tracks to correspond to haplotypes.
-
-    Parameters
-    ----------
-    out : NDArray[np.float32]
-        Ragged array with shape (batch, ploidy). Shifted and re-aligned tracks.
-    out_offsets : NDArray[np.int64]
-        Shape = (batch*ploidy + 1) Offsets into out.
-    regions : NDArray[np.int32]
-        Shape = (batch, 3) Regions, each is (contig_idx, start, end).
-    shifts : NDArray[np.int32]
-        Shape = (batch, ploidy) Shifts for each haplotype.
-    geno_offset_idx : NDArray[np.intp]
-        Shape = (batch, ploidy) Indices into offsets for each region.
-    geno_v_idxs : NDArray[np.int32]
-        Shape = (variants) Indices of variants.
-    geno_offsets : NDArray[np.uint32]
-        Shape = (tot_regions*samples*ploidy + 1) Offsets into variant idxs.
-    positions : NDArray[np.int32]
-        Shape = (total_variants) Positions of variants.
-    sizes : NDArray[np.int32]
-        Shape = (total_variants) Sizes of variants.
-    tracks : NDArray[np.float32]
-        Shape = (batch*ploidy*length) Tracks.
-    track_offsets : NDArray[np.int64]
-        Shape = (batch + 1) Offsets into tracks.
-    keep : Optional[NDArray[np.bool_]]
-        Shape = (batch*ploidy*variants) Keep mask for genotypes.
-    keep_offsets : Optional[NDArray[np.int64]]
-        Shape = (batch*ploidy + 1) Offsets into keep.
-    """
-    n_regions, ploidy = geno_offset_idx.shape
-    for query in nb.prange(n_regions):
-        t_s, t_e = track_offsets[query], track_offsets[query + 1]
-        q_track = tracks[t_s:t_e]
-        # assumes start is never altered upstream by differing hap lengths (true for left-aligned variants)
-        q_start = regions[query, 1]
-
-        for hap in nb.prange(ploidy):
-            o_idx = geno_offset_idx[query, hap]
-
-            k_idx = query * ploidy + hap
-            if keep is not None and keep_offsets is not None:
-                qh_keep = keep[keep_offsets[k_idx] : keep_offsets[k_idx + 1]]
-            else:
-                qh_keep = None
-
-            out_s, out_e = out_offsets[k_idx], out_offsets[k_idx + 1]
-            qh_out = out[out_s:out_e]
-            qh_shifts = shifts[query, hap]
-
-            shift_and_realign_track_sparse(
-                offset_idx=o_idx,
-                geno_v_idxs=geno_v_idxs,
-                geno_offsets=geno_offsets,
-                v_starts=v_starts,
-                ilens=ilens,
-                shift=qh_shifts,
-                track=q_track,
-                query_start=q_start,
-                out=qh_out,
-                params=params,
-                keep=qh_keep,
-                strategy_id=strategy_id,
-                base_seed=base_seed,
-                query=query,
-                hap=hap,
-            )
-
-
-@nb.njit(nogil=True, cache=True)
-def shift_and_realign_track_sparse(
-    offset_idx: int,
-    geno_v_idxs: NDArray[np.integer],
-    geno_offsets: NDArray[np.integer],
-    v_starts: NDArray[np.integer],
-    ilens: NDArray[np.integer],
-    shift: int,
-    track: NDArray[np.floating],
-    query_start: int,
-    out: NDArray[np.floating],
-    params: NDArray[np.float64],
-    keep: NDArray[np.bool_] | None = None,
-    strategy_id: int = 0,
-    base_seed: np.uint64 = np.uint64(0),
-    query: int = 0,
-    hap: int = 0,
-):
-    """Shift and realign a track to correspond to a haplotype.
-
-    Parameters
-    ----------
-    offset_idx : NDArray[np.int32]
-        Shape = (n_variants) Genotypes of variants.
-    positions : NDArray[np.int32]
-        Shape = (total_variants) Positions of variants.
-    sizes : NDArray[np.int32]
-        Shape = (total_variants) Sizes of variants.
-    shift : int
-        Total amount to shift by.
-    track : NDArray[np.float32]
-        Shape = (length) Track.
-    out : NDArray[np.uint8]
-        Shape = (out_length) Shifted and re-aligned track.
-    keep : Optional[NDArray[np.bool_]]
-        Shape = (n_variants) Keep mask for genotypes.
-    """
-    if geno_offsets.ndim == 1:
-        o_s, o_e = geno_offsets[offset_idx], geno_offsets[offset_idx + 1]
-    else:
-        o_s, o_e = geno_offsets[:, offset_idx]
-    _variant_idxs = geno_v_idxs[o_s:o_e]
-    length = len(out)
-    n_variants = len(_variant_idxs)
-
-    if n_variants == 0:
-        # guaranteed to have shift = 0
-        out[:] = track[:length]
-        return
-
-    # where to get next track value
-    track_idx = 0
-    # where to put next value
-    out_idx = 0
-    # how much we've shifted
-    shifted = 0
-
-    for v in range(n_variants):
-        if keep is not None and not keep[v]:
-            continue
-
-        variant: np.int32 = _variant_idxs[v]
-
-        # position of variant relative to ref from fetch(contig, start, q_end)
-        # i.e. has been put into same coordinate system as ref_idx
-        v_rel_pos = v_starts[variant] - query_start
-        v_diff = ilens[variant]
-        # +1 assumes atomized variants, exactly 1 nt shared between REF and ALT
-        v_rel_end = v_rel_pos - min(0, v_diff) + 1
-
-        # variant is a DEL spanning start
-        if v_diff < 0 and v_rel_pos < 0 and v_rel_end >= 0:
-            track_idx = v_rel_end
-            continue
-
-        # overlapping variants
-        # v_rel_pos < ref_idx only if we see an ALT at a given position a second
-        # time or more. We'll do what bcftools consensus does and only use the
-        # first ALT variant we find.
-        if v_rel_pos < track_idx:
-            continue
-
-        v_len = max(0, v_diff) + 1
-
-        # handle shift
-        if shifted < shift:
-            ref_shift_dist = v_rel_pos - track_idx
-            # need more than variant to finish shift
-            if shifted + ref_shift_dist + v_len < shift:
-                # skip the variant
-                continue
-            # can finish shift without using variant
-            elif shifted + ref_shift_dist >= shift:
-                track_idx += shift - shifted
-                shifted = shift
-                # can still use the variant and whatever ref is left between
-                # ref_idx and the variant
-            # ref + (some of) variant is enough to finish shift
-            else:
-                # how much left to shift - amount of ref we can use
-                allele_start_idx = shift - shifted - ref_shift_dist
-                shifted = shift
-                #! without if statement, parallel=True can cause a SystemError!
-                # * parallel jit cannot handle changes in array dimension.
-                # * without this, allele can change from a 1D array to a 0D
-                # * array.
-                if allele_start_idx == v_len:
-                    # consume track up to end of variant
-                    track_idx = v_rel_end
-                    continue
-                # consume track up to start of variant
-                track_idx = v_rel_pos
-                # adjust variant length
-                v_len -= allele_start_idx
-
-        # SNPs (but not MNPs because we don't have ALT length, MNPs are not atomic)
-        # skipped because for tracks they always match the reference
-        if v_diff == 0:
-            continue
-
-        # add track values up to variant
-        track_len = v_rel_pos - track_idx
-        if out_idx + track_len >= length:
-            # track will get written by final clause
-            # handles case where extraneous variants downstream of the haplotype were provided
-            break
-        out[out_idx : out_idx + track_len] = track[track_idx : track_idx + track_len]
-        out_idx += track_len
-
-        # indels (substitutions are skipped above and then handled by above clause)
-        writable_length = min(v_len, length - out_idx)
-        if v_diff > 0 and strategy_id != _REPEAT_5P:
-            _apply_insertion_fill(
-                out=out,
-                out_idx=out_idx,
-                writable_length=writable_length,
-                v_len=v_len,
-                track=track,
-                v_rel_pos=v_rel_pos,
-                strategy_id=strategy_id,
-                params=params,
-                base_seed=base_seed,
-                query=query,
-                hap=hap,
-            )
-        else:
-            # Deletions and Repeat5p insertions: original behavior.
-            for i in range(writable_length):
-                out[out_idx + i] = track[v_rel_pos]
-        out_idx += writable_length
-        track_idx = v_rel_end
-
-        if out_idx >= length:
-            break
-
-    if shifted < shift:
-        # need to shift the rest of the track
-        track_idx += shift - shifted
-        track_idx = min(track_idx, len(track))
-        shifted = shift
-
-    # fill rest with track and pad with 0
-    unfilled_length = length - out_idx
-    if unfilled_length > 0:
-        writable_ref = min(unfilled_length, len(track) - track_idx)
-        out_end_idx = out_idx + writable_ref
-        ref_end_idx = track_idx + writable_ref
-        out[out_idx:out_end_idx] = track[track_idx:ref_end_idx]
-
-        if out_end_idx < length:
-            out[out_end_idx:] = 0
+    parallel: bool = False,
+) -> None:
+    """Rust wrapper: normalizes geno_offsets to (2, n) form then dispatches."""
+    geno_offsets_2d = _as_starts_stops(geno_offsets)
+    _shift_and_realign_tracks_sparse_rust(
+        out=out,
+        out_offsets=np.asarray(out_offsets, dtype=np.int64),
+        regions=np.asarray(regions, dtype=np.int32),
+        shifts=np.asarray(shifts, dtype=np.int32),
+        geno_offset_idx=np.asarray(geno_offset_idx, dtype=np.int64),
+        geno_v_idxs=np.asarray(geno_v_idxs, dtype=np.int32),
+        geno_offsets=geno_offsets_2d,
+        v_starts=np.asarray(v_starts, dtype=np.int32),
+        ilens=np.asarray(ilens, dtype=np.int32),
+        tracks=np.asarray(tracks, dtype=np.float32),
+        track_offsets=np.asarray(track_offsets, dtype=np.int64),
+        params=np.asarray(params, dtype=np.float64),
+        keep=keep,
+        keep_offsets=np.asarray(keep_offsets, dtype=np.int64)
+        if keep_offsets is not None
+        else None,
+        strategy_id=int(strategy_id),
+        base_seed=int(base_seed),
+        parallel=parallel,
+    )
 
 
 # -----------------------------------------------------------------------------
@@ -511,7 +194,7 @@ def _ragged_stack_tracks(tracks: "list[Ragged]") -> "Ragged":
 
 
 # -----------------------------------------------------------------------------
-# Tracks reconstructor (Python-level wrapper around the numba kernels above).
+# Tracks reconstructor.
 # -----------------------------------------------------------------------------
 
 
@@ -648,19 +331,13 @@ def _open_intervals(path: Path, n_regions: int, n_samples: int) -> RaggedInterva
             shape = (n_regions, None)
         else:
             shape = (n_regions, n_samples, None)
-        itvs = np.memmap(
-            path / "intervals.npy",
-            dtype=INTERVAL_DTYPE,
-            mode="r",
-        )
-        offsets = np.memmap(
-            path / "offsets.npy",
-            dtype=np.int64,
-            mode="r",
-        )
-        starts = Ragged.from_offsets(itvs["start"], shape, offsets)
-        ends = Ragged.from_offsets(itvs["end"], shape, offsets)
-        values = Ragged.from_offsets(itvs["value"], shape, offsets)
+        starts_data = np.memmap(path / "starts.npy", dtype=np.int32, mode="r")
+        ends_data = np.memmap(path / "ends.npy", dtype=np.int32, mode="r")
+        values_data = np.memmap(path / "values.npy", dtype=np.float32, mode="r")
+        offsets = np.memmap(path / "offsets.npy", dtype=np.int64, mode="r")
+        starts = Ragged.from_offsets(starts_data, shape, offsets)
+        ends = Ragged.from_offsets(ends_data, shape, offsets)
+        values = Ragged.from_offsets(values_data, shape, offsets)
         return RaggedIntervals(starts, ends, values)
 
     def to_kind(self, kind: type[_NewT]) -> Tracks[_NewT]:
@@ -678,6 +355,7 @@ def __call__(
         deterministic: bool,
         splice_plan: SplicePlan | None = None,
         flat: bool = False,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> _T:
         if splice_plan is not None and not issubclass(self.kind, RaggedTracks):
             raise NotImplementedError(
@@ -685,7 +363,7 @@ def __call__(
             )
         if issubclass(self.kind, RaggedTracks):
             out = self._call_float32(
-                idx, r_idx, regions, output_length, splice_plan=splice_plan
+                idx, r_idx, regions, output_length, splice_plan=splice_plan, to_rc=to_rc
             )
         else:
             out = self._call_intervals(idx, flat=flat)
@@ -698,6 +376,7 @@ def _call_float32(
         regions: NDArray[np.int32],
         output_length: Literal["ragged", "variable"] | int,
         splice_plan: SplicePlan | None = None,
+        to_rc: "NDArray[np.bool_] | None" = None,
     ) -> RaggedTracks:
         batch_size = len(idx)
 
@@ -740,8 +419,19 @@ def _call_float32(
                 )
 
             out_shape = (len(idx), len(self.active_tracks), None)
-            # flat (b t l)
-            return cast(RaggedTracks, _Flat.from_offsets(out, out_shape, out_offsets))
+            result = _Flat.from_offsets(out, out_shape, out_offsets)
+
+            # Apply reversal in Python (intervals_to_tracks has no to_rc; no indel
+            # realignment is needed here).  Each query's n_tracks rows share the
+            # same to_rc value, so repeat across tracks.
+            if to_rc is not None:
+                n_tracks = len(self.active_tracks)
+                to_rc_expanded = np.ascontiguousarray(
+                    np.repeat(to_rc, n_tracks), np.bool_
+                )
+                result = result.reverse_masked(to_rc_expanded, comp=None)
+
+            return cast(RaggedTracks, result)
 
         # ---- splice plan path ----
         assert not isinstance(output_length, int), (
@@ -792,11 +482,20 @@ def _call_float32(
 
         # Per-element flat (caller rewraps with group_offsets via _regroup).
         out_shape = (splice_plan.permuted_lengths.shape[0], None)
-        return cast(
-            RaggedTracks,
-            _Flat.from_offsets(out_buf, out_shape, splice_plan.permuted_out_offsets),
+        result_spliced = _Flat.from_offsets(
+            out_buf, out_shape, splice_plan.permuted_out_offsets
         )
 
+        # Apply per-element reversal in Python (no fused kernel with to_rc for
+        # standalone tracks).  to_rc is already the permuted per-element mask
+        # from _getitem_spliced.
+        if to_rc is not None:
+            result_spliced = result_spliced.reverse_masked(
+                np.ascontiguousarray(to_rc, np.bool_), comp=None
+            )
+
+        return cast(RaggedTracks, result_spliced)
+
     def _call_intervals(
         self, idx: NDArray[np.integer], flat: bool = False
     ) -> RaggedIntervals | FlatIntervals:
@@ -919,3 +618,209 @@ def build_flat_intervals(
         ends=_Flat.from_offsets(data_ends[src], shape, final_offsets),
         values=_Flat.from_offsets(data_values[src], shape, final_offsets),
     )
+
+
+def _xorshift64(x: int) -> int:
+    """Single round of xorshift64 (pure Python). Safe and deterministic."""
+    x = int(x) & 0xFFFFFFFFFFFFFFFF
+    x ^= (x << 13) & 0xFFFFFFFFFFFFFFFF
+    x ^= (x >> 7) & 0xFFFFFFFFFFFFFFFF
+    x ^= (x << 17) & 0xFFFFFFFFFFFFFFFF
+    return x & 0xFFFFFFFFFFFFFFFF
+
+
+def _hash4(a: int, b: int, c: int, d: int) -> int:
+    """Hash four uint64 values into one (pure Python fallback)."""
+    h = int(a) & 0xFFFFFFFFFFFFFFFF
+    h = _xorshift64(h ^ (int(b) & 0xFFFFFFFFFFFFFFFF))
+    h = _xorshift64(h ^ (int(c) & 0xFFFFFFFFFFFFFFFF))
+    h = _xorshift64(h ^ (int(d) & 0xFFFFFFFFFFFFFFFF))
+    return h
+
+
+def _apply_insertion_fill(
+    out,
+    out_idx: int,
+    writable_length: int,
+    v_len: int,
+    track,
+    v_rel_pos: int,
+    strategy_id: int,
+    params,
+    base_seed: int = 0,
+    query: int = 0,
+    hap: int = 0,
+):
+    """Write writable_length values at out[out_idx:] according to insertion-fill strategy.
+
+    Pure Python fallback (no numba). Used by shift_and_realign_track_sparse.
+    """
+    import numpy as np
+
+    track_len = len(track)
+
+    if strategy_id == _REPEAT_5P:
+        out[out_idx : out_idx + writable_length] = track[v_rel_pos]
+
+    elif strategy_id == _REPEAT_5P_NORM:
+        out[out_idx : out_idx + writable_length] = track[v_rel_pos] / v_len
+
+    elif strategy_id == _CONSTANT:
+        out[out_idx : out_idx + writable_length] = params[0]
+
+    elif strategy_id == _FLANK_SAMPLE:
+        width = int(params[0])
+        pool_lo = max(0, v_rel_pos - width)
+        pool_hi = min(track_len - 1, v_rel_pos + width)
+        pool_size = pool_hi - pool_lo + 1
+        for i in range(writable_length):
+            seed = _hash4(base_seed, query, hap, out_idx + i)
+            offset = seed % pool_size
+            out[out_idx + i] = track[pool_lo + offset]
+
+    elif strategy_id == _INTERPOLATE:
+        order = int(params[0])
+        k = (order + 1 + 1) // 2
+        n_anchors = 2 * k
+        xs = np.empty(n_anchors, dtype=np.float64)
+        ys = np.empty(n_anchors, dtype=np.float64)
+        for j in range(k):
+            ref_idx = max(v_rel_pos - j, 0)
+            xs[j] = -float(j)
+            ys[j] = track[ref_idx]
+        for j in range(k):
+            ref_idx = min(v_rel_pos + 1 + j, track_len - 1)
+            xs[k + j] = float(v_len) + float(j)
+            ys[k + j] = track[ref_idx]
+        for i in range(writable_length):
+            x = float(i)
+            acc = 0.0
+            for a in range(n_anchors):
+                term = float(ys[a])
+                for b in range(n_anchors):
+                    if b == a:
+                        continue
+                    term *= (x - xs[b]) / (xs[a] - xs[b])
+                acc += term
+            out[out_idx + i] = acc
+
+
+def shift_and_realign_track_sparse(
+    offset_idx: int,
+    geno_v_idxs,
+    geno_offsets,
+    v_starts,
+    ilens,
+    shift: int,
+    track,
+    query_start: int,
+    out,
+    params,
+    keep=None,
+    strategy_id: int = 0,
+    base_seed: int = 0,
+    query: int = 0,
+    hap: int = 0,
+):
+    """Shift and realign a single track to correspond to a haplotype.
+
+    Pure Python fallback (no numba). Used directly by parity/unit tests.
+    Use :func:`_shift_and_realign_tracks_sparse_rust_wrapper` for batched Rust path.
+    """
+    if geno_offsets.ndim == 1:
+        o_s, o_e = int(geno_offsets[offset_idx]), int(geno_offsets[offset_idx + 1])
+    else:
+        o_s, o_e = int(geno_offsets[0, offset_idx]), int(geno_offsets[1, offset_idx])
+    _variant_idxs = geno_v_idxs[o_s:o_e]
+    length = len(out)
+    n_variants = len(_variant_idxs)
+
+    if n_variants == 0:
+        out[:] = track[:length]
+        return
+
+    track_idx = 0
+    out_idx = 0
+    shifted = 0
+
+    for v in range(n_variants):
+        if keep is not None and not keep[v]:
+            continue
+
+        variant = int(_variant_idxs[v])
+        v_rel_pos = int(v_starts[variant]) - query_start
+        v_diff = int(ilens[variant])
+        v_rel_end = v_rel_pos - min(0, v_diff) + 1
+
+        if v_diff < 0 and v_rel_pos < 0 and v_rel_end >= 0:
+            track_idx = v_rel_end
+            continue
+
+        if v_rel_pos < track_idx:
+            continue
+
+        v_len = max(0, v_diff) + 1
+
+        if shifted < shift:
+            ref_shift_dist = v_rel_pos - track_idx
+            if shifted + ref_shift_dist + v_len < shift:
+                continue
+            elif shifted + ref_shift_dist >= shift:
+                track_idx += shift - shifted
+                shifted = shift
+            else:
+                allele_start_idx = shift - shifted - ref_shift_dist
+                shifted = shift
+                if allele_start_idx == v_len:
+                    track_idx = v_rel_end
+                    continue
+                track_idx = v_rel_pos
+                v_len -= allele_start_idx
+
+        if v_diff == 0:
+            continue
+
+        track_len = v_rel_pos - track_idx
+        if out_idx + track_len >= length:
+            break
+        out[out_idx : out_idx + track_len] = track[track_idx : track_idx + track_len]
+        out_idx += track_len
+
+        writable_length = min(v_len, length - out_idx)
+        if v_diff > 0 and strategy_id != _REPEAT_5P:
+            _apply_insertion_fill(
+                out=out,
+                out_idx=out_idx,
+                writable_length=writable_length,
+                v_len=v_len,
+                track=track,
+                v_rel_pos=v_rel_pos,
+                strategy_id=strategy_id,
+                params=params,
+                base_seed=base_seed,
+                query=query,
+                hap=hap,
+            )
+        else:
+            for i in range(writable_length):
+                out[out_idx + i] = track[v_rel_pos]
+        out_idx += writable_length
+        track_idx = v_rel_end
+
+        if out_idx >= length:
+            break
+
+    if shifted < shift:
+        track_idx += shift - shifted
+        track_idx = min(track_idx, len(track))
+        shifted = shift
+
+    unfilled_length = length - out_idx
+    if unfilled_length > 0:
+        writable_ref = max(0, min(unfilled_length, len(track) - track_idx))
+        out_end_idx = out_idx + writable_ref
+        ref_end_idx = track_idx + writable_ref
+        out[out_idx:out_end_idx] = track[track_idx:ref_end_idx]
+
+        if out_end_idx < length:
+            out[out_end_idx:] = 0
diff --git a/python/genvarloader/_dataset/_utils.py b/python/genvarloader/_dataset/_utils.py
index 5b2b607b..8913c539 100644
--- a/python/genvarloader/_dataset/_utils.py
+++ b/python/genvarloader/_dataset/_utils.py
@@ -1,6 +1,5 @@
 from collections.abc import Sequence
 
-import numba as nb
 import numpy as np
 import polars as pl
 from genoray._utils import ContigNormalizer
@@ -11,41 +10,27 @@
 __all__ = []
 
 
-@nb.njit(nogil=True, cache=True)
-def padded_slice(
-    arr: NDArray[DTYPE],
-    start: int,
-    stop: int,
-    pad_val: int,
-    out: NDArray[DTYPE],
-) -> NDArray[DTYPE]:
-    if start >= stop:
-        return out
-    elif stop < 0:
-        out[:] = pad_val
-        return out
+def _ffi_array(arr: np.ndarray, dtype, name: str) -> np.ndarray:
+    """Assert a per-sample-scale FFI argument crosses zero-copy.
 
-    pad_left = -min(0, start)
-    pad_right = max(0, stop - len(arr))
-
-    if pad_left == 0 and pad_right == 0:
-        out[:] = arr[start:stop]
-        return out
-
-    if pad_left > 0 and pad_right > 0:
-        out_stop = len(out) - pad_right
-        out[:pad_left] = pad_val
-        out[pad_left:out_stop] = arr[:]
-        out[out_stop:] = pad_val
-    elif pad_left > 0:
-        out[:pad_left] = pad_val
-        out[pad_left:] = arr[:stop]
-    elif pad_right > 0:
-        out_stop = len(out) - pad_right
-        out[:out_stop] = arr[start:]
-        out[out_stop:] = pad_val
-
-    return out
+    Returns ``arr`` unchanged iff it is C-contiguous with exactly ``dtype``;
+    otherwise raises a precise ``ValueError`` naming ``name``. This replaces a
+    silent ``np.ascontiguousarray`` that would copy the whole per-sample-scale
+    memmap (GB-scale at the >1M-sample design target). Use it ONLY for
+    sample-scale memmap args; batch-bounded arrays may keep coercing.
+    """
+    dt = np.dtype(dtype)
+    if not arr.flags["C_CONTIGUOUS"]:
+        raise ValueError(
+            f"FFI argument {name!r} must be C-contiguous to cross zero-copy; got "
+            f"a non-contiguous array (coercing would force a sample-scale copy)."
+        )
+    if arr.dtype != dt:
+        raise ValueError(
+            f"FFI argument {name!r} must have dtype {dt}; got {arr.dtype} "
+            f"(coercing would force a sample-scale cast/copy)."
+        )
+    return arr
 
 
 def oidx_to_raveled_idx(row_idx: ArrayLike, col_idx: ArrayLike, shape: tuple[int, int]):
@@ -123,7 +108,7 @@ def bed_to_regions(
         # versions where it doesn't, the strand column survives the
         # ``select(...)`` call as Categorical, and ``to_numpy()`` on a frame
         # mixing ``Int32`` + ``Categorical`` collapses to ``dtype=object``,
-        # which downstream numba kernels reject with
+        # which downstream kernels reject with
         # ``non-precise type array(pyobject)``. Casting to Utf8 first keeps
         # the strand column numeric and the regions array stays ``int32``.
         cols.append(
@@ -139,40 +124,6 @@ def bed_to_regions(
     return bed.select(cols).to_numpy()
 
 
-@nb.njit(nogil=True, cache=True)
-def splits_sum_le_value(arr: NDArray[np.number], max_value: float) -> NDArray[np.intp]:
-    """Get index offsets for groups that sum to no more than a value.
-    Note that values greater than the maximum will be kept in their own group.
-
-    Parameters
-    ----------
-    arr : NDArray[np.number]
-        Array to split.
-    max_value : float
-        Maximum value.
-
-    Returns
-    -------
-    NDArray[np.intp]
-        Split indices.
-
-    Examples
-    --------
-    >>> splits_sum_le_value(np.array([5, 5, 11, 9, 2, 7]), 10)
-    # (5 5) (11) (9) (2 7)
-    array([0, 2, 3, 4, 6])
-    """
-    indices = [0]
-    current_sum = 0
-    for idx, value in enumerate(arr):
-        current_sum += value
-        if current_sum > max_value:
-            indices.append(idx)
-            current_sum = value
-    indices.append(len(arr))
-    return np.array(indices, np.intp)
-
-
 def reduceat_offsets(
     ufunc: np.ufunc, arr: NDArray[DTYPE], offsets: NDArray[np.integer], axis: int = 0
 ) -> NDArray[DTYPE]:
@@ -216,3 +167,40 @@ def reduceat_offsets(
     identity_indices = tuple(identity_indices)
     out_arr[identity_indices] = ufunc.identity
     return out_arr.swapaxes(axis, -1)
+
+
+def padded_slice(
+    arr,
+    start: int,
+    stop: int,
+    pad_val: int,
+    out,
+):
+    """Slice arr into out with padding on left/right if start<0 or stop>len(arr)."""
+    if start >= stop:
+        return out
+    elif stop < 0:
+        out[:] = pad_val
+        return out
+
+    pad_left = -min(0, start)
+    pad_right = max(0, stop - len(arr))
+
+    if pad_left == 0 and pad_right == 0:
+        out[:] = arr[start:stop]
+        return out
+
+    if pad_left > 0 and pad_right > 0:
+        out_stop = len(out) - pad_right
+        out[:pad_left] = pad_val
+        out[pad_left:out_stop] = arr[:]
+        out[out_stop:] = pad_val
+    elif pad_left > 0:
+        out[:pad_left] = pad_val
+        out[pad_left:] = arr[:stop]
+    elif pad_right > 0:
+        out_stop = len(out) - pad_right
+        out[:out_stop] = arr[start:]
+        out[out_stop:] = pad_val
+
+    return out
diff --git a/python/genvarloader/_dataset/_write.py b/python/genvarloader/_dataset/_write.py
index 405d1bb1..f3587430 100644
--- a/python/genvarloader/_dataset/_write.py
+++ b/python/genvarloader/_dataset/_write.py
@@ -34,18 +34,39 @@
 from tqdm.auto import tqdm
 
 from .._atomic import atomic_dir
-from .._ragged import INTERVAL_DTYPE
+from .._ragged import INTERVAL_DTYPE  # noqa: F401  # Task 3 migration reader imports this
 from .._utils import lengths_to_offsets, normalize_contig_name
 from .._variants._utils import path_is_pgen, path_is_vcf
 from ._svar_link import SvarLink
-from ._utils import bed_to_regions, regions_to_bed, splits_sum_le_value
+from ._utils import bed_to_regions, regions_to_bed
 
 
-DATASET_FORMAT_VERSION = SemanticVersion.parse("1.0.0")
+DATASET_FORMAT_VERSION = SemanticVersion.parse("2.0.0")
 """On-disk layout version for a gvl.write dataset directory. Bump MAJOR only when
 an existing dataset can no longer be read correctly by new code."""
 
 
+def _check_dataset_format_version(meta: "Metadata", path: Path) -> None:
+    """Validate a dataset's on-disk format version against the supported major.
+
+    Pre-versioning datasets (``format_version is None``) and any older major are
+    treated as needing migration. A newer major means the reader is too old.
+    """
+    fv = meta.format_version
+    current = DATASET_FORMAT_VERSION
+    if fv is None or fv.major < current.major:
+        raise ValueError(
+            f"Dataset at {path} uses format version {fv} but this genvarloader "
+            f"expects {current}. Run `genvarloader.migrate({str(path)!r})` to "
+            f"upgrade it in place."
+        )
+    if fv.major > current.major:
+        raise ValueError(
+            f"Dataset at {path} was written by a newer genvarloader (format "
+            f"version {fv} > supported {current}). Upgrade genvarloader."
+        )
+
+
 def _run_jobs(jobs: "list[Callable[[int], None]]", max_mem: int) -> None:
     """Run track/annot writer jobs, each called with a per-job max_mem budget.
 
@@ -1084,18 +1105,17 @@ def _write_phased_variants_chunk(
 
 def _write_ragged_intervals(out_dir: Path, itvs: "RaggedIntervals") -> None:
     """Write a RaggedIntervals (values/starts/ends share offsets) to out_dir as
-    intervals.npy + offsets.npy. Single-chunk writer used for annotation tracks."""
+    struct-of-arrays: starts/ends/values.npy + offsets.npy. Single-chunk writer
+    used for annotation tracks (format 2.0)."""
     out_dir.mkdir(parents=True, exist_ok=True)
-    out = np.memmap(
-        out_dir / "intervals.npy",
-        dtype=INTERVAL_DTYPE,
-        mode="w+",
-        shape=itvs.values.data.shape,
-    )
-    out["start"] = itvs.starts.data
-    out["end"] = itvs.ends.data
-    out["value"] = itvs.values.data
-    out.flush()
+    for name, data, dt in (
+        ("starts", itvs.starts.data, np.int32),
+        ("ends", itvs.ends.data, np.int32),
+        ("values", itvs.values.data, np.float32),
+    ):
+        out = np.memmap(out_dir / f"{name}.npy", dtype=dt, mode="w+", shape=data.shape)
+        out[:] = data
+        out.flush()
 
     offsets = itvs.values.offsets
     out = np.memmap(
@@ -1231,135 +1251,6 @@ def _write_annot_track(
     _write_ragged_intervals(out_dir, itvs)
 
 
-def _write_track_legacy(
-    out_dir: Path,
-    bed: pl.DataFrame,
-    track: "IntervalTrack",
-    samples: list[str] | None,
-    max_mem: int,
-):
-    if samples is None:
-        _samples = track.samples
-    else:
-        if missing := (set(samples) - set(track.samples)):
-            raise ValueError(f"Samples {missing} not found in track.")
-        _samples = samples
-
-    MEM_PER_INTERVAL = (
-        12 * 2
-    )  # start u32, end u32, value f32, times 2 for intermediate copies
-    chunk_labels = np.empty(bed.height, np.uint32)
-    chunk_offsets: dict[int, NDArray[np.int64]] = {}
-    n_chunks = 0
-    last_chunk_offset = 0
-    pbar = tqdm(total=bed["chrom"].n_unique())
-    for (contig,), part in bed.partition_by(
-        "chrom", as_dict=True, include_key=False, maintain_order=True
-    ).items():
-        pbar.set_description(f"Calculating memory usage for {part.height} regions")
-        contig = cast(str, contig)
-        _contig = normalize_contig_name(contig, track.contigs)
-        if _contig is not None:
-            starts = part["chromStart"].to_numpy()
-            ends = part["chromEnd"].to_numpy()
-
-            # (regions, samples)
-            n_per_query = track.count_intervals(contig, starts, ends, sample=_samples)
-            # (regions)
-            mem_per_r = n_per_query.sum(1) * MEM_PER_INTERVAL
-
-            if np.any(mem_per_r > max_mem):
-                # TODO subset by samples as well if needed
-                raise NotImplementedError(
-                    f"""Memory usage per region exceeds maximum of {max_mem / 1e9} GB.
-                    Largest amount needed for a single region is {mem_per_r.max() / 1e9} GB, set
-                    `max_mem` to this value or higher. Otherwise, chunking by region and sample is
-                    not yet implemented."""
-                )
-
-            split_offsets = splits_sum_le_value(mem_per_r, max_mem)
-            split_lengths = np.diff(split_offsets)
-            for i in range(len(split_lengths)):
-                o_s, o_e = split_offsets[i], split_offsets[i + 1]
-                chunk_idx = n_chunks + i
-                chunk_offsets[chunk_idx] = lengths_to_offsets(
-                    n_per_query[o_s:o_e].ravel()
-                )
-            first_chunk_idx = n_chunks
-            last_chunk_idx = n_chunks + len(split_lengths)
-            _chunk_labels = np.arange(
-                first_chunk_idx, last_chunk_idx, dtype=np.uint32
-            ).repeat(split_lengths)
-            chunk_labels[last_chunk_offset : last_chunk_offset + len(_chunk_labels)] = (
-                _chunk_labels
-            )
-            n_chunks += len(split_lengths)
-            last_chunk_offset += len(_chunk_labels)
-        pbar.update()
-    pbar.close()
-    bed = bed.with_columns(chunk=pl.lit(chunk_labels))
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    interval_offset = 0
-    offset_offset = 0
-    last_offset = 0
-    pbar = tqdm(total=bed["chunk"].n_unique())
-    for (chunk_idx,), part in bed.partition_by(
-        "chunk", as_dict=True, include_key=False, maintain_order=True
-    ).items():
-        chunk_idx = cast(int, chunk_idx)
-        contig = cast(str, part[0, "chrom"])
-        pbar.set_description(f"Reading intervals for {part.height} regions on {contig}")
-        starts = part["chromStart"].to_numpy()
-        ends = part["chromEnd"].to_numpy()
-        _offsets = chunk_offsets[chunk_idx]
-
-        intervals = track._intervals_from_offsets(
-            contig, starts, ends, _offsets, sample=_samples
-        )
-
-        pbar.set_description(f"Writing intervals for {part.height} regions on {contig}")
-        out = np.memmap(
-            out_dir / "intervals.npy",
-            dtype=INTERVAL_DTYPE,
-            mode="w+" if interval_offset == 0 else "r+",
-            shape=intervals.values.data.shape,
-            offset=interval_offset,
-        )
-        out["start"] = intervals.starts.data
-        out["end"] = intervals.ends.data
-        out["value"] = intervals.values.data
-        out.flush()
-        interval_offset += out.nbytes
-
-        offsets = intervals.values.offsets
-        offsets += last_offset
-        last_offset = offsets[-1]
-        out = np.memmap(
-            out_dir / "offsets.npy",
-            dtype=offsets.dtype,
-            mode="w+" if offset_offset == 0 else "r+",
-            shape=len(offsets) - 1,
-            offset=offset_offset,
-        )
-        out[:] = offsets[:-1]
-        out.flush()
-        offset_offset += out.nbytes
-        pbar.update()
-    pbar.close()
-
-    out = np.memmap(
-        out_dir / "offsets.npy",
-        dtype=offsets.dtype,
-        mode="r+",
-        shape=1,
-        offset=offset_offset,
-    )
-    out[-1] = offsets[-1]
-    out.flush()
-
-
 def _write_track_rust(
     out_dir: Path,
     bed: pl.DataFrame,
@@ -1440,4 +1331,7 @@ def _write_track(
         if missing := (set(_samples) - set(track.samples)):
             raise ValueError(f"Samples {missing} not found in track.")
         return _write_track_table(out_dir, bed, track, _samples, max_mem)
-    return _write_track_legacy(out_dir, bed, track, samples, max_mem)
+    raise TypeError(
+        f"Unsupported track type {type(track).__name__!r}; "
+        "tracks must be a genvarloader.BigWigs or genvarloader.Table."
+    )
diff --git a/python/genvarloader/_dispatch.py b/python/genvarloader/_dispatch.py
deleted file mode 100644
index d8a4487a..00000000
--- a/python/genvarloader/_dispatch.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""Backend dispatch registry for the Rust migration strangler window.
-
-Each migratable Python-entry kernel registers a numba and a rust implementation.
-Production code calls ``get(name)(...)``; ``GVL_BACKEND=numba|rust`` force-overrides
-all kernels (used by CI parity sweeps). Deleted wholesale in migration Phase 5.
-"""
-
-from __future__ import annotations
-
-import os
-from collections.abc import Callable
-from typing import Literal
-
-_Backend = Literal["numba", "rust"]
-_REGISTRY: dict[str, dict[str, object]] = {}
-
-
-def register(
-    name: str,
-    *,
-    numba: Callable,
-    rust: Callable,
-    default: _Backend = "numba",
-) -> None:
-    if default not in ("numba", "rust"):
-        raise ValueError(f"default must be 'numba' or 'rust', got {default!r}")
-    _REGISTRY[name] = {"numba": numba, "rust": rust, "default": default}
-
-
-def _entry(name: str) -> dict[str, object]:
-    try:
-        return _REGISTRY[name]
-    except KeyError:
-        raise KeyError(
-            f"no kernel registered as {name!r}; registered: {registered_names()}"
-        ) from None
-
-
-def get(name: str) -> Callable:
-    entry = _entry(name)
-    backend = os.environ.get("GVL_BACKEND")
-    if backend is None:
-        backend = entry["default"]  # type: ignore[assignment]
-    elif backend not in ("numba", "rust"):
-        raise ValueError(f"GVL_BACKEND must be 'numba' or 'rust', got {backend!r}")
-    return entry[backend]  # type: ignore[return-value]
-
-
-def backends(name: str) -> tuple[Callable, Callable]:
-    entry = _entry(name)
-    return entry["numba"], entry["rust"]  # type: ignore[return-value]
-
-
-def registered_names() -> list[str]:
-    return sorted(_REGISTRY)
diff --git a/python/genvarloader/_flat.py b/python/genvarloader/_flat.py
index 2e561ced..79683351 100644
--- a/python/genvarloader/_flat.py
+++ b/python/genvarloader/_flat.py
@@ -11,7 +11,6 @@
 from dataclasses import dataclass
 from typing import Any, Generic
 
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 from seqpro.rag import RDTYPE_co as RDTYPE
@@ -19,19 +18,12 @@
 from seqpro.rag import to_padded as _sp_to_padded
 
 
-@nb.njit(parallel=True, cache=True)
-def _reverse_rows_masked(data, offsets, mask):  # pragma: no cover - njit
+def _reverse_rows_masked(data, offsets, mask):
     n = mask.shape[0]
-    for i in nb.prange(n):
+    for i in range(n):
         if mask[i]:
-            lo = offsets[i]
-            hi = offsets[i + 1] - 1
-            while lo < hi:
-                tmp = data[lo]
-                data[lo] = data[hi]
-                data[hi] = tmp
-                lo += 1
-                hi -= 1
+            s, e = int(offsets[i]), int(offsets[i + 1])
+            data[s:e] = data[s:e][::-1]
 
 
 @dataclass(slots=True, frozen=True)
diff --git a/python/genvarloader/_ragged.py b/python/genvarloader/_ragged.py
index 0644ff12..10fcdd66 100644
--- a/python/genvarloader/_ragged.py
+++ b/python/genvarloader/_ragged.py
@@ -4,7 +4,6 @@
 from functools import partial
 from typing import TYPE_CHECKING, Any, TypedDict, cast
 
-import numba as nb
 import numpy as np
 from numpy.typing import NDArray
 from phantom import Phantom
@@ -330,7 +329,6 @@ def to_padded(rag: Ragged[RDTYPE], pad_value: Any) -> NDArray[RDTYPE]:
 _COMP = np.frombuffer(bytes.maketrans(b"ACGT", b"TGCA"), np.uint8)
 
 
-@nb.vectorize(["u1(u1)"], nopython=True)
 def ufunc_comp_dna(seq: NDArray[np.uint8]) -> NDArray[np.uint8]:
     return _COMP[seq]
 
diff --git a/python/genvarloader/_threads.py b/python/genvarloader/_threads.py
index 13a9cc3d..48d255d9 100644
--- a/python/genvarloader/_threads.py
+++ b/python/genvarloader/_threads.py
@@ -1,47 +1,53 @@
-"""Cgroup-aware numba thread cap + a per-thread dispatch predicate.
+"""Cgroup-aware thread-count resolver + rayon pool initializer.
 
-numba.get_num_threads() reports host logical CPUs, not the cgroup allocation
-(e.g. 208 reported vs. 52 allocated). Forking the misdetected count makes
-parallel=True regions pay a flat ~37 ms fork-join for trivial work. We cap the
-worker count down to the real allocation once at import, and route copy kernels
-to a serial variant unless there is enough work to amortize the fork-join.
+Resolves the effective worker count from GVL_NUM_THREADS or the
+cgroup cpuset (Linux sched_getaffinity). Seeds RAYON_NUM_THREADS so
+rayon's global pool picks it up on first use. Must run before the
+first rust parallel call (rayon reads the env var at global-pool init
+time). Idempotent.
 """
 
 from __future__ import annotations
 
 import os
 
-import numba
-
-# Parallel only pays off when each worker gets at least this many bytes to copy.
-# Below `num_threads * _MIN_BYTES_PER_THREAD` total, the serial kernel wins.
 _MIN_BYTES_PER_THREAD = 1 << 20  # 1 MiB
+_NUM_THREADS: int | None = None
+
+
+def _detect_cpus() -> int:
+    try:
+        return max(1, len(os.sched_getaffinity(0)))  # respects cgroup cpuset (Linux)
+    except AttributeError:
+        return max(1, os.cpu_count() or 1)
 
 
 def _resolve_num_threads() -> int:
-    hard_max = numba.get_num_threads()
     env = os.environ.get("GVL_NUM_THREADS")
     if env:
         try:
-            return max(1, min(int(env), hard_max))
+            return max(1, int(env))
         except ValueError:
-            # A malformed override (e.g. "auto") must not break `import
-            # genvarloader`; fall through to cgroup detection instead.
             pass
-    try:
-        real = len(os.sched_getaffinity(0))  # respects cgroup cpuset (Linux)
-    except AttributeError:
-        real = os.cpu_count() or 1  # non-Linux fallback
-    return max(1, min(real, hard_max))
+    return _detect_cpus()
+
+
+def cap_threads() -> int:
+    """Resolve worker count once and pin rayon's pool via RAYON_NUM_THREADS.
+
+    Must run before the first rust parallel call (rayon reads RAYON_NUM_THREADS
+    at global-pool init). Idempotent.
+    """
+    global _NUM_THREADS
+    if _NUM_THREADS is None:
+        _NUM_THREADS = _resolve_num_threads()
+        os.environ.setdefault("RAYON_NUM_THREADS", str(_NUM_THREADS))
+    return _NUM_THREADS
 
 
-def cap_numba_threads() -> int:
-    """Cap numba's parallel worker count to the resolved value. Idempotent."""
-    n = _resolve_num_threads()
-    numba.set_num_threads(n)
-    return n
+def num_threads() -> int:
+    return cap_threads()
 
 
 def should_parallelize(total_bytes: int) -> bool:
-    """True iff a copy of `total_bytes` is large enough to justify fork-join."""
-    return total_bytes >= numba.get_num_threads() * _MIN_BYTES_PER_THREAD
+    return total_bytes >= num_threads() * _MIN_BYTES_PER_THREAD
diff --git a/python/genvarloader/_variants/_sitesonly.py b/python/genvarloader/_variants/_sitesonly.py
index df95f6dc..9803b9f3 100644
--- a/python/genvarloader/_variants/_sitesonly.py
+++ b/python/genvarloader/_variants/_sitesonly.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 from typing import Generic, overload
 
-import numba as nb
 import numpy as np
 import pandera.polars as pa
 import polars as pl
@@ -285,7 +284,6 @@ def __getitem__(
 
 
 # * fixed length, SNPs only
-@nb.njit(parallel=True, nogil=True, cache=True)
 def apply_site_only_variants(
     haps: NDArray[np.uint8],  # (b p ~l)
     v_idxs: NDArray[np.int32],  # (b p ~l)
@@ -297,8 +295,8 @@ def apply_site_only_variants(
     batch_size, ploidy, _ = haps.shape
     flags = np.empty((batch_size, ploidy), dtype=np.uint8)
 
-    for b in nb.prange(batch_size):
-        for p in nb.prange(ploidy):
+    for b in range(batch_size):
+        for p in range(ploidy):
             bp_hap = haps[b, p]
             bp_idx = v_idxs[b, p]
             bp_ref_coord = ref_coords[b, p]
diff --git a/python/genvarloader/genvarloader.pyi b/python/genvarloader/genvarloader.pyi
index 2d7a1ce1..4ec8f5e6 100644
--- a/python/genvarloader/genvarloader.pyi
+++ b/python/genvarloader/genvarloader.pyi
@@ -71,11 +71,13 @@ def intervals_to_tracks(
     itv_offsets: NDArray[np.int64],
     out: NDArray[np.float32],
     out_offsets: NDArray[np.int64],
+    parallel: bool,
 ) -> None:
     """Paint base-pair-resolution tracks from intervals, writing ``out`` in place.
 
     Rust backend for the dispatched ``intervals_to_tracks`` kernel (byte-identical
     to the numba reference in ``_dataset/_intervals.py``). Zeros ``out`` then, per
     query, copies each interval's value into its base-pair slice. Assumes intervals
-    are sorted by start, non-overlapping, and start at >= the query start.
+    are sorted by start and non-overlapping; interval starts before the query start
+    are clipped to the query window (per #242).
     """
diff --git a/skills/genvarloader/SKILL.md b/skills/genvarloader/SKILL.md
index 78c1cb85..b04835a8 100644
--- a/skills/genvarloader/SKILL.md
+++ b/skills/genvarloader/SKILL.md
@@ -163,7 +163,9 @@ Scalar fields (`start`/`ilen`/`dosage`/`info[...]`) are still filled from `Dummy
 
 **`with_settings(unphased_union=...)`** — fold the stored diploid haplotypes onto a single haploid sequence: the union of called ALTs per `(region, sample)`. When `True`, `ds.ploidy` reports `1` (instead of the stored `2`); `n_variants(...)` reports a single ploidy slot (shape `(..., 1)`), with counts equal to the naive per-haplotype sum (a hom call appears twice — once per haplotype — with no dedup). `"variants"` and `"variant-windows"` output decode at ploidy `1`; ALT occurrences are concatenated across haplotypes with no sort and no dedup. Phase is discarded — intended for haploid somatic modeling of unphased somatic calls. Requires a dataset with genotypes (raises `ValueError` on reference-only datasets). Incompatible with `"haplotypes"` / `"annotated"` output — `with_seqs("haplotypes")` or `with_seqs("annotated")` (or setting this flag while one of those is the active output kind) raises `ValueError`. See issue #222.
 
-**Format validation:** `Dataset.open` validates the dataset's `format_version` and structural integrity (file presence + sizes). An incompatible or corrupt dataset raises a `ValueError` instructing regeneration with `gvl.write`. Datasets do **not** auto-rebuild.
+**Format validation:** `Dataset.open` validates the dataset's `format_version` and structural integrity (file presence + sizes). A corrupt dataset raises a `ValueError` instructing regeneration with `gvl.write`. Datasets do **not** auto-rebuild.
+
+**Format version gate (2.0):** the current on-disk format is **2.0.0**. Opening a dataset written by genvarloader **< 2.0** (or any unversioned dataset) raises a `ValueError` whose message points at `gvl.migrate(path)`; a dataset written by a *newer* major raises a `ValueError` telling you to upgrade genvarloader. Run `gvl.migrate(path)` **once** to upgrade a pre-2.0 dataset in place — it is streaming (peak extra disk is one track's interval store), idempotent, and crash-safe (metadata is bumped only after every track's struct-of-arrays files are durable, then the old array-of-structs files are deleted). It converts the track-interval storage only; genotypes, regions, and reference are untouched.
 
 - **`var_fields: list[str] | None`** — Variant fields to include on `RaggedVariants` output. Defaults to the minimum useful set `["alt", "ilen", "start"]`. Pass additional names (e.g. `"ref"`, `"dosage"`, or any numeric info column in the source variants table) to load them eagerly at open time. Must be a subset of `Dataset.available_var_fields`. Can be reconfigured later via `Dataset.with_settings(var_fields=...)`, which lazily loads any newly-requested columns. `"dosage"` must be requested explicitly — it is *not* added automatically even when `dosages.npy` exists on disk. Beyond the built-ins (`alt`, `start`, `ref`, `ilen`, `dosage`) and per-variant INFO columns, a genoray `.svar` may register arbitrary per-call (`Number=G`) FORMAT fields in `<svar>/metadata.json["fields"]`; these appear in `Dataset.available_var_fields` and can be requested via `Dataset.open(..., var_fields=[...])` or `with_settings(var_fields=[...])`. Each surfaces in `variants`, `variant-windows`, and `flat` outputs as a per-call ragged field aligned with the genotypes. A FORMAT field shadows a same-named INFO column.
 
@@ -348,6 +350,7 @@ Footprint is computed exactly via `Dataset._output_bytes_per_instance(...)` (use
 - `gvl.FlatVariantWindows` — returned by `with_seqs("variant-windows", VarWindowOpt(...))` in flat mode. `.fields`: dict of scalar `FlatRagged` (`start`/`ilen`/`dosage`/info; raw byte alleles are dropped). Per-allele token buffers — exactly one of `.ref_window` (flanked ref window, `"window"` mode) or `.ref` (bare ref allele tokens, `"allele"` mode) is set; same for `.alt_window` / `.alt`. Each non-None buffer is a two-level token buffer (internal `_FlatWindow`, not the public `FlatRagged`) of shape `(b, p, ~v, ~len)` with its own `.to_ragged()`. The container's `.shape` delegates to `fields["start"].shape`. Methods: `.to_ragged()` (returns dict of ragged parts), `.reshape(shape)`, `.squeeze(axis)`. Source: `python/genvarloader/_dataset/_flat_variants.py`.
 - `gvl.VarWindowOpt` — frozen config dataclass for `with_seqs("variant-windows", ...)`. Fields: `flank_length` (int), `token_alphabet` (bytes), `unknown_token` (int), `ref` ∈ `{"window","allele"}`, `alt` ∈ `{"window","allele"}`. `ref` and `alt` are chosen independently. `"window"` = flanked + tokenized reference read (ref) or flank·alt·flank assembly (alt); `"allele"` = bare tokenized allele with no flanks. Source: `python/genvarloader/_dataset/_flat_variants.py`.
 - `gvl.DummyVariant` — frozen dataclass used with `with_settings(dummy_variant=...)`. Fields and defaults: `start: int = -1`, `ilen: int = 0`, `dosage: float = 0.0`, `ref: bytes = b"N"`, `alt: bytes = b"N"`, `info: dict = {}`. Unspecified `info` keys default to `0` for integer columns and `NaN` for float columns. Source: `python/genvarloader/_dataset/_flat_variants.py`.
+- `gvl.migrate(path)` — upgrade a pre-2.0 (array-of-structs) dataset to format 2.0 (struct-of-arrays) **in place**. Streaming, idempotent, crash-safe; converts `intervals/<track>/` and `annot_intervals/<track>/` interval storage and bumps `metadata.json`. A no-op (with leftover-AoS cleanup) on an already-2.0 dataset. Source: `python/genvarloader/_dataset/_migrate.py`. (Distinct from `gvl.migrate_svar_link`, which upgrades legacy SVAR symlink layouts.)
 - `gvl.to_nested_tensor(ragged)` — convert to a PyTorch nested tensor (requires `torch`).
 - `gvl.get_dummy_dataset()` — small in-memory dataset for examples/tests.
 - `gvl.RefDataset` — reference-only dataset (no genotypes).
@@ -368,6 +371,8 @@ ds.gvl/
 └── annot_intervals/<track>/   # sample-independent annotation track data
 ```
 
+In **format 2.0**, each `intervals/<track>/` (and `annot_intervals/<track>/`) directory stores its intervals as **struct-of-arrays** — three contiguous files `starts.npy` (int32), `ends.npy` (int32), `values.npy` (float32), sharing one `offsets.npy` (int64) — replacing the format 1.x single `intervals.npy` record array. This lets the contiguous memmaps cross the Python→Rust boundary zero-copy. Upgrade a 1.x dataset with `gvl.migrate(path)` (see the format version gate above).
+
 See `docs/source/format.md` for the full schema, versioning, and SVAR-link details.
 
 ## Where to look next
@@ -386,12 +391,14 @@ See `docs/source/format.md` for the full schema, versioning, and SVAR-link detai
 | Track re-alignment internals          | `python/genvarloader/_dataset/_tracks.py`, `_reconstruct.py` |
 | Insertion fill internals              | `python/genvarloader/_dataset/_insertion_fill.py`      |
 | SVAR back-reference / migration       | `python/genvarloader/_dataset/_svar_link.py`           |
+| Format 1.x → 2.0 migration internals  | `python/genvarloader/_dataset/_migrate.py`             |
 | Flat-buffer ragged containers         | `python/genvarloader/_flat.py`                         |
 | Flat variants + alleles types         | `python/genvarloader/_dataset/_flat_variants.py`       |
 | Flank fetch + tokenization + windows  | `python/genvarloader/_dataset/_flat_flanks.py`         |
 
 ## Common gotchas
 
+- **Pre-2.0 datasets must be migrated once before opening.** `Dataset.open` rejects any dataset written by genvarloader < 2.0 (or unversioned) with a `ValueError` pointing at `gvl.migrate(path)`. Run it once (in place, idempotent, crash-safe). A dataset written by a *newer* major raises a different `ValueError` asking you to upgrade genvarloader. Note `gvl.migrate` (format upgrade) is **not** the same as `gvl.migrate_svar_link` (SVAR symlink-layout upgrade).
 - **`gvl.update` does not hot-reload open datasets.** A `Dataset` instance that was opened before `gvl.update` ran will not see the new track; reopen the dataset to pick it up. The update itself is safe to run while readers are active — each track is published atomically so a reader never sees a half-written track.
 - **`Dataset.write_annot_tracks` has been removed.** Use `gvl.update(dataset, annot_tracks={"name": source})` instead, or pass `annot_tracks=` to `gvl.write` at creation time.
 - **`gvl.Table` is a core public API.** No extra install required. It uses a Rust COITrees overlap engine and is CI-covered. Import it as `gvl.Table` (re-exported from the top-level package).
diff --git a/src/bigwig.rs b/src/bigwig.rs
index 68de99ae..e619630a 100644
--- a/src/bigwig.rs
+++ b/src/bigwig.rs
@@ -37,7 +37,9 @@ pub fn write_track(
     let starts = starts.as_slice().expect("starts contiguous");
     let ends = ends.as_slice().expect("ends contiguous");
 
-    let mut itv_writer = BufWriter::new(File::create(out_dir.join("intervals.npy"))?);
+    let mut starts_writer = BufWriter::new(File::create(out_dir.join("starts.npy"))?);
+    let mut ends_writer = BufWriter::new(File::create(out_dir.join("ends.npy"))?);
+    let mut values_writer = BufWriter::new(File::create(out_dir.join("values.npy"))?);
     // offsets accumulated in memory; region-major, sample-minor; final total appended.
     let mut offsets: Vec<i64> = Vec::with_capacity(n_regions * n_samples + 1);
     offsets.push(0);
@@ -105,9 +107,9 @@ pub fn write_track(
             let per_sample = region?;
             for sample_vals in per_sample {
                 for v in sample_vals {
-                    itv_writer.write_all(&(v.start as i32).to_le_bytes())?;
-                    itv_writer.write_all(&(v.end as i32).to_le_bytes())?;
-                    itv_writer.write_all(&v.value.to_le_bytes())?;
+                    starts_writer.write_all(&(v.start as i32).to_le_bytes())?;
+                    ends_writer.write_all(&(v.end as i32).to_le_bytes())?;
+                    values_writer.write_all(&v.value.to_le_bytes())?;
                     acc += 1;
                 }
                 offsets.push(acc);
@@ -115,7 +117,9 @@ pub fn write_track(
         }
         batch_start = batch_end;
     }
-    itv_writer.flush()?;
+    starts_writer.flush()?;
+    ends_writer.flush()?;
+    values_writer.flush()?;
 
     let mut off_writer = BufWriter::new(File::create(out_dir.join("offsets.npy"))?);
     for o in &offsets {
@@ -316,15 +320,18 @@ mod tests {
         }
         .unwrap();
 
-        // Expected intervals.npy bytes: [i32 start, i32 end, f32 value] per row.
-        let mut expected = Vec::new();
+        // Expected SoA bytes: separate i32 starts, i32 ends, f32 values.
+        let mut exp_starts = Vec::new();
+        let mut exp_ends = Vec::new();
+        let mut exp_values = Vec::new();
         for i in 0..vals.len() {
-            expected.extend_from_slice(&(coords[[i, 0]] as i32).to_le_bytes());
-            expected.extend_from_slice(&(coords[[i, 1]] as i32).to_le_bytes());
-            expected.extend_from_slice(&vals[i].to_le_bytes());
+            exp_starts.extend_from_slice(&(coords[[i, 0]] as i32).to_le_bytes());
+            exp_ends.extend_from_slice(&(coords[[i, 1]] as i32).to_le_bytes());
+            exp_values.extend_from_slice(&vals[i].to_le_bytes());
         }
-        let got = fs::read(tmp.join("intervals.npy")).unwrap();
-        assert_eq!(got, expected, "intervals.npy bytes mismatch");
+        assert_eq!(fs::read(tmp.join("starts.npy")).unwrap(), exp_starts, "starts mismatch");
+        assert_eq!(fs::read(tmp.join("ends.npy")).unwrap(), exp_ends, "ends mismatch");
+        assert_eq!(fs::read(tmp.join("values.npy")).unwrap(), exp_values, "values mismatch");
 
         // Expected offsets.npy bytes: i64 little-endian, full offsets vec.
         let mut expected_off = Vec::new();
diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs
index 2d4f2255..b1ca34fd 100644
--- a/src/ffi/mod.rs
+++ b/src/ffi/mod.rs
@@ -1,8 +1,69 @@
 //! PyO3 boundary for migrated core kernels. The ONLY place new kernels touch Python.
-use numpy::{PyReadonlyArray1, PyReadwriteArray1};
+use ndarray::Array1;
+use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, PyReadwriteArray1};
 use pyo3::prelude::*;
+use pyo3::types::PyDict;
 
+use crate::variants::windows::{assemble_variants_mode, assemble_windows_mode, VariantBufs};
+
+use crate::genotypes;
 use crate::intervals;
+use crate::reference;
+use crate::variants;
+
+/// Allocate an output buffer of `len` elements WITHOUT zero-initialization.
+///
+/// SAFETY/INVARIANT: every element is fully overwritten by the reconstruct/track
+/// core before it is read. For in-contract inputs the core writes every output
+/// position; out-of-contract inputs (e.g. a deletion driving `ref_idx` past the
+/// contig end) are already undefined and excluded from the parity oracle by the
+/// overshoot/double-init guards in
+/// tests/parity/test_reconstruct_haplotypes_parity.py, so skipping the zero-init
+/// adds no new observable exposure. `T` is a plain numeric type (u8/i32/f32) with
+/// no invalid bit patterns.
+#[allow(clippy::uninit_vec)]
+fn uninit_output<T: Copy>(len: usize) -> Array1<T> {
+    let mut v: Vec<T> = Vec::with_capacity(len);
+    // SAFETY: see function-level invariant — every element is written before read.
+    unsafe {
+        v.set_len(len);
+    }
+    Array1::from_vec(v)
+}
+
+/// Per-(query, hap) reference-length diffs (see `genotypes::get_diffs_sparse`).
+/// `geno_offsets` is the normalized (2, n) int64 starts/stops array.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn get_diffs_sparse<'py>(
+    py: Python<'py>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    ilens: PyReadonlyArray1<i32>,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    q_starts: Option<PyReadonlyArray1<i32>>,
+    q_ends: Option<PyReadonlyArray1<i32>>,
+    v_starts: Option<PyReadonlyArray1<i32>>,
+    parallel: bool,
+) -> Bound<'py, PyArray2<i32>> {
+    let go = geno_offsets.as_array();
+    let diffs = genotypes::get_diffs_sparse(
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go.row(0),
+        go.row(1),
+        ilens.as_array(),
+        keep.as_ref().map(|a| a.as_array()),
+        keep_offsets.as_ref().map(|a| a.as_array()),
+        q_starts.as_ref().map(|a| a.as_array()),
+        q_ends.as_ref().map(|a| a.as_array()),
+        v_starts.as_ref().map(|a| a.as_array()),
+        parallel,
+    );
+    diffs.into_pyarray(py)
+}
 
 /// Paint base-pair-resolution tracks from intervals (writes `out` in place).
 #[pyfunction]
@@ -16,6 +77,7 @@ pub fn intervals_to_tracks(
     itv_offsets: PyReadonlyArray1<i64>,
     mut out: PyReadwriteArray1<f32>,
     out_offsets: PyReadonlyArray1<i64>,
+    parallel: bool,
 ) {
     intervals::intervals_to_tracks(
         offset_idxs.as_array(),
@@ -26,5 +88,1276 @@ pub fn intervals_to_tracks(
         itv_offsets.as_array(),
         out.as_array_mut(),
         out_offsets.as_array(),
+        parallel,
+    );
+}
+
+/// Exonic keep-mask (see `genotypes::choose_exonic_variants`). Returns
+/// `(keep: bool[n], keep_offsets: i64[n_groups+1])`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn choose_exonic_variants<'py>(
+    py: Python<'py>,
+    starts: PyReadonlyArray1<i32>,
+    ends: PyReadonlyArray1<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+) -> (Bound<'py, PyArray1<bool>>, Bound<'py, PyArray1<i64>>) {
+    let go = geno_offsets.as_array();
+    let (keep, koff) = genotypes::choose_exonic_variants(
+        starts.as_array(),
+        ends.as_array(),
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go.row(0),
+        go.row(1),
+        v_starts.as_array(),
+        ilens.as_array(),
+    );
+    (keep.into_pyarray(py), koff.into_pyarray(py))
+}
+
+/// Per-row i32 gather — variant indices (see `variants::gather_rows_i32`).
+#[pyfunction]
+pub fn gather_rows_i32<'py>(
+    py: Python<'py>,
+    geno_offset_idx: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    data: PyReadonlyArray1<i32>,
+) -> (Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>) {
+    let go = geno_offsets.as_array();
+    let (v, off) = variants::gather_rows_i32(
+        geno_offset_idx.as_array(),
+        go.row(0),
+        go.row(1),
+        data.as_array(),
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Per-row f32 gather — dosage values (see `variants::gather_rows_f32`).
+#[pyfunction]
+pub fn gather_rows_f32<'py>(
+    py: Python<'py>,
+    geno_offset_idx: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    data: PyReadonlyArray1<f32>,
+) -> (Bound<'py, PyArray1<f32>>, Bound<'py, PyArray1<i64>>) {
+    let go = geno_offsets.as_array();
+    let (v, off) = variants::gather_rows_f32(
+        geno_offset_idx.as_array(),
+        go.row(0),
+        go.row(1),
+        data.as_array(),
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Gather allele bytestrings (see `variants::gather_alleles`).
+#[pyfunction]
+pub fn gather_alleles<'py>(
+    py: Python<'py>,
+    v_idxs: PyReadonlyArray1<i32>,
+    allele_bytes: PyReadonlyArray1<u8>,
+    allele_offsets: PyReadonlyArray1<i64>,
+) -> (Bound<'py, PyArray1<u8>>, Bound<'py, PyArray1<i64>>) {
+    let (data, seq) = variants::gather_alleles(
+        v_idxs.as_array(),
+        allele_bytes.as_array(),
+        allele_offsets.as_array(),
+    );
+    (data.into_pyarray(py), seq.into_pyarray(py))
+}
+
+/// Compact i32 values under keep mask, rebuilding row offsets
+/// (see `variants::compact_keep_i32`).
+#[pyfunction]
+pub fn compact_keep_i32<'py>(
+    py: Python<'py>,
+    values: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    keep: PyReadonlyArray1<bool>,
+) -> (Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::compact_keep_i32(
+        values.as_array(),
+        row_offsets.as_array(),
+        keep.as_array(),
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Compact f32 values under keep mask, rebuilding row offsets
+/// (see `variants::compact_keep_f32`).
+#[pyfunction]
+pub fn compact_keep_f32<'py>(
+    py: Python<'py>,
+    values: PyReadonlyArray1<f32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    keep: PyReadonlyArray1<bool>,
+) -> (Bound<'py, PyArray1<f32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::compact_keep_f32(
+        values.as_array(),
+        row_offsets.as_array(),
+        keep.as_array(),
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Fill empty rows with one scalar sentinel (i32). Returns `(new_data, new_offsets)`.
+/// (see `variants::fill_empty_scalar_i32`).
+#[pyfunction]
+pub fn fill_empty_scalar_i32<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<i32>,
+    offsets: PyReadonlyArray1<i64>,
+    fill: i32,
+) -> (Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::fill_empty_scalar_i32(
+        data.as_array(),
+        offsets.as_array(),
+        fill,
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Fill empty rows with one scalar sentinel (f32). Returns `(new_data, new_offsets)`.
+/// (see `variants::fill_empty_scalar_f32`).
+#[pyfunction]
+pub fn fill_empty_scalar_f32<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<f32>,
+    offsets: PyReadonlyArray1<i64>,
+    fill: f32,
+) -> (Bound<'py, PyArray1<f32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::fill_empty_scalar_f32(
+        data.as_array(),
+        offsets.as_array(),
+        fill,
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Fill empty rows with `inner` copies of sentinel (i32, fixed-stride).
+/// Returns `(new_data, new_offsets)`. (see `variants::fill_empty_fixed_i32`).
+#[pyfunction]
+pub fn fill_empty_fixed_i32<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<i32>,
+    offsets: PyReadonlyArray1<i64>,
+    inner: i64,
+    fill: i32,
+) -> (Bound<'py, PyArray1<i32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::fill_empty_fixed_i32(
+        data.as_array(),
+        offsets.as_array(),
+        inner,
+        fill,
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Fill empty rows with `inner` copies of sentinel (f32, fixed-stride).
+/// Returns `(new_data, new_offsets)`. (see `variants::fill_empty_fixed_f32`).
+#[pyfunction]
+pub fn fill_empty_fixed_f32<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<f32>,
+    offsets: PyReadonlyArray1<i64>,
+    inner: i64,
+    fill: f32,
+) -> (Bound<'py, PyArray1<f32>>, Bound<'py, PyArray1<i64>>) {
+    let (v, off) = variants::fill_empty_fixed_f32(
+        data.as_array(),
+        offsets.as_array(),
+        inner,
+        fill,
+    );
+    (v.into_pyarray(py), off.into_pyarray(py))
+}
+
+/// Two-level dummy-fill for allele bytestrings (uint8).
+/// Returns `(new_data, new_var_offsets, new_seq_offsets)`.
+/// (see `variants::fill_empty_seq_u8`).
+#[pyfunction]
+pub fn fill_empty_seq_u8<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<u8>,
+    var_offsets: PyReadonlyArray1<i64>,
+    seq_offsets: PyReadonlyArray1<i64>,
+    dummy: PyReadonlyArray1<u8>,
+) -> (
+    Bound<'py, PyArray1<u8>>,
+    Bound<'py, PyArray1<i64>>,
+    Bound<'py, PyArray1<i64>>,
+) {
+    let (nd, nvar, nseq) = variants::fill_empty_seq_u8(
+        data.as_array(),
+        var_offsets.as_array(),
+        seq_offsets.as_array(),
+        dummy.as_array(),
+    );
+    (nd.into_pyarray(py), nvar.into_pyarray(py), nseq.into_pyarray(py))
+}
+
+/// Two-level dummy-fill for token windows (int32).
+/// Returns `(new_data, new_var_offsets, new_seq_offsets)`.
+/// (see `variants::fill_empty_seq_i32`).
+#[pyfunction]
+pub fn fill_empty_seq_i32<'py>(
+    py: Python<'py>,
+    data: PyReadonlyArray1<i32>,
+    var_offsets: PyReadonlyArray1<i64>,
+    seq_offsets: PyReadonlyArray1<i64>,
+    dummy: PyReadonlyArray1<i32>,
+) -> (
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i64>>,
+    Bound<'py, PyArray1<i64>>,
+) {
+    let (nd, nvar, nseq) = variants::fill_empty_seq_i32(
+        data.as_array(),
+        var_offsets.as_array(),
+        seq_offsets.as_array(),
+        dummy.as_array(),
+    );
+    (nd.into_pyarray(py), nvar.into_pyarray(py), nseq.into_pyarray(py))
+}
+
+/// Build the `{name: (data, seq_offsets)}` dict from assembled buffers.
+fn bufs_to_pydict<'py, Tok: numpy::Element + Copy>(
+    py: Python<'py>,
+    bufs: VariantBufs<Tok>,
+) -> Bound<'py, PyDict> {
+    let d = PyDict::new(py);
+    for (name, data, off) in bufs.byte_bufs {
+        d.set_item(name, (data.into_pyarray(py), off.into_pyarray(py)))
+            .unwrap();
+    }
+    for (name, data, off) in bufs.tok_bufs {
+        d.set_item(name, (data.into_pyarray(py), off.into_pyarray(py)))
+            .unwrap();
+    }
+    d
+}
+
+/// Monomorphized assembly entry. `Tok` is the token dtype; `mode` selects
+/// variants (0) vs windows (1). See module docs in `variants::windows`.
+#[allow(clippy::too_many_arguments)]
+fn assemble_variant_buffers_impl<'py, Tok: numpy::Element + Copy>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<Tok>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    let rg = ref_global.as_ref().map(|a| a.as_array());
+    let ro = ref_off_global.as_ref().map(|a| a.as_array());
+    let lut_v = lut.as_ref().map(|a| a.as_array());
+    let bufs = if mode == 0 {
+        assemble_variants_mode::<Tok>(
+            v_idxs.as_array(),
+            row_offsets.as_array(),
+            alt_global.as_array(),
+            alt_off_global.as_array(),
+            if want_ref_bytes { rg } else { None },
+            if want_ref_bytes { ro } else { None },
+            want_flank,
+            flank_len,
+            lut_v,
+            v_contigs.as_array(),
+            v_starts.as_array(),
+            ilens.as_array(),
+            reference.as_array(),
+            ref_offsets.as_array(),
+            pad_char,
+        )
+    } else {
+        assemble_windows_mode::<Tok>(
+            v_idxs.as_array(),
+            row_offsets.as_array(),
+            ref_mode,
+            alt_mode,
+            alt_global.as_array(),
+            alt_off_global.as_array(),
+            rg,
+            ro,
+            flank_len,
+            lut_v.expect("windows mode requires a token LUT"),
+            v_contigs.as_array(),
+            v_starts.as_array(),
+            ilens.as_array(),
+            reference.as_array(),
+            ref_offsets.as_array(),
+            pad_char,
+        )
+    };
+    bufs_to_pydict(py, bufs)
+}
+
+/// u8-token assembly (token_dtype == uint8). See `assemble_variant_buffers_impl`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variant_buffers_u8<'py>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<u8>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    assemble_variant_buffers_impl::<u8>(
+        py, mode, v_idxs, row_offsets, alt_global, alt_off_global, ref_global,
+        ref_off_global, want_ref_bytes, want_flank, ref_mode, alt_mode, flank_len,
+        lut, v_contigs, v_starts, ilens, reference, ref_offsets, pad_char,
+    )
+}
+
+/// i32-token assembly (token_dtype == int32). See `assemble_variant_buffers_impl`.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variant_buffers_i32<'py>(
+    py: Python<'py>,
+    mode: i64,
+    v_idxs: PyReadonlyArray1<i32>,
+    row_offsets: PyReadonlyArray1<i64>,
+    alt_global: PyReadonlyArray1<u8>,
+    alt_off_global: PyReadonlyArray1<i64>,
+    ref_global: Option<PyReadonlyArray1<u8>>,
+    ref_off_global: Option<PyReadonlyArray1<i64>>,
+    want_ref_bytes: bool,
+    want_flank: bool,
+    ref_mode: i64,
+    alt_mode: i64,
+    flank_len: i64,
+    lut: Option<PyReadonlyArray1<i32>>,
+    v_contigs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+) -> Bound<'py, PyDict> {
+    assemble_variant_buffers_impl::<i32>(
+        py, mode, v_idxs, row_offsets, alt_global, alt_off_global, ref_global,
+        ref_off_global, want_ref_bytes, want_flank, ref_mode, alt_mode, flank_len,
+        lut, v_contigs, v_starts, ilens, reference, ref_offsets, pad_char,
+    )
+}
+
+/// Reconstruct haplotypes for a batch of (query, hap) pairs in place (writes `out`).
+///
+/// `geno_offsets` is the normalized (2, n) int64 starts/stops array.
+/// `keep_offsets` is the 1-D (batch*ploidy + 1) offsets array for the keep mask, or None.
+/// `parallel` enables rayon batch parallelism (caller computes `should_parallelize`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotypes_from_sparse(
+    mut out: PyReadwriteArray1<u8>,
+    out_offsets: PyReadonlyArray1<i64>,
+    regions: PyReadonlyArray2<i32>,
+    shifts: PyReadonlyArray2<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    mut annot_v_idxs: Option<PyReadwriteArray1<i32>>,
+    mut annot_ref_pos: Option<PyReadwriteArray1<i32>>,
+    parallel: bool,
+) {
+    use crate::reconstruct;
+    let go = geno_offsets.as_array();
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out.as_array_mut(),
+        out_offsets.as_array(),
+        regions.as_array(),
+        shifts.as_array(),
+        geno_offset_idx.as_array(),
+        go.row(0),
+        go.row(1),
+        geno_v_idxs.as_array(),
+        v_starts.as_array(),
+        ilens.as_array(),
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        annot_v_idxs.as_mut().map(|a| a.as_array_mut()),
+        annot_ref_pos.as_mut().map(|a| a.as_array_mut()),
+        parallel,
+    );
+}
+
+/// Fused haplotypes __getitem__ kernel (Task 13).
+///
+/// Collapses two FFI crossings into one:
+///   1. Compute per-haplotype length diffs (``get_diffs_sparse`` logic).
+///   2. Allocate the output buffer and offset array in Rust from the computed diffs.
+///   3. Run ``reconstruct_haplotypes_from_sparse`` logic.
+///   4. Return ``(out_data: Array1<u8>, out_offsets: Array1<i64>)`` — ready for
+///      wrapping into ``_Flat.from_offsets(...).view("S1")`` with no further coercions.
+///
+/// ``output_length``:
+///   - ``-1`` → ragged mode (each haplotype gets its natural length = ref_len + diff).
+///   - ``>= 0`` → fixed-length mode (every haplotype is padded/truncated to this length).
+///
+/// ``geno_offsets`` is the normalized ``(2, n)`` int64 starts/stops array (same
+/// layout as the existing ``reconstruct_haplotypes_from_sparse`` FFI entry).
+///
+/// Annotation buffers are not supported in the fused entry (annotated path
+/// remains on the unfused dispatch wrappers — see Task 13 report for rationale).
+/// `parallel` enables rayon batch parallelism (caller computes `should_parallelize`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotypes_fused<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,
+    shifts: PyReadonlyArray2<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    output_length: i64,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+    parallel: bool,
+) -> (Bound<'py, PyArray1<u8>>, Bound<'py, PyArray1<i64>>) {
+    use crate::genotypes;
+    use crate::reconstruct;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    let regions_a = regions.as_array();
+    let shifts_a = shifts.as_array();
+    let geno_offset_idx_a = geno_offset_idx.as_array();
+    let geno_v_idxs_a = geno_v_idxs.as_array();
+    let v_starts_a = v_starts.as_array();
+    let ilens_a = ilens.as_array();
+
+    let (batch_size, ploidy) = geno_offset_idx_a.dim();
+    let n_work = batch_size * ploidy;
+
+    // Step 1: compute per-haplotype length diffs (reuses get_diffs_sparse core).
+    // Mirrors _haps.py _haplotype_ilens exactly: pass q_starts/q_ends/v_starts so
+    // partial deletions that span a query boundary are correctly clipped.
+    // q_starts = regions[:, 1], q_ends = regions[:, 2] (both already in regions_a).
+    // v_starts is the same array passed in — it is the per-variant genomic start.
+    let q_starts_owned: ndarray::Array1<i32> = regions_a.column(1).to_owned();
+    let q_ends_owned: ndarray::Array1<i32> = regions_a.column(2).to_owned();
+    let diffs = genotypes::get_diffs_sparse(
+        geno_offset_idx_a,
+        geno_v_idxs_a,
+        go_starts,
+        go_stops,
+        ilens_a,
+        keep.as_ref().map(|a| a.as_array()),
+        keep_offsets.as_ref().map(|a| a.as_array()),
+        Some(q_starts_owned.view()), // q_starts = regions[:, 1]
+        Some(q_ends_owned.view()),   // q_ends   = regions[:, 2]
+        Some(v_starts_a),            // v_starts = per-variant genomic starts
+        parallel,
+    );
+
+    // Step 2: compute per-haplotype output lengths and prefix-sum offsets.
+    // Mirrors the Python side: out_lengths = hap_lengths (or fixed output_length).
+    // hap_lengths = regions[:, 2] - regions[:, 1] + diffs  (end - start + diff)
+    // out_offsets shape: (n_work + 1,)
+    let mut out_offsets_vec: Array1<i64> = Array1::zeros(n_work + 1);
+    {
+        let mut acc: i64 = 0;
+        out_offsets_vec[0] = 0;
+        for k in 0..n_work {
+            let query = k / ploidy;
+            let hap = k % ploidy;
+            let len: i64 = if output_length >= 0 {
+                output_length
+            } else {
+                let ref_len = (regions_a[[query, 2]] - regions_a[[query, 1]]) as i64;
+                let diff = diffs[[query, hap]] as i64;
+                (ref_len + diff).max(0)
+            };
+            acc += len;
+            out_offsets_vec[k + 1] = acc;
+        }
+    }
+
+    // Step 3: allocate the output buffer in Rust — Python never calls np.empty.
+    let total = out_offsets_vec[n_work] as usize;
+    let mut out_data: Array1<u8> = uninit_output(total);
+
+    // Step 4: reconstruct all haplotypes into the owned buffer (reuses batch core).
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out_data.view_mut(),
+        out_offsets_vec.view(),
+        regions_a,
+        shifts_a,
+        geno_offset_idx_a,
+        go_starts,
+        go_stops,
+        geno_v_idxs_a,
+        v_starts_a,
+        ilens_a,
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        None, // annot_v_idxs — not supported in fused plain path
+        None, // annot_ref_pos — not supported in fused plain path
+        parallel,
+    );
+
+    // Step 4b: optional in-kernel reverse-complement (one bool per (query, hap) work item).
+    if let Some(to_rc) = to_rc.as_ref() {
+        debug_assert_eq!(
+            to_rc.as_array().len(),
+            out_offsets_vec.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(
+            out_data.as_slice_mut().unwrap(),
+            out_offsets_vec.view(),
+            to_rc.as_array(),
+        );
+    }
+
+    // Step 5: return owned arrays — Python wraps them with no further coercions.
+    (out_data.into_pyarray(py), out_offsets_vec.into_pyarray(py))
+}
+
+/// Fused spliced-haplotype reconstruction: reconstruct in one FFI crossing using
+/// precomputed output offsets.
+///
+/// Unlike ``reconstruct_haplotypes_fused``, the Python splice path already computes
+/// the permutation and output offsets (``splice_plan.permuted_out_offsets``), so
+/// this kernel takes ``out_offsets`` as a direct parameter and skips Steps 1-2
+/// (no ``get_diffs_sparse``, no offset loop). This makes it simpler than the
+/// plain fused entry.
+///
+/// ``permuted_regions`` is shape ``(n_perm, 3)`` where each row is
+/// ``[contig_idx, start, end]`` after splice permutation.
+/// ``out_offsets`` is ``permuted_out_offsets`` from the Python splice plan
+/// (length ``n_perm + 1``).
+/// ``geno_offsets`` is the normalized ``(2, n)`` int64 starts/stops array.
+///
+/// Returns ``out_data`` (u8 flat buffer). The caller already holds ``out_offsets``
+/// so it is NOT returned — Python wraps with ``_Flat.from_offsets``.
+/// `parallel` enables rayon batch parallelism (caller computes `should_parallelize`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotypes_spliced_fused<'py>(
+    py: Python<'py>,
+    permuted_regions: PyReadonlyArray2<i32>,
+    flat_shifts: PyReadonlyArray2<i32>,
+    flat_geno_offset_idx: PyReadonlyArray2<i64>,
+    out_offsets: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+    parallel: bool,
+) -> Bound<'py, PyArray1<u8>> {
+    use crate::reconstruct;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    // out_offsets are precomputed by the Python splice plan — use them directly.
+    let out_offsets_a = out_offsets.as_array();
+    let total = out_offsets_a[out_offsets_a.len() - 1] as usize;
+
+    // Allocate output buffer.
+    let mut out_data: Array1<u8> = uninit_output(total);
+
+    // Reconstruct all haplotypes into the owned buffer (reuses batch core).
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out_data.view_mut(),
+        out_offsets_a,
+        permuted_regions.as_array(),
+        flat_shifts.as_array(),
+        flat_geno_offset_idx.as_array(),
+        go_starts,
+        go_stops,
+        geno_v_idxs.as_array(),
+        v_starts.as_array(),
+        ilens.as_array(),
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        None, // annot_v_idxs — not used in splice path
+        None, // annot_ref_pos — not used in splice path
+        parallel,
+    );
+
+    // Optional in-place RC per permuted element (negative-strand haplotypes).
+    // out_offsets_a is the permuted per-element offsets array (splice_plan.permuted_out_offsets),
+    // so each masked element is RC'd in its own byte range — matching the to_rc_per_elem post-pass.
+    if let Some(to_rc) = to_rc.as_ref() {
+        debug_assert_eq!(
+            to_rc.as_array().len(),
+            out_offsets_a.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(
+            out_data.as_slice_mut().unwrap(),
+            out_offsets_a,
+            to_rc.as_array(),
+        );
+    }
+
+    // Return out_data only — Python already holds out_offsets (no round-trip).
+    out_data.into_pyarray(py)
+}
+
+/// Fused annotated spliced-haplotype reconstruction: the annotated counterpart of
+/// `reconstruct_haplotypes_spliced_fused`. Reconstructs in one FFI crossing using
+/// precomputed splice output offsets AND fills the two per-nucleotide annotation
+/// arrays (variant index, reference coordinate).
+///
+/// Like the non-annotated splice entry, the Python splice plan already computes the
+/// permutation and `out_offsets` (`splice_plan.permuted_out_offsets`), so this kernel
+/// takes `out_offsets` directly and skips `get_diffs_sparse` / the offset loop.
+///
+/// On `to_rc`, each masked permuted element is reverse-complemented in place
+/// (`rc_flat_rows_inplace` on the sequence bytes) and its annotation rows are reversed
+/// in place (`reverse_flat_rows_inplace`, no complement) — byte-identical to
+/// `_FlatAnnotatedHaps.reverse_masked(mask, _COMP)`.
+///
+/// Returns `(out_data, annot_v, annot_pos)`. `out_offsets` is held by the caller and
+/// not returned (matches `reconstruct_haplotypes_spliced_fused`).
+/// `parallel` enables rayon batch parallelism (caller computes `should_parallelize`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_annotated_haplotypes_spliced_fused<'py>(
+    py: Python<'py>,
+    permuted_regions: PyReadonlyArray2<i32>,
+    flat_shifts: PyReadonlyArray2<i32>,
+    flat_geno_offset_idx: PyReadonlyArray2<i64>,
+    out_offsets: PyReadonlyArray1<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+    parallel: bool,
+) -> (
+    Bound<'py, PyArray1<u8>>,
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i32>>,
+) {
+    use crate::reconstruct;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    // out_offsets are precomputed by the Python splice plan — use them directly.
+    let out_offsets_a = out_offsets.as_array();
+    let total = out_offsets_a[out_offsets_a.len() - 1] as usize;
+
+    // Allocate the sequence + annotation buffers.
+    let mut out_data: Array1<u8> = uninit_output(total);
+    let mut annot_v: Array1<i32> = uninit_output(total);
+    let mut annot_pos: Array1<i32> = uninit_output(total);
+
+    // Reconstruct all haplotypes + annotations into the owned buffers (reuses batch core).
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out_data.view_mut(),
+        out_offsets_a,
+        permuted_regions.as_array(),
+        flat_shifts.as_array(),
+        flat_geno_offset_idx.as_array(),
+        go_starts,
+        go_stops,
+        geno_v_idxs.as_array(),
+        v_starts.as_array(),
+        ilens.as_array(),
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        Some(annot_v.view_mut()),   // annot_v_idxs — variant index per nucleotide
+        Some(annot_pos.view_mut()), // annot_ref_pos — reference coordinate per nucleotide
+        parallel,
+    );
+
+    // Optional in-place RC per permuted element. Sequence bytes are reverse-complemented;
+    // annotation rows are reversed only (no complement) — matching
+    // _FlatAnnotatedHaps.reverse_masked. out_offsets_a is the permuted per-element
+    // offsets array, so each masked element is transformed in its own byte range.
+    if let Some(to_rc) = to_rc.as_ref() {
+        let m = to_rc.as_array();
+        debug_assert_eq!(
+            m.len(),
+            out_offsets_a.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(out_data.as_slice_mut().unwrap(), out_offsets_a, m);
+        crate::reverse::reverse_flat_rows_inplace(annot_v.as_slice_mut().unwrap(), out_offsets_a, m);
+        crate::reverse::reverse_flat_rows_inplace(annot_pos.as_slice_mut().unwrap(), out_offsets_a, m);
+    }
+
+    (
+        out_data.into_pyarray(py),
+        annot_v.into_pyarray(py),
+        annot_pos.into_pyarray(py),
+    )
+}
+
+/// Fused annotated-haplotype reconstruction: diffs + offsets + reconstruct in one FFI crossing.
+///
+/// Identical to ``reconstruct_haplotypes_fused`` but ALSO fills per-nucleotide
+/// annotation arrays (variant indices and reference coordinates), returning them
+/// alongside the haplotype bytes and offsets.
+///
+/// Steps:
+///   1. Compute per-haplotype length diffs via ``get_diffs_sparse``.
+///   2. Compute output-length prefix-sum offsets.
+///   3. Allocate ``out_data`` (u8), ``annot_v`` (i32), ``annot_pos`` (i32).
+///   4. Run ``reconstruct_haplotypes_from_sparse`` with ``Some(annot_v)``, ``Some(annot_pos)``.
+///   5. Return ``(out_data, annot_v, annot_pos, out_offsets)`` — Python builds three
+///      ``Ragged`` arrays from the shared offsets with no further coercions.
+///
+/// ``output_length``:
+///   - ``-1`` → ragged mode (each haplotype gets its natural length = ref_len + diff).
+///   - ``>= 0`` → fixed-length mode (every haplotype is padded/truncated to this length).
+///
+/// ``geno_offsets`` is the normalized ``(2, n)`` int64 starts/stops array (same
+/// layout as the existing ``reconstruct_haplotypes_from_sparse`` FFI entry).
+///
+/// Annotation buffers are not supported in the plain ``reconstruct_haplotypes_fused``
+/// entry; this function is its annotated counterpart.
+/// `parallel` enables rayon batch parallelism (caller computes `should_parallelize`).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_annotated_haplotypes_fused<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,
+    shifts: PyReadonlyArray2<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    alt_alleles: PyReadonlyArray1<u8>,
+    alt_offsets: PyReadonlyArray1<i64>,
+    ref_: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    output_length: i64,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+    parallel: bool,
+) -> (
+    Bound<'py, PyArray1<u8>>,
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i64>>,
+) {
+    use crate::genotypes;
+    use crate::reconstruct;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    let regions_a = regions.as_array();
+    let shifts_a = shifts.as_array();
+    let geno_offset_idx_a = geno_offset_idx.as_array();
+    let geno_v_idxs_a = geno_v_idxs.as_array();
+    let v_starts_a = v_starts.as_array();
+    let ilens_a = ilens.as_array();
+
+    let (batch_size, ploidy) = geno_offset_idx_a.dim();
+    let n_work = batch_size * ploidy;
+
+    // Step 1: compute per-haplotype length diffs (reuses get_diffs_sparse core).
+    // Mirrors _haps.py _haplotype_ilens exactly: pass q_starts/q_ends/v_starts so
+    // partial deletions that span a query boundary are correctly clipped.
+    // q_starts = regions[:, 1], q_ends = regions[:, 2] (both already in regions_a).
+    // v_starts is the same array passed in — it is the per-variant genomic start.
+    let q_starts_owned: ndarray::Array1<i32> = regions_a.column(1).to_owned();
+    let q_ends_owned: ndarray::Array1<i32> = regions_a.column(2).to_owned();
+    let diffs = genotypes::get_diffs_sparse(
+        geno_offset_idx_a,
+        geno_v_idxs_a,
+        go_starts,
+        go_stops,
+        ilens_a,
+        keep.as_ref().map(|a| a.as_array()),
+        keep_offsets.as_ref().map(|a| a.as_array()),
+        Some(q_starts_owned.view()), // q_starts = regions[:, 1]
+        Some(q_ends_owned.view()),   // q_ends   = regions[:, 2]
+        Some(v_starts_a),            // v_starts = per-variant genomic starts
+        parallel,
+    );
+
+    // Step 2: compute per-haplotype output lengths and prefix-sum offsets.
+    // Mirrors the Python side: out_lengths = hap_lengths (or fixed output_length).
+    // hap_lengths = regions[:, 2] - regions[:, 1] + diffs  (end - start + diff)
+    // out_offsets shape: (n_work + 1,)
+    let mut out_offsets_vec: Array1<i64> = Array1::zeros(n_work + 1);
+    {
+        let mut acc: i64 = 0;
+        out_offsets_vec[0] = 0;
+        for k in 0..n_work {
+            let query = k / ploidy;
+            let hap = k % ploidy;
+            let len: i64 = if output_length >= 0 {
+                output_length
+            } else {
+                let ref_len = (regions_a[[query, 2]] - regions_a[[query, 1]]) as i64;
+                let diff = diffs[[query, hap]] as i64;
+                (ref_len + diff).max(0)
+            };
+            acc += len;
+            out_offsets_vec[k + 1] = acc;
+        }
+    }
+
+    // Step 3: allocate the output buffer and annotation buffers in Rust.
+    let total = out_offsets_vec[n_work] as usize;
+    let mut out_data: Array1<u8> = uninit_output(total);
+    let mut annot_v: Array1<i32> = uninit_output(total);
+    let mut annot_pos: Array1<i32> = uninit_output(total);
+
+    // Step 4: reconstruct all haplotypes into the owned buffers (reuses batch core).
+    reconstruct::reconstruct_haplotypes_from_sparse(
+        out_data.view_mut(),
+        out_offsets_vec.view(),
+        regions_a,
+        shifts_a,
+        geno_offset_idx_a,
+        go_starts,
+        go_stops,
+        geno_v_idxs_a,
+        v_starts_a,
+        ilens_a,
+        alt_alleles.as_array(),
+        alt_offsets.as_array(),
+        ref_.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        Some(annot_v.view_mut()),   // annot_v_idxs — variant index per nucleotide
+        Some(annot_pos.view_mut()), // annot_ref_pos — reference coordinate per nucleotide
+        parallel,
+    );
+
+    if let Some(to_rc) = to_rc.as_ref() {
+        let m = to_rc.as_array();
+        debug_assert_eq!(
+            m.len(),
+            out_offsets_vec.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(out_data.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+        crate::reverse::reverse_flat_rows_inplace(annot_v.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+        crate::reverse::reverse_flat_rows_inplace(annot_pos.as_slice_mut().unwrap(), out_offsets_vec.view(), m);
+    }
+    // Step 5: return owned arrays — Python wraps them with no further coercions.
+    (
+        out_data.into_pyarray(py),
+        annot_v.into_pyarray(py),
+        annot_pos.into_pyarray(py),
+        out_offsets_vec.into_pyarray(py),
+    )
+}
+
+/// Fetch padded reference rows for each region into one flat buffer.
+/// `regions[i] = (contig_idx, start, end)`. Mirrors numba `_get_reference_par/_ser`.
+#[pyfunction]
+pub fn get_reference<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,
+    out_offsets: PyReadonlyArray1<i64>,
+    reference: PyReadonlyArray1<u8>,
+    ref_offsets: PyReadonlyArray1<i64>,
+    pad_char: u8,
+    parallel: bool,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+) -> Bound<'py, PyArray1<u8>> {
+    let out = reference::get_reference(
+        regions.as_array(),
+        out_offsets.as_array(),
+        reference.as_array(),
+        ref_offsets.as_array(),
+        pad_char,
+        parallel,
+        to_rc.as_ref().map(|a| a.as_array()),
+    );
+    out.into_pyarray(py)
+}
+
+/// Shift and realign tracks for a batch of (query, hap) pairs in place (writes `out`).
+///
+/// `geno_offsets` is the normalized (2, n) int64 starts/stops array;
+/// internally split into `.row(0)` (starts) and `.row(1)` (stops).
+/// `keep_offsets` stays 1-D (batch*ploidy + 1) offsets array for the keep mask, or None.
+/// `params` is a 1-D f64 parameter array (one entry per track, indexed Python-side).
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn shift_and_realign_tracks_sparse(
+    mut out: PyReadwriteArray1<f32>,
+    out_offsets: PyReadonlyArray1<i64>,
+    regions: PyReadonlyArray2<i32>,
+    shifts: PyReadonlyArray2<i32>,
+    geno_offset_idx: PyReadonlyArray2<i64>,
+    geno_v_idxs: PyReadonlyArray1<i32>,
+    geno_offsets: PyReadonlyArray2<i64>,
+    v_starts: PyReadonlyArray1<i32>,
+    ilens: PyReadonlyArray1<i32>,
+    tracks: PyReadonlyArray1<f32>,
+    track_offsets: PyReadonlyArray1<i64>,
+    params: PyReadonlyArray1<f64>,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    strategy_id: i64,
+    base_seed: u64,
+    parallel: bool,
+) {
+    use crate::tracks;
+    let go = geno_offsets.as_array();
+    tracks::shift_and_realign_tracks_sparse(
+        out.as_array_mut(),
+        out_offsets.as_array(),
+        regions.as_array(),
+        shifts.as_array(),
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go.row(0),
+        go.row(1),
+        v_starts.as_array(),
+        ilens.as_array(),
+        tracks.as_array(),
+        track_offsets.as_array(),
+        params.as_array(),
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        strategy_id,
+        base_seed,
+        parallel,
     );
 }
+
+/// RLE-encode a ragged f32 track buffer into (starts, ends, values, offsets).
+///
+/// Mirrors numba `tracks_to_intervals` in `_intervals.py` lines 129-195.
+/// Returns a 4-tuple `(all_starts: i32, all_ends: i32, all_values: f32, interval_offsets: i64)`.
+#[pyfunction]
+pub fn tracks_to_intervals<'py>(
+    py: Python<'py>,
+    regions: PyReadonlyArray2<i32>,
+    tracks: PyReadonlyArray1<f32>,
+    track_offsets: PyReadonlyArray1<i64>,
+    parallel: bool,
+) -> (
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<i32>>,
+    Bound<'py, PyArray1<f32>>,
+    Bound<'py, PyArray1<i64>>,
+) {
+    use crate::tracks;
+    let (starts, ends, values, offsets) = tracks::tracks_to_intervals(
+        regions.as_array(),
+        tracks.as_array(),
+        track_offsets.as_array(),
+        parallel,
+    );
+    (
+        starts.into_pyarray(py),
+        ends.into_pyarray(py),
+        values.into_pyarray(py),
+        offsets.into_pyarray(py),
+    )
+}
+
+/// Fused per-track __getitem__ kernel (Task 14).
+///
+/// Collapses two FFI crossings into one per track:
+///   1. ``intervals_to_tracks`` core: fills a Rust-side scratch buffer from
+///      stored intervals (replacing the Python ``_tracks = np.empty(...)``
+///      intermediate, audit T2).
+///   2. ``shift_and_realign_tracks_sparse`` core: reads the scratch and writes
+///      the caller's pre-allocated ``out`` slice.
+///
+/// The outer Python loop over n_tracks remains (bounded by track count, small).
+/// Each loop iteration now makes ONE FFI crossing instead of two, and allocates
+/// ZERO Python-side intermediates.
+///
+/// ``out`` is the per-track slice of the caller's pre-allocated output buffer
+/// (shape ``(b*p*l,)`` f32).  ``out_offsets`` gives ragged lengths into that
+/// slice for each (query, hap) pair.
+///
+/// ``offset_idxs`` is the per-query index array into ``itv_offsets`` (shape
+/// ``(b,)``); ``itv_offsets`` is 1-D ``(n_samples*n_regions + 1)`` int64.
+#[pyfunction]
+#[allow(clippy::too_many_arguments)]
+pub fn intervals_and_realign_track_fused(
+    mut out: PyReadwriteArray1<f32>,          // (b*p*l) — caller's per-track slice
+    out_offsets: PyReadonlyArray1<i64>,       // (b*p + 1)
+    regions: PyReadonlyArray2<i32>,           // (b, 3)
+    shifts: PyReadonlyArray2<i32>,            // (b, p)
+    geno_offset_idx: PyReadonlyArray2<i64>,   // (b, p)
+    geno_v_idxs: PyReadonlyArray1<i32>,       // (r*s*p*v)
+    geno_offsets: PyReadonlyArray2<i64>,      // (2, r*s*p)
+    v_starts: PyReadonlyArray1<i32>,          // (tot_v)
+    ilens: PyReadonlyArray1<i32>,             // (tot_v)
+    // intervals (reference-coordinate, for this track)
+    offset_idxs: PyReadonlyArray1<i64>,       // (b) — per-query index into itv_offsets
+    itv_starts: PyReadonlyArray1<i32>,         // (n_intervals)
+    itv_ends: PyReadonlyArray1<i32>,           // (n_intervals)
+    itv_values: PyReadonlyArray1<f32>,         // (n_intervals)
+    itv_offsets: PyReadonlyArray1<i64>,        // (n_samples*n_regions + 1)
+    track_offsets: PyReadonlyArray1<i64>,      // (b+1) — out_offsets for scratch buffer
+    // insertion-fill strategy
+    params: PyReadonlyArray1<f64>,
+    strategy_id: i64,
+    base_seed: u64,
+    keep: Option<PyReadonlyArray1<bool>>,
+    keep_offsets: Option<PyReadonlyArray1<i64>>,
+    to_rc: Option<PyReadonlyArray1<bool>>,
+    parallel: bool,
+) -> PyResult<()> {
+    use crate::intervals;
+    use crate::tracks;
+
+    let go = geno_offsets.as_array();
+    let go_starts = go.row(0);
+    let go_stops = go.row(1);
+
+    let out_offsets_a = out_offsets.as_array();
+    let regions_a = regions.as_array();
+
+    // Determine scratch buffer size from track_offsets.
+    let track_offsets_a = track_offsets.as_array();
+    let scratch_len = track_offsets_a[track_offsets_a.len() - 1] as usize;
+
+    // Allocate Rust-side scratch buffer — replaces Python `_tracks = np.empty(...)`.
+    // intervals_to_tracks calls out.fill(0.0) as its first step, so full-write is
+    // guaranteed; uninit_output is safe here.
+    let mut scratch = uninit_output::<f32>(scratch_len);
+
+    // Extract query starts (regions[:, 1]) as a contiguous owned array.
+    // regions_a.column(1) is a non-contiguous view (row-major storage); we
+    // must own/contiguify it before passing to intervals_to_tracks which
+    // expects a contiguous ArrayView1<i32>.
+    let q_starts: ndarray::Array1<i32> = regions_a.column(1).to_owned();
+
+    // Step 1: paint reference-coordinate intervals into scratch (reuses intervals core).
+    intervals::intervals_to_tracks(
+        offset_idxs.as_array(),
+        q_starts.view(),
+        itv_starts.as_array(),
+        itv_ends.as_array(),
+        itv_values.as_array(),
+        itv_offsets.as_array(),
+        scratch.view_mut(),
+        track_offsets_a,
+        parallel,
+    );
+
+    // Step 2: shift and realign into caller's out slice (reuses tracks core).
+    tracks::shift_and_realign_tracks_sparse(
+        out.as_array_mut(),
+        out_offsets_a,
+        regions_a,
+        shifts.as_array(),
+        geno_offset_idx.as_array(),
+        geno_v_idxs.as_array(),
+        go_starts,
+        go_stops,
+        v_starts.as_array(),
+        ilens.as_array(),
+        scratch.view(),
+        track_offsets_a,
+        params.as_array(),
+        keep.as_ref().map(|k| k.as_array()),
+        keep_offsets.as_ref().map(|ko| ko.as_array()),
+        strategy_id,
+        base_seed,
+        parallel,
+    );
+
+    // Step 3: optional in-place reverse for negative-strand tracks (reverse only, no complement).
+    if let Some(to_rc) = to_rc.as_ref() {
+        debug_assert_eq!(
+            to_rc.as_array().len(),
+            out_offsets.as_array().len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::reverse_flat_rows_inplace(
+            out.as_slice_mut().unwrap(),
+            out_offsets.as_array(),
+            to_rc.as_array(),
+        );
+    }
+
+    Ok(())
+}
+
+// ── Task 3: guard test — drives rc_flat_rows_inplace on a synthetic hap buffer ─
+// ── Task 4: guard test — drives reverse_flat_rows_inplace::<f32> (reverse only) ─
+// ── Task 6: guard test — proves per-element masking over permuted offsets ────────
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn haplotype_buffer_rc_is_revcomp_of_forward() {
+        let mut out = b"ACGTA".to_vec(); // pretend reconstructed forward bytes
+        let offsets = ndarray::array![0i64, 5];
+        let to_rc = ndarray::array![true];
+        crate::reverse::rc_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+        assert_eq!(&out, b"TACGT"); // revcomp(ACGTA)
+    }
+
+    #[test]
+    fn track_buffer_rc_is_reverse_only() {
+        let mut out = vec![1.0f32, 2.0, 3.0];
+        let offsets = ndarray::array![0i64, 3];
+        let to_rc = ndarray::array![true];
+        crate::reverse::reverse_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+        assert_eq!(out, vec![3.0, 2.0, 1.0]); // no value transform
+    }
+
+    #[test]
+    fn spliced_rc_applies_per_element_over_permuted_offsets() {
+        // two permuted elements: "ACG" (rc) and "TTT" (not rc)
+        let mut out = b"ACGTTT".to_vec();
+        let offsets = ndarray::array![0i64, 3, 6];
+        let to_rc = ndarray::array![true, false];
+        crate::reverse::rc_flat_rows_inplace(&mut out, offsets.view(), to_rc.view());
+        assert_eq!(&out[0..3], b"CGT"); // revcomp(ACG)
+        assert_eq!(&out[3..6], b"TTT"); // untouched
+    }
+
+    #[test]
+    fn annotated_rc_complements_bytes_reverses_indices() {
+        let mut bytes = b"ACG".to_vec();          // revcomp -> "CGT"
+        let mut vidx = vec![5i32, 6, 7];          // reverse -> [7,6,5]
+        let mut rpos = vec![100i32, 101, 102];    // reverse -> [102,101,100]
+        let offsets = ndarray::array![0i64, 3];
+        let m = ndarray::array![true];
+        crate::reverse::rc_flat_rows_inplace(&mut bytes, offsets.view(), m.view());
+        crate::reverse::reverse_flat_rows_inplace(&mut vidx, offsets.view(), m.view());
+        crate::reverse::reverse_flat_rows_inplace(&mut rpos, offsets.view(), m.view());
+        assert_eq!(&bytes, b"CGT");
+        assert_eq!(vidx, vec![7, 6, 5]);
+        assert_eq!(rpos, vec![102, 101, 100]);
+    }
+}
+
+// ── DEBUG exports for PRNG parity tests (Task 7) ─────────────────────────────
+// These thin wrappers exist solely to make the Rust PRNG functions callable from
+// Python tests. Decision (final-review, Task 15): KEEP permanently as the direct
+// PRNG parity guard. The njit-internal xorshift64/hash4 leaves have no other
+// Python entry point, so these are the only way to assert byte-identity of the
+// PRNG core from test_prng_parity.py. Do NOT remove.
+
+/// In-place reverse-complement of the alleles of mask-selected `(b*p)` rows.
+/// See `crate::variants::rc_alleles_inplace`.
+#[pyfunction]
+pub fn rc_alleles(
+    mut byte_data: PyReadwriteArray1<u8>,
+    seq_offsets: PyReadonlyArray1<i64>,
+    var_offsets: PyReadonlyArray1<i64>,
+    to_rc_row: PyReadonlyArray1<bool>,
+) {
+    crate::variants::rc_alleles_inplace(
+        byte_data.as_slice_mut().unwrap(),
+        seq_offsets.as_array(),
+        var_offsets.as_array(),
+        to_rc_row.as_array(),
+    );
+}
+
+/// [DEBUG] Rust xorshift64 — callable from Python for parity testing.
+/// Mirrors numba `_xorshift64` on `np.uint64`.
+#[pyfunction]
+pub fn _debug_xorshift64(x: u64) -> u64 {
+    crate::tracks::xorshift64(x)
+}
+
+/// [DEBUG] Rust hash4 — callable from Python for parity testing.
+/// Mirrors numba `_hash4` on `np.uint64`.
+#[pyfunction]
+pub fn _debug_hash4(a: u64, b: u64, c: u64, d: u64) -> u64 {
+    crate::tracks::hash4(a, b, c, d)
+}
diff --git a/src/genotypes/mod.rs b/src/genotypes/mod.rs
new file mode 100644
index 00000000..e42167ff
--- /dev/null
+++ b/src/genotypes/mod.rs
@@ -0,0 +1,232 @@
+//! Genotype assembly/selection cores (pure ndarray). PyO3 lives in `crate::ffi`.
+use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
+use rayon::prelude::*;
+
+/// Per-(query, hap) reference-length diffs. Mirrors the numba
+/// `get_diffs_sparse` exactly. `o_starts`/`o_stops` are the two rows of the
+/// normalized (2, n) offset array: `o_s = o_starts[o_idx]`, `o_e = o_stops[o_idx]`.
+/// Length sums stay far within i32 for real variants; accumulate in i64 and
+/// truncate on store to mirror numpy's `int32`-slot assignment.
+///
+/// When `parallel=true` the outer query×hap loop is dispatched via rayon
+/// `par_chunks_mut` over the flat output buffer. Each chunk is exactly one
+/// `(query, hap)` cell, so the writes are provably disjoint.
+#[allow(clippy::too_many_arguments)]
+pub fn get_diffs_sparse(
+    geno_offset_idx: ArrayView2<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    ilens: ArrayView1<i32>,
+    keep: Option<ArrayView1<bool>>,
+    keep_offsets: Option<ArrayView1<i64>>,
+    q_starts: Option<ArrayView1<i32>>,
+    q_ends: Option<ArrayView1<i32>>,
+    v_starts: Option<ArrayView1<i32>>,
+    parallel: bool,
+) -> Array2<i32> {
+    let (n_queries, ploidy) = geno_offset_idx.dim();
+    let n_work = n_queries * ploidy;
+    let mut diffs = Array2::<i32>::zeros((n_queries, ploidy));
+
+    // Closure computing the diff for work item k=(query*ploidy+hap).
+    // All read-only ArrayViews are Send+Sync; the output cell is carved via
+    // par_chunks_mut so each chunk covers exactly one i32 — provably disjoint.
+    let has_query = q_starts.is_some() && q_ends.is_some() && v_starts.is_some();
+    let has_keep = keep.is_some() && keep_offsets.is_some();
+
+    let compute = |k: usize| -> i32 {
+        let query = k / ploidy;
+        let hap = k % ploidy;
+        let o_idx = geno_offset_idx[[query, hap]] as usize;
+        let o_s = o_starts[o_idx] as usize;
+        let o_e = o_stops[o_idx] as usize;
+        let n_variants = o_e - o_s;
+
+        if n_variants == 0 {
+            0
+        } else if has_query {
+            let qs = q_starts.unwrap();
+            let qe = q_ends.unwrap();
+            let vs = v_starts.unwrap();
+            let q_start = qs[query] as i64;
+            let q_end = qe[query] as i64;
+            let mut ref_idx = q_start;
+            let mut acc: i64 = 0;
+            for v in o_s..o_e {
+                if has_keep {
+                    let kp = keep.unwrap();
+                    let ko = keep_offsets.unwrap();
+                    let k_s = ko[query * ploidy + hap] as usize;
+                    if !kp[k_s + (v - o_s)] {
+                        continue;
+                    }
+                }
+                let v_idx = geno_v_idxs[v] as usize;
+                let v_start = vs[v_idx] as i64;
+                let mut v_ilen = ilens[v_idx] as i64;
+                let v_end = v_start - v_ilen.min(0) + 1;
+                if v_end <= q_start {
+                    continue;
+                }
+                if v_start >= q_end {
+                    break;
+                }
+                if v_start >= q_start && v_start < ref_idx {
+                    continue;
+                }
+                ref_idx = ref_idx.max(v_end);
+                if v_ilen < 0 {
+                    v_ilen += (q_start - v_start - 1).max(0);
+                }
+                v_ilen += (v_end - q_end).max(0);
+                acc += v_ilen;
+            }
+            acc as i32
+        } else if has_keep {
+            let kp = keep.unwrap();
+            let ko = keep_offsets.unwrap();
+            let k_s = ko[query * ploidy + hap] as usize;
+            let mut sum: i64 = 0;
+            for (j, v) in (o_s..o_e).enumerate() {
+                if kp[k_s + j] {
+                    sum += ilens[geno_v_idxs[v] as usize] as i64;
+                }
+            }
+            sum as i32
+        } else {
+            let mut sum: i64 = 0;
+            for v in o_s..o_e {
+                sum += ilens[geno_v_idxs[v] as usize] as i64;
+            }
+            sum as i32
+        }
+    };
+
+    if parallel {
+        // Each chunk is exactly one i32 cell (chunk_size=1), so writes are
+        // provably disjoint — safe for rayon. &mut [i32] is Send.
+        diffs
+            .as_slice_mut()
+            .unwrap()
+            .par_chunks_mut(1)
+            .enumerate()
+            .for_each(|(k, cell)| {
+                cell[0] = compute(k);
+            });
+    } else {
+        for k in 0..n_work {
+            let query = k / ploidy;
+            let hap = k % ploidy;
+            diffs[[query, hap]] = compute(k);
+        }
+    }
+    diffs
+}
+
+/// Keep-mask for variants fully contained in each query interval. Mirrors the
+/// numba `choose_exonic_variants` + inner `_choose_exonic_variants`. Returns
+/// `(keep, keep_offsets)` where keep_offsets is the per-group prefix sum of
+/// group sizes (len n_groups + 1).
+#[allow(clippy::too_many_arguments)]
+pub fn choose_exonic_variants(
+    starts: ArrayView1<i32>,
+    ends: ArrayView1<i32>,
+    geno_offset_idx: ArrayView2<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+) -> (Array1<bool>, Array1<i64>) {
+    let (n_regions, ploidy) = geno_offset_idx.dim();
+
+    // keep_offsets = prefix sum of per-group lengths (numba uses lengths.cumsum()).
+    let mut keep_offsets = Array1::<i64>::zeros(n_regions * ploidy + 1);
+    let mut acc: i64 = 0;
+    for query in 0..n_regions {
+        for hap in 0..ploidy {
+            let o_idx = geno_offset_idx[[query, hap]] as usize;
+            let len = (o_stops[o_idx] - o_starts[o_idx]).max(0);
+            acc += len;
+            keep_offsets[query * ploidy + hap + 1] = acc;
+        }
+    }
+
+    let n_variants = keep_offsets[n_regions * ploidy] as usize;
+    let mut keep = Array1::<bool>::default(n_variants);
+
+    for query in 0..n_regions {
+        let ref_start = starts[query] as i64;
+        let ref_end = ends[query] as i64;
+        for hap in 0..ploidy {
+            let o_idx = geno_offset_idx[[query, hap]] as usize;
+            let o_s = o_starts[o_idx] as usize;
+            let o_e = o_stops[o_idx] as usize;
+            let k_s = keep_offsets[query * ploidy + hap] as usize;
+            for (j, v) in (o_s..o_e).enumerate() {
+                let v_idx = geno_v_idxs[v] as usize;
+                let v_pos = v_starts[v_idx] as i64;
+                let v_ref_end = v_pos - (ilens[v_idx] as i64).min(0) + 1;
+                keep[k_s + j] = v_pos >= ref_start && v_ref_end <= ref_end;
+            }
+        }
+    }
+    (keep, keep_offsets)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::{arr1, arr2};
+
+    #[test]
+    fn test_plain_sum() {
+        // 1 query, ploidy 1, two variants with ilens [-2, 3] → sum 1.
+        let goi = arr2(&[[0i64]]);
+        let v_idxs = arr1(&[0i32, 1]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[2i64]);
+        let ilens = arr1(&[-2i32, 3]);
+        let d = get_diffs_sparse(
+            goi.view(), v_idxs.view(), o_starts.view(), o_stops.view(),
+            ilens.view(), None, None, None, None, None,
+            false, // serial — unit tests don't need rayon overhead
+        );
+        assert_eq!(d[[0, 0]], 1);
+    }
+
+    #[test]
+    fn test_empty_group_is_zero() {
+        let goi = arr2(&[[0i64]]);
+        let v_idxs: ndarray::Array1<i32> = ndarray::Array1::from(vec![]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[0i64]); // empty slice
+        let ilens: ndarray::Array1<i32> = ndarray::Array1::from(vec![]);
+        let d = get_diffs_sparse(
+            goi.view(), v_idxs.view(), o_starts.view(), o_stops.view(),
+            ilens.view(), None, None, None, None, None,
+            false, // serial — unit tests don't need rayon overhead
+        );
+        assert_eq!(d[[0, 0]], 0);
+    }
+
+    #[test]
+    fn test_exonic_contained_only() {
+        // region [10, 20). variants at pos 12 (ilen 0 -> end 13, kept) and
+        // pos 19 (ilen 0 -> end 20, kept), pos 19 with ilen -2 -> end 22 (dropped).
+        let goi = arr2(&[[0i64]]);
+        let v_idxs = arr1(&[0i32, 1, 2]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[3i64]);
+        let v_starts = arr1(&[12i32, 19, 19]);
+        let ilens = arr1(&[0i32, 0, -2]);
+        let (keep, koff) = choose_exonic_variants(
+            arr1(&[10i32]).view(), arr1(&[20i32]).view(), goi.view(),
+            v_idxs.view(), o_starts.view(), o_stops.view(),
+            v_starts.view(), ilens.view(),
+        );
+        assert_eq!(keep.to_vec(), vec![true, true, false]);
+        assert_eq!(koff.to_vec(), vec![0, 3]);
+    }
+}
diff --git a/src/intervals.rs b/src/intervals.rs
index e78a2014..c31ad8c0 100644
--- a/src/intervals.rs
+++ b/src/intervals.rs
@@ -1,4 +1,5 @@
 use ndarray::{ArrayView1, ArrayViewMut1};
+use rayon::prelude::*;
 
 /// Paint base-pair-resolution tracks from pre-sorted intervals.
 ///
@@ -11,8 +12,10 @@ use ndarray::{ArrayView1, ArrayViewMut1};
 /// - Breaks out of the interval loop when `start >= length` (intervals are
 ///   sorted by start, so all subsequent intervals are also out of range).
 /// - Values are copied (f32 → f32), never reduced.
-/// - Sequential over queries — per-query out slices are disjoint, so the
-///   result equals numba's prange result without any need for rayon here.
+///
+/// When `parallel=true` the outer query loop is dispatched via rayon using the
+/// split_at_mut cursor idiom (same as C1/C2) so per-query out slices are
+/// provably disjoint — no raw `*mut` in the closure.
 pub fn intervals_to_tracks(
     offset_idxs: ArrayView1<i64>,
     starts: ArrayView1<i32>,
@@ -22,26 +25,42 @@ pub fn intervals_to_tracks(
     itv_offsets: ArrayView1<i64>,
     mut out: ArrayViewMut1<f32>,
     out_offsets: ArrayView1<i64>,
+    parallel: bool,
 ) {
+    // Hoist all inputs to raw slices before any loop — eliminates ndarray's
+    // per-element stride multiplication and bounds-check branches that would
+    // otherwise appear in every inner-loop iteration.
+    let offset_idxs = offset_idxs.as_slice().unwrap();
+    let starts = starts.as_slice().unwrap();
+    let itv_starts = itv_starts.as_slice().unwrap();
+    let itv_ends = itv_ends.as_slice().unwrap();
+    let itv_values = itv_values.as_slice().unwrap();
+    let itv_offsets = itv_offsets.as_slice().unwrap();
+    let out_offsets = out_offsets.as_slice().unwrap();
+
     // Step 1: zero the whole output buffer, exactly like `out[:] = 0.0`.
-    out.fill(0.0);
+    // The out buffer is freshly allocated and contiguous; address it as a raw
+    // &mut [f32] so per-interval writes avoid ndarray SliceInfo construction.
+    let out_slice = out.as_slice_mut().unwrap();
+    out_slice.fill(0.0);
 
     let n_queries = starts.len();
 
-    for query in 0..n_queries {
+    // Inner per-query paint logic. Takes a mutable slice for this query's
+    // output region (already offset-addressed) plus the query index.
+    // All read-only slices are captured by shared reference — they are
+    // Send+Sync so this closure is safe to use in rayon.
+    let paint_query = |query: usize, out_chunk: &mut [f32]| {
         let idx = offset_idxs[query] as usize;
         let itv_s = itv_offsets[idx] as usize;
         let itv_e = itv_offsets[idx + 1] as usize;
 
         if itv_s == itv_e {
-            // No intervals for this query — out slice stays 0.
-            continue;
+            // No intervals for this query — out slice stays 0 (already zeroed).
+            return;
         }
 
-        let out_s = out_offsets[query] as usize;
-        let out_e = out_offsets[query + 1] as usize;
-        // length as i64 to do signed arithmetic below.
-        let length = (out_e - out_s) as i64;
+        let length = out_chunk.len() as i64;
         let query_start = starts[query] as i64;
 
         for interval in itv_s..itv_e {
@@ -57,15 +76,52 @@ pub fn intervals_to_tracks(
             }
             // Clip to the query window. Intervals may start before query_start
             // (jitter-expanded interval storage vs. the per-read query origin;
-            // see issue #242) or end past it. No negative-index wrap.
+            // see issue #242) or end past it. Keep s/e as i64 until after the
+            // guard so that negative values don't wrap when cast to usize.
             let s = start.max(0);
             let e = end.min(length);
             if e > s {
-                let a = out_s + s as usize;
-                let b = out_s + e as usize;
-                out.slice_mut(ndarray::s![a..b]).fill(value);
+                out_chunk[s as usize..e as usize].fill(value);
+            }
+        }
+    };
+
+    if parallel {
+        // Build disjoint per-query mutable slices using the split_at_mut
+        // cursor idiom (mirrors C1 reconstruct_haplotypes_from_sparse).
+        let bounds: Vec<(usize, usize)> = (0..n_queries)
+            .map(|q| (out_offsets[q] as usize, out_offsets[q + 1] as usize))
+            .collect();
+
+        let mut out_chunks: Vec<&mut [f32]> = Vec::with_capacity(n_queries);
+        {
+            let mut rest = &mut out_slice[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &bounds {
+                debug_assert!(
+                    s >= cursor && e >= s,
+                    "out_offsets must be monotonically non-decreasing (got s={s}, e={e}, cursor={cursor})"
+                );
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                out_chunks.push(mid);
+                rest = tail2;
+                cursor = e;
             }
         }
+
+        out_chunks
+            .into_par_iter()
+            .enumerate()
+            .for_each(|(query, out_chunk)| {
+                paint_query(query, out_chunk);
+            });
+    } else {
+        for query in 0..n_queries {
+            let out_s = out_offsets[query] as usize;
+            let out_e = out_offsets[query + 1] as usize;
+            paint_query(query, &mut out_slice[out_s..out_e]);
+        }
     }
 }
 
@@ -95,6 +151,7 @@ mod tests {
             Array1::from_vec(itv_offsets.to_vec()).view(),
             out.view_mut(),
             Array1::from_vec(out_offsets.to_vec()).view(),
+            false, // serial path — unit tests don't need rayon overhead
         );
         out.to_vec()
     }
diff --git a/src/lib.rs b/src/lib.rs
index d963d8c6..096545ef 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,14 @@
 pub mod bigwig;
 pub mod ffi;
+pub mod genotypes;
 pub mod intervals;
 pub mod ragged;
+pub mod reconstruct;
+pub mod reference;
+pub mod reverse;
 pub mod tables;
+pub mod tracks;
+pub mod variants;
 use numpy::{prelude::*, PyArray1, PyArray2, PyReadonlyArray1};
 use pyo3::prelude::*;
 use std::path::PathBuf;
@@ -15,10 +21,38 @@ fn genvarloader(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<tables::RustTable>()?;
     m.add_function(wrap_pyfunction!(ragged::ragged_to_padded, m)?)?;
     m.add_function(wrap_pyfunction!(ffi::intervals_to_tracks, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::get_diffs_sparse, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::choose_exonic_variants, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::gather_rows_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::gather_rows_f32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::gather_alleles, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::compact_keep_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::compact_keep_f32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_scalar_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_scalar_f32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_fixed_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_fixed_f32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_seq_u8, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::fill_empty_seq_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::assemble_variant_buffers_u8, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::assemble_variant_buffers_i32, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::rc_alleles, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::get_reference, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_haplotypes_from_sparse, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_haplotypes_fused, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_annotated_haplotypes_fused, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_haplotypes_spliced_fused, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::reconstruct_annotated_haplotypes_spliced_fused, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::shift_and_realign_tracks_sparse, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::tracks_to_intervals, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::intervals_and_realign_track_fused, m)?)?;
+    // DEBUG: PRNG parity exports (Task 7) — keep or remove after Task 8/9 review
+    m.add_function(wrap_pyfunction!(ffi::_debug_xorshift64, m)?)?;
+    m.add_function(wrap_pyfunction!(ffi::_debug_hash4, m)?)?;
     Ok(())
 }
 
-/// Write intervals.npy + offsets.npy for a bigWig track directly to `out_dir`.
+/// Write SoA starts/ends/values.npy + offsets.npy for a bigWig track directly to `out_dir`.
 #[pyfunction]
 #[allow(clippy::too_many_arguments)]
 fn bigwig_write_track(
diff --git a/src/reconstruct/mod.rs b/src/reconstruct/mod.rs
new file mode 100644
index 00000000..4b77ea77
--- /dev/null
+++ b/src/reconstruct/mod.rs
@@ -0,0 +1,1208 @@
+//! Single-haplotype reconstruction core (pure ndarray). PyO3 lives in `crate::ffi`.
+//!
+//! Mirrors `reconstruct_haplotype_from_sparse` in
+//! `python/genvarloader/_dataset/_genotypes.py:277-465` statement-by-statement.
+use ndarray::{s, ArrayView1, ArrayView2, ArrayViewMut1};
+use rayon::prelude::*;
+
+/// Reconstruct a single haplotype from reference sequence and variants.
+///
+/// Single-haplotype inner kernel. Mirror of numba
+/// `reconstruct_haplotype_from_sparse` (`_genotypes.py:277-465`).
+///
+/// # Parameters
+/// - `v_idxs`      – indices into the full variant table for this haplotype (i32)
+/// - `v_starts`    – genomic start position of each variant (i32, indexed by variant)
+/// - `ilens`       – insertion-length (ilen = alt_len − ref_len + 1) per variant (i32)
+/// - `shift`       – total amount to shift by (i64)
+/// - `alt_alleles` – packed ALT allele bytes for all variants (u8)
+/// - `alt_offsets` – byte offsets into `alt_alleles`; length = total_variants + 1 (i64)
+/// - `ref_`        – reference contig bytes (u8)
+/// - `ref_start`   – start position into the reference; may be negative (i64)
+/// - `out`         – output buffer to fill (u8, length = desired haplotype length)
+/// - `pad_char`    – byte used for padding where reference is unavailable
+/// - `keep`        – optional per-haplotype-variant mask; `None` means use all
+/// - `annot_v_idxs`  – optional annotation: variant index per output position (i32; -1 = ref/pad)
+/// - `annot_ref_pos` – optional annotation: reference position per output position (i32;
+///                     -1 = leading pad, i32::MAX = trailing pad)
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotype_from_sparse(
+    v_idxs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    shift: i64,
+    alt_alleles: ArrayView1<u8>,
+    alt_offsets: ArrayView1<i64>,
+    ref_: ArrayView1<u8>,
+    ref_start: i64,
+    mut out: ArrayViewMut1<u8>,
+    pad_char: u8,
+    keep: Option<ArrayView1<bool>>,
+    mut annot_v_idxs: Option<ArrayViewMut1<i32>>,
+    mut annot_ref_pos: Option<ArrayViewMut1<i32>>,
+) {
+    let length = out.len() as i64;
+    let n_variants = v_idxs.len();
+
+    // Hoist contiguous-slice pointers once so the hot loops use direct byte ops
+    // (fill/copy_from_slice) instead of ndarray's stride/do_slice dispatch path.
+    let out_flat: &mut [u8] = out.as_slice_mut().unwrap();
+    let ref_flat: &[u8] = ref_.as_slice().unwrap();
+    let alt_flat: &[u8] = alt_alleles.as_slice().unwrap();
+    let mut av_flat: Option<&mut [i32]> = annot_v_idxs.as_mut().and_then(|a| a.as_slice_mut());
+    let mut ap_flat: Option<&mut [i32]> = annot_ref_pos.as_mut().and_then(|a| a.as_slice_mut());
+
+    // where to get next reference subsequence
+    let mut ref_idx: i64 = ref_start;
+    // where to put next subsequence
+    let mut out_idx: i64 = 0;
+    // how much we've shifted
+    let mut shifted: i64 = 0;
+
+    // if ref_idx is negative, we need to pad the beginning of the haplotype
+    if ref_idx < 0 {
+        let pad_len_raw = -ref_idx;
+        shifted = shift.min(pad_len_raw);
+        let pad_len = pad_len_raw - shifted;
+        let s = out_idx as usize;
+        let e = (out_idx + pad_len) as usize;
+        out_flat[s..e].fill(pad_char);
+        if let Some(av) = av_flat.as_deref_mut() {
+            av[s..e].fill(-1);
+        }
+        if let Some(ap) = ap_flat.as_deref_mut() {
+            ap[s..e].fill(-1);
+        }
+        out_idx += pad_len;
+        ref_idx = 0;
+    }
+
+    'variants: for v in 0..n_variants {
+        if let Some(ref k) = keep {
+            if !k[v] {
+                continue;
+            }
+        }
+
+        let variant = v_idxs[v] as usize;
+        let v_pos = v_starts[variant] as i64;
+        let v_diff = ilens[variant] as i64;
+        let ao_s = alt_offsets[variant] as usize;
+        let ao_e = alt_offsets[variant + 1] as usize;
+        // full allele slice; may be sub-sliced below for shift consumption
+        let allele_full = &alt_flat[ao_s..ao_e];
+        let v_len_full = allele_full.len() as i64;
+        // +1 assumes atomized variants, exactly 1 nt shared between REF and ALT
+        let v_ref_end: i64 = v_pos - 0i64.min(v_diff) + 1;
+
+        // if variant is a DEL spanning start of query
+        if v_pos < ref_start && v_diff < 0 && v_ref_end >= ref_start {
+            ref_idx = v_ref_end;
+            continue;
+        }
+
+        // overlapping variants
+        // v_pos < ref_idx only if we see an ALT at a given position a second
+        // time or more. We'll do what bcftools consensus does and only use the
+        // first ALT variant we find.
+        if v_pos < ref_idx {
+            continue;
+        }
+
+        // handle shift
+        // allele_start_idx tracks how much of the allele to skip (0 by default)
+        let mut allele_start_idx: i64 = 0;
+        if shifted < shift {
+            let ref_shift_dist = v_pos - ref_idx;
+            // not enough distance to finish the shift even with the variant
+            if shifted + ref_shift_dist + v_len_full < shift {
+                // skip the variant
+                continue 'variants;
+            }
+            // enough distance between ref_idx and start of variant to finish shift
+            else if shifted + ref_shift_dist >= shift {
+                ref_idx += shift - shifted;
+                shifted = shift;
+                // can still use the variant and whatever ref is left between
+                // ref_idx and the variant
+            }
+            // ref + all or some of variant is enough to finish shift
+            else {
+                // how much left to shift - amount of ref we can use
+                allele_start_idx = shift - shifted - ref_shift_dist;
+                shifted = shift;
+                // enough dist with variant to complete shift
+                if allele_start_idx == v_len_full {
+                    // move ref to end of variant
+                    ref_idx = v_ref_end;
+                    // skip the variant
+                    continue 'variants;
+                }
+                // consume ref up to beginning of variant
+                // ref_idx will be moved to end of variant after using the variant
+                ref_idx = v_pos;
+                // adjust variant to start at allele_start_idx — done via offset below
+            }
+        }
+
+        // Working allele slice (may start at allele_start_idx after shift consumption)
+        let allele = &allele_full[allele_start_idx as usize..];
+        let v_len = allele.len() as i64;
+
+        // add reference sequence
+        let ref_len = v_pos - ref_idx;
+        if out_idx + ref_len >= length {
+            // ref will get written by final clause
+            // handles case where extraneous variants downstream of the haplotype were provided
+            break;
+        }
+        {
+            let os = out_idx as usize;
+            let oe = (out_idx + ref_len) as usize;
+            let rs = ref_idx as usize;
+            let re = (ref_idx + ref_len) as usize;
+            out_flat[os..oe].copy_from_slice(&ref_flat[rs..re]);
+            if let Some(av) = av_flat.as_deref_mut() {
+                av[os..oe].fill(-1);
+            }
+            if let Some(ap) = ap_flat.as_deref_mut() {
+                // arange(ref_idx, ref_idx + ref_len)
+                for (j, pos) in (os..oe).zip(rs..re) {
+                    ap[j] = pos as i32;
+                }
+            }
+        }
+        out_idx += ref_len;
+
+        // apply variant
+        let writable_length = v_len.min(length - out_idx);
+        {
+            let os = out_idx as usize;
+            let oe = (out_idx + writable_length) as usize;
+            out_flat[os..oe].copy_from_slice(&allele[..writable_length as usize]);
+            if let Some(av) = av_flat.as_deref_mut() {
+                av[os..oe].fill(variant as i32);
+            }
+            if let Some(ap) = ap_flat.as_deref_mut() {
+                ap[os..oe].fill(v_pos as i32);
+            }
+        }
+        out_idx += writable_length;
+
+        // advance ref_idx to end of variant
+        ref_idx = v_ref_end;
+
+        if out_idx >= length {
+            break;
+        }
+    }
+
+    if shifted < shift {
+        // need to shift the rest of the track
+        ref_idx += shift - shifted;
+        ref_idx = ref_idx.min(ref_flat.len() as i64);
+        shifted = shift;
+    }
+    let _ = shifted; // used above, silence unused-assign warning
+
+    // fill rest with reference sequence and right-pad with Ns
+    let unfilled_length = length - out_idx;
+    if unfilled_length > 0 {
+        // fill with reference sequence; when ref_idx is past the contig end,
+        // writable_ref <= 0 and the tail out[out_idx..length] is right-padded.
+        let writable_ref = unfilled_length.min(ref_flat.len() as i64 - ref_idx);
+        // Positive: copy ref bytes from ref_idx. Zero or negative: no-op.
+        let out_end_idx = if writable_ref > 0 {
+            let oe = out_idx + writable_ref;
+            let re = ref_idx + writable_ref;
+            {
+                let os = out_idx as usize;
+                let oe_u = oe as usize;
+                let rs = ref_idx as usize;
+                let re_u = re as usize;
+                out_flat[os..oe_u].copy_from_slice(&ref_flat[rs..re_u]);
+                if let Some(av) = av_flat.as_deref_mut() {
+                    av[os..oe_u].fill(-1);
+                }
+                if let Some(ap) = ap_flat.as_deref_mut() {
+                    for (j, pos) in (os..oe_u).zip(rs..re_u) {
+                        ap[j] = pos as i32;
+                    }
+                }
+            }
+            oe
+        } else {
+            // writable_ref <= 0: ref exhausted (ref_idx at/after contig end).
+            // No reference bytes remain to copy, so the entire unfilled tail
+            // out[out_idx..length] must be padded. Clamp out_end_idx to out_idx
+            // (NOT 0) so the right-pad below fills exactly out[out_idx..length]
+            // and never overwrites already-written positions.
+            out_idx
+        };
+
+        // right-pad
+        if out_end_idx < length {
+            let pe = length as usize;
+            let ps = out_end_idx as usize;
+            out_flat[ps..pe].fill(pad_char);
+            if let Some(av) = av_flat.as_deref_mut() {
+                av[ps..pe].fill(-1);
+            }
+            if let Some(ap) = ap_flat.as_deref_mut() {
+                ap[ps..pe].fill(i32::MAX);
+            }
+        }
+    }
+}
+
+/// Batch driver: reconstruct haplotypes for all (query, hap) pairs.
+///
+/// Mirrors `reconstruct_haplotypes_from_sparse` (plural) in
+/// `python/genvarloader/_dataset/_genotypes.py`.
+///
+/// # Parameters
+/// - `out` – flat output buffer, length = out_offsets[-1] (u8); written in place
+/// - `out_offsets` – shape (batch*ploidy + 1,) offsets into `out`
+/// - `regions` – shape (batch, 3) as (contig_idx, start, end) i32
+/// - `shifts` – shape (batch, ploidy) i32
+/// - `geno_offset_idx` – shape (batch, ploidy) i64 indices into geno_o_starts/stops
+/// - `geno_o_starts` – shape (n,) i64 — row(0) of normalized (2,n) geno_offsets
+/// - `geno_o_stops` – shape (n,) i64 — row(1) of normalized (2,n) geno_offsets
+/// - `geno_v_idxs` – flat sparse genotype variant indices i32
+/// - `v_starts` – variant genomic start positions i32
+/// - `ilens` – variant insertion lengths i32
+/// - `alt_alleles` – packed ALT allele bytes u8
+/// - `alt_offsets` – offsets into alt_alleles i64
+/// - `ref_` – packed reference bytes u8
+/// - `ref_offsets` – per-contig offsets into ref_ i64
+/// - `pad_char` – padding byte u8
+/// - `keep` – optional flat keep mask bool
+/// - `keep_offsets` – optional 1D (batch*ploidy + 1) offsets into keep i64
+/// - `annot_v_idxs` – optional annotation output i32 (same layout as out)
+/// - `annot_ref_pos` – optional annotation output i32 (same layout as out)
+/// - `parallel` – if true, use rayon to process work items concurrently
+#[allow(clippy::too_many_arguments)]
+pub fn reconstruct_haplotypes_from_sparse(
+    mut out: ArrayViewMut1<u8>,
+    out_offsets: ArrayView1<i64>,
+    regions: ArrayView2<i32>,
+    shifts: ArrayView2<i32>,
+    geno_offset_idx: ArrayView2<i64>,
+    geno_o_starts: ArrayView1<i64>,
+    geno_o_stops: ArrayView1<i64>,
+    geno_v_idxs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    alt_alleles: ArrayView1<u8>,
+    alt_offsets: ArrayView1<i64>,
+    ref_: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+    keep: Option<ArrayView1<bool>>,
+    keep_offsets: Option<ArrayView1<i64>>,
+    mut annot_v_idxs: Option<ArrayViewMut1<i32>>,
+    mut annot_ref_pos: Option<ArrayViewMut1<i32>>,
+    parallel: bool,
+) {
+    let batch_size = regions.nrows();
+    let ploidy = shifts.ncols();
+    let n_work = batch_size * ploidy;
+
+    // Per-k inner work: given disjoint output slices, call the single-haplotype kernel.
+    // All read-only ArrayViews are Send+Sync so the closure can borrow them freely.
+    let do_work = |k: usize,
+                   out_view: ArrayViewMut1<u8>,
+                   av_view: Option<ArrayViewMut1<i32>>,
+                   ap_view: Option<ArrayViewMut1<i32>>| {
+        let query = k / ploidy;
+        let hap = k % ploidy;
+
+        // geno slice for this (query, hap)
+        let o_idx = geno_offset_idx[[query, hap]] as usize;
+        let o_s = geno_o_starts[o_idx] as usize;
+        let o_e = geno_o_stops[o_idx] as usize;
+        let qh_v_idxs = geno_v_idxs.slice(s![o_s..o_e]);
+
+        // keep slice
+        let qh_keep: Option<ArrayView1<bool>> =
+            if let (Some(ref k_arr), Some(ref ko)) = (&keep, &keep_offsets) {
+                let ks = ko[k] as usize;
+                let ke = ko[k + 1] as usize;
+                Some(k_arr.slice(s![ks..ke]))
+            } else {
+                None
+            };
+
+        // region info
+        let c_idx = regions[[query, 0]] as usize;
+        let c_s = ref_offsets[c_idx] as usize;
+        let c_e = ref_offsets[c_idx + 1] as usize;
+        let contig_ref = ref_.slice(s![c_s..c_e]);
+        let ref_start = regions[[query, 1]] as i64;
+        let shift = shifts[[query, hap]] as i64;
+
+        reconstruct_haplotype_from_sparse(
+            qh_v_idxs,
+            v_starts,
+            ilens,
+            shift,
+            alt_alleles,
+            alt_offsets,
+            contig_ref,
+            ref_start,
+            out_view,
+            pad_char,
+            qh_keep,
+            av_view,
+            ap_view,
+        );
+    };
+
+    if parallel {
+        // Build disjoint per-k mutable slices for all active buffers using the
+        // proven split_at_mut chain idiom (mirrors get_reference in reference/mod.rs).
+        // &mut [_] slices are Send, unlike raw *mut pointers — safe for rayon closures.
+        let bounds: Vec<(usize, usize)> = (0..n_work)
+            .map(|k| (out_offsets[k] as usize, out_offsets[k + 1] as usize))
+            .collect();
+
+        let out_slice = out.as_slice_mut().unwrap();
+        let mut out_chunks: Vec<&mut [u8]> = Vec::with_capacity(n_work);
+        {
+            let mut rest = &mut out_slice[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &bounds {
+                // Contract: `out_offsets` is monotonically non-decreasing, so each
+                // work item's range starts at or after the previous one's end. This
+                // guarantees `s - cursor` does not underflow and the carved slices
+                // are disjoint. The same `bounds` drives the annotation carves below.
+                debug_assert!(
+                    s >= cursor && e >= s,
+                    "out_offsets must be monotonically non-decreasing (got s={s}, e={e}, cursor={cursor})"
+                );
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                out_chunks.push(mid);
+                rest = tail2;
+                cursor = e;
+            }
+        }
+
+        // Carve annotation buffers only when they are Some.
+        let av_chunks: Option<Vec<&mut [i32]>> = annot_v_idxs.as_mut().map(|av| {
+            let av_slice = av.as_slice_mut().unwrap();
+            let mut chunks: Vec<&mut [i32]> = Vec::with_capacity(n_work);
+            let mut rest = &mut av_slice[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &bounds {
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                chunks.push(mid);
+                rest = tail2;
+                cursor = e;
+            }
+            chunks
+        });
+
+        let ap_chunks: Option<Vec<&mut [i32]>> = annot_ref_pos.as_mut().map(|ap| {
+            let ap_slice = ap.as_slice_mut().unwrap();
+            let mut chunks: Vec<&mut [i32]> = Vec::with_capacity(n_work);
+            let mut rest = &mut ap_slice[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &bounds {
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                chunks.push(mid);
+                rest = tail2;
+                cursor = e;
+            }
+            chunks
+        });
+
+        // Zip all chunk vecs and dispatch in parallel.
+        // Handle the four combinations of av/ap presence.
+        match (av_chunks, ap_chunks) {
+            (Some(avc), Some(apc)) => {
+                out_chunks
+                    .into_par_iter()
+                    .zip(avc.into_par_iter())
+                    .zip(apc.into_par_iter())
+                    .enumerate()
+                    .for_each(|(k, ((out_chunk, av_chunk), ap_chunk))| {
+                        do_work(
+                            k,
+                            ArrayViewMut1::from(out_chunk),
+                            Some(ArrayViewMut1::from(av_chunk)),
+                            Some(ArrayViewMut1::from(ap_chunk)),
+                        );
+                    });
+            }
+            (Some(avc), None) => {
+                out_chunks
+                    .into_par_iter()
+                    .zip(avc.into_par_iter())
+                    .enumerate()
+                    .for_each(|(k, (out_chunk, av_chunk))| {
+                        do_work(
+                            k,
+                            ArrayViewMut1::from(out_chunk),
+                            Some(ArrayViewMut1::from(av_chunk)),
+                            None,
+                        );
+                    });
+            }
+            (None, Some(apc)) => {
+                out_chunks
+                    .into_par_iter()
+                    .zip(apc.into_par_iter())
+                    .enumerate()
+                    .for_each(|(k, (out_chunk, ap_chunk))| {
+                        do_work(
+                            k,
+                            ArrayViewMut1::from(out_chunk),
+                            None,
+                            Some(ArrayViewMut1::from(ap_chunk)),
+                        );
+                    });
+            }
+            (None, None) => {
+                out_chunks
+                    .into_par_iter()
+                    .enumerate()
+                    .for_each(|(k, out_chunk)| {
+                        do_work(k, ArrayViewMut1::from(out_chunk), None, None);
+                    });
+            }
+        }
+    } else {
+        // Serial path: use raw pointers for disjoint sub-range access, exactly as before.
+        // The serial loop prevents concurrent aliasing.
+        let out_raw: *mut u8 = out.as_mut_ptr();
+        let av_raw: Option<*mut i32> = annot_v_idxs.as_mut().map(|a| a.as_mut_ptr());
+        let ap_raw: Option<*mut i32> = annot_ref_pos.as_mut().map(|a| a.as_mut_ptr());
+
+        for k in 0..n_work {
+            let out_s = out_offsets[k] as usize;
+            let out_e = out_offsets[k + 1] as usize;
+
+            // SAFETY: `out_offsets` is required by the calling contract to be monotonically
+            // non-decreasing, so consecutive (out_s, out_e) pairs are strictly non-overlapping
+            // address ranges within the same allocation.  Because the loop is serial there are
+            // no concurrent borrows, so constructing a `&mut [u8]` from each disjoint sub-range
+            // is free of aliasing UB.
+            let out_chunk =
+                unsafe { std::slice::from_raw_parts_mut(out_raw.add(out_s), out_e - out_s) };
+            let out_view = ArrayViewMut1::from(out_chunk);
+
+            // SAFETY: same invariant as out_chunk — `out_offsets` non-decreasing guarantees
+            // each [out_s..out_e] is a disjoint sub-range; serial loop prevents concurrent
+            // aliasing.
+            let av_view: Option<ArrayViewMut1<i32>> = av_raw.map(|p| {
+                let chunk = unsafe {
+                    std::slice::from_raw_parts_mut(p.add(out_s), out_e - out_s)
+                };
+                ArrayViewMut1::from(chunk)
+            });
+
+            // SAFETY: same invariant as out_chunk — `out_offsets` non-decreasing guarantees
+            // each [out_s..out_e] is a disjoint sub-range; serial loop prevents concurrent
+            // aliasing.
+            let ap_view: Option<ArrayViewMut1<i32>> = ap_raw.map(|p| {
+                let chunk = unsafe {
+                    std::slice::from_raw_parts_mut(p.add(out_s), out_e - out_s)
+                };
+                ArrayViewMut1::from(chunk)
+            });
+
+            do_work(k, out_view, av_view, ap_view);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::{arr1, Array1};
+
+    /// Helper: run the kernel and return (out, annot_v_idxs, annot_ref_pos)
+    fn run(
+        v_idxs: &[i32],
+        v_starts: &[i32],
+        ilens: &[i32],
+        shift: i64,
+        alt_alleles: &[u8],
+        alt_offsets: &[i64],
+        ref_: &[u8],
+        ref_start: i64,
+        out_len: usize,
+        pad_char: u8,
+        keep: Option<&[bool]>,
+        annotate: bool,
+    ) -> (Vec<u8>, Vec<i32>, Vec<i32>) {
+        let mut out = Array1::<u8>::from_elem(out_len, pad_char);
+        let mut av = Array1::<i32>::from_elem(out_len, 0i32);
+        let mut ap = Array1::<i32>::from_elem(out_len, 0i32);
+
+        let keep_arr: Option<Array1<bool>> = keep.map(|k| arr1(k));
+
+        if annotate {
+            reconstruct_haplotype_from_sparse(
+                arr1(v_idxs).view(),
+                arr1(v_starts).view(),
+                arr1(ilens).view(),
+                shift,
+                arr1(alt_alleles).view(),
+                arr1(alt_offsets).view(),
+                arr1(ref_).view(),
+                ref_start,
+                out.view_mut(),
+                pad_char,
+                keep_arr.as_ref().map(|k| k.view()),
+                Some(av.view_mut()),
+                Some(ap.view_mut()),
+            );
+        } else {
+            reconstruct_haplotype_from_sparse(
+                arr1(v_idxs).view(),
+                arr1(v_starts).view(),
+                arr1(ilens).view(),
+                shift,
+                arr1(alt_alleles).view(),
+                arr1(alt_offsets).view(),
+                arr1(ref_).view(),
+                ref_start,
+                out.view_mut(),
+                pad_char,
+                keep_arr.as_ref().map(|k| k.view()),
+                None,
+                None,
+            );
+        }
+        (out.to_vec(), av.to_vec(), ap.to_vec())
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 1: no variants, shift=0, in-bounds
+    // ref = [10,20,30,40,50], ref_start=1, out_len=3 → [20,30,40]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn no_variants_shift0_in_bounds() {
+        let (out, _av, _ap) = run(
+            &[],     // v_idxs
+            &[],     // v_starts (indexed by variant)
+            &[],     // ilens
+            0,       // shift
+            &[],     // alt_alleles
+            &[0i64], // alt_offsets (1 sentinel for 0 variants)
+            &[10, 20, 30, 40, 50],
+            1,  // ref_start
+            3,  // out_len
+            0,  // pad_char
+            None,
+            false,
+        );
+        assert_eq!(out, vec![20, 30, 40]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 2: negative ref_start → leading pad, annot_ref_pos == -1 over the pad
+    // ref = [1,2,3,4,5], ref_start=-2, out_len=5, pad=9
+    // → [9,9,1,2,3], annot_ref_pos over pad = [-1,-1,0,1,2]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn negative_ref_start_leading_pad() {
+        let (out, av, ap) = run(
+            &[],
+            &[],
+            &[],
+            0,
+            &[],
+            &[0i64],
+            &[1, 2, 3, 4, 5],
+            -2, // ref_start
+            5,
+            9,
+            None,
+            true,
+        );
+        assert_eq!(out, vec![9, 9, 1, 2, 3]);
+        assert_eq!(&av[..2], &[-1i32, -1]);
+        assert_eq!(&ap[..2], &[-1i32, -1], "leading pad annot_ref_pos must be -1");
+        assert_eq!(&ap[2..], &[0i32, 1, 2]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 3: single SNP (ilen=0)
+    // ref   = [A,C,G,T,A] = [65,67,71,84,65], ref_start=0, out_len=5
+    // variant 0: pos=2, ilen=0, allele=[84] (T replaces G)
+    // v_idxs=[0], v_starts=[2], ilens=[0], alt_alleles=[84], alt_offsets=[0,1]
+    // expected out: [65,67,84,84,65]  (ref_end = 2 - min(0,0) + 1 = 3)
+    // -------------------------------------------------------------------------
+    #[test]
+    fn single_snp() {
+        // ref: A C G T A (positions 0..5)
+        // variant at pos=2 (G→T), ilen=0 → v_ref_end = 2 - 0 + 1 = 3
+        // out: A C [T] T A
+        let (out, av, _ap) = run(
+            &[0],        // v_idxs: only variant 0
+            &[2],        // v_starts: variant 0 is at pos 2
+            &[0],        // ilens: SNP, no length change
+            0,           // shift
+            &[84u8],     // alt_alleles: T
+            &[0i64, 1],  // alt_offsets
+            &[65, 67, 71, 84, 65], // A C G T A
+            0,           // ref_start
+            5,
+            0,
+            None,
+            true,
+        );
+        // ref[0..2]=AC, allele T, ref[3..5]=TA
+        assert_eq!(out, vec![65, 67, 84, 84, 65]);
+        // annot_v_idxs: [-1,-1, 0, -1,-1]
+        assert_eq!(av, vec![-1, -1, 0, -1, -1]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 4: 2bp insertion (ilen=+2)
+    // ref = [1,2,3,4,5], ref_start=0, out_len=5
+    // variant at pos=2, ilen=+2, allele=[10,11,12] (3 bytes: REF anchor + 2 inserted)
+    // v_ref_end = 2 - min(0,+2) + 1 = 3
+    // Processing: ref[0..2]=[1,2], allele=[10,11,12] → 3 bytes, but out only has 1 slot left
+    //   after 2 ref bytes → writes 3 bytes clipped to min(3, 5-2)=3: [10,11,12]
+    //   out = [1,2,10,11,12]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn two_bp_insertion() {
+        let (out, _av, _ap) = run(
+            &[0],
+            &[2],        // variant 0 at pos 2
+            &[2],        // ilen=+2
+            0,
+            &[10u8, 11, 12],
+            &[0i64, 3],
+            &[1, 2, 3, 4, 5],
+            0,
+            5,
+            0,
+            None,
+            false,
+        );
+        // ref[0..2]=[1,2], allele[0..3]=[10,11,12] (writable_length=min(3,3)=3)
+        // v_ref_end=3, out_idx=5, break. Final clause: unfilled=0.
+        assert_eq!(out, vec![1, 2, 10, 11, 12]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 5: deletion (ilen=-2)
+    // ref = [1,2,3,4,5,6,7], ref_start=0, out_len=5
+    // variant at pos=2, ilen=-2, allele=[30] (1 byte, anchor only)
+    // v_ref_end = 2 - min(0,-2) + 1 = 2+2+1 = 5
+    // Processing: ref[0..2]=[1,2], allele=[30] (1 byte), ref_idx→5
+    //   remaining ref[5..7]=[6,7], out=[1,2,30,6,7]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn deletion() {
+        let (out, _av, _ap) = run(
+            &[0],
+            &[2],        // variant 0 at pos 2
+            &[-2],       // ilen=-2
+            0,
+            &[30u8],     // anchor allele byte
+            &[0i64, 1],
+            &[1, 2, 3, 4, 5, 6, 7],
+            0,
+            5,
+            0,
+            None,
+            false,
+        );
+        // ref[0..2]=[1,2], allele=[30], ref_idx→5, then ref[5..7]=[6,7]
+        assert_eq!(out, vec![1, 2, 30, 6, 7]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 6: DEL spanning ref_start
+    // ref = [1,2,3,4,5,6,7], ref_start=3
+    // variant: v_pos=1, ilen=-3, allele=[99]
+    //   v_ref_end = 1 - min(0,-3) + 1 = 1+3+1 = 5
+    //   condition: v_pos(1) < ref_start(3), v_diff(-3) < 0, v_ref_end(5) >= ref_start(3)
+    //   → ref_idx = 5, continue
+    // Then final clause fills ref[5..7]=[6,7] + right-pad
+    // out_len=5: ref[5..7]→[6,7], right-pad [0,0,0]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn del_spanning_ref_start() {
+        let (out, _av, ap) = run(
+            &[0],
+            &[1],        // v_pos=1
+            &[-3],       // ilen=-3
+            0,
+            &[99u8],
+            &[0i64, 1],
+            &[1, 2, 3, 4, 5, 6, 7],
+            3,           // ref_start=3
+            5,
+            0,
+            None,
+            true,
+        );
+        // ref_idx set to 5. Final: ref[5..7]=[6,7], pad [0,0]
+        assert_eq!(out, vec![6, 7, 0, 0, 0]);
+        // trailing pad annot_ref_pos must be i32::MAX
+        assert_eq!(&ap[2..], &[i32::MAX, i32::MAX, i32::MAX]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case: deletion drives ref_idx past the contig end (overshoot).
+    // ref = [1,2,3,4] (len 4), ref_start=0, out_len=8.
+    // variant at pos=2, ilen=-5, allele=[50] (anchor).
+    //   v_ref_end = 2 - min(0,-5) + 1 = 8  → ref_idx advances to 8 (> len 4).
+    // Processing: ref[0..2]=[1,2], allele=[50] → out_idx=3.
+    // Final clause: unfilled=5, ref exhausted (writable_ref = min(5, 4-8) = -4 <= 0).
+    // CORRECT: no ref left → pad the whole tail → [1,2,50,0,0,0,0,0].
+    // (Pre-fix rust over-pads from index 0 → all zeros.)
+    // -------------------------------------------------------------------------
+    #[test]
+    fn overshoot_ref_past_contig() {
+        let (out, _av, _ap) = run(
+            &[0],
+            &[2],          // v_pos=2
+            &[-5],         // ilen=-5 (deletion past contig end)
+            0,             // shift
+            &[50u8],       // anchor allele
+            &[0i64, 1],
+            &[1, 2, 3, 4], // ref, len 4
+            0,             // ref_start
+            8,             // out_len
+            0,             // pad_char
+            None,
+            false,
+        );
+        assert_eq!(out, vec![1, 2, 50, 0, 0, 0, 0, 0]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 7: overlapping ALTs — only first applied
+    // ref = [1,2,3,4,5], ref_start=0, out_len=5
+    // v_idxs=[0,1]: two variants both at pos=2, but second has v_pos < ref_idx after first
+    // variant 0: pos=2, ilen=0, allele=[20]
+    // variant 1: pos=2, ilen=0, allele=[30] — overlapping, must be skipped
+    // expected: [1,2,20,4,5]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn overlapping_alts_first_applied() {
+        let (out, _av, _ap) = run(
+            &[0, 1],     // v_idxs: variants 0 then 1
+            &[2, 2],     // both at pos=2
+            &[0, 0],     // both SNPs
+            0,
+            &[20u8, 30], // alleles: 20 and 30
+            &[0i64, 1, 2],
+            &[1, 2, 3, 4, 5],
+            0,
+            5,
+            0,
+            None,
+            false,
+        );
+        // First: ref[0..2]=[1,2], allele=[20], ref_idx→3
+        // Second: v_pos=2 < ref_idx=3 → skip
+        // Final: ref[3..5]=[4,5]
+        assert_eq!(out, vec![1, 2, 20, 4, 5]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 8: shift consumed partly by ref + partly by allele
+    // ref = [1,2,3,4,5,6,7,8], ref_start=0, shift=4, out_len=4
+    // variant 0: pos=3, ilen=0, allele=[99] (SNP at pos 3)
+    //   shifted=0, ref_shift_dist=3-0=3, v_len=1
+    //   shifted+ref_shift_dist+v_len = 0+3+1=4 == shift=4  → NOT < 4
+    //   shifted+ref_shift_dist=3 < shift=4 → "else" branch
+    //   allele_start_idx = 4 - 0 - 3 = 1
+    //   allele_start_idx(1) == v_len(1) → ref_idx=v_ref_end=4, continue
+    // After loop: shifted(0) < shift(4) → ref_idx += 4-0=4 → ref_idx=8, min(8,8)=8
+    // Final: writable_ref = min(4, 8-8)=0, out=[pad,pad,pad,pad] → all 0
+    // Wait: after the early-continue in shift branch, ref_idx=4 (not 0).
+    // Let me re-trace: shifted=0, ref_idx=0, v_pos=3
+    //   allele_start_idx = shift(4) - shifted(0) - ref_shift_dist(3) = 1
+    //   allele_start_idx(1) == v_len(1) → ref_idx = v_ref_end = 4, continue
+    // After loop: shifted(0) < shift(4) → ref_idx=4+(4-0)=8, min(8,8)=8
+    // Final: unfilled=4, writable_ref=min(4, 8-8)=0 → all pad
+    // Better test: shift=3, variant at pos=5, allele=[99,88] (2 bytes, ilen=+1)
+    //   ref_shift_dist=5, shifted+ref_shift_dist=5 >= shift=3 → first elif
+    //   ref_idx += 3-0=3 → ref_idx=3, shifted=3
+    //   Then ref[3..5]=[4,5], allele=[99,88], ref[7..8]=[8]
+    //   out_len=4: ref[3..5]=[4,5] (2 bytes), allele=[99,88] (2 bytes) → [4,5,99,88]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn shift_consumed_partly_ref_partly_allele() {
+        // shift=2, ref=[1,2,3,4,5,6], ref_start=0, variant at pos=3, allele=[99,88] (ilen=+1)
+        // ref_shift_dist = 3-0 = 3, shifted+ref_shift_dist+v_len = 0+3+2 = 5 >= shift=2
+        // shifted+ref_shift_dist = 3 >= shift=2 → ref_idx += 2-0=2 → ref_idx=2
+        // ref[2..3]=[3], allele=[99,88], ref[4..6]=[5,6]
+        // out_len=5: [3, 99, 88, 5, 6]
+        let (out, _av, _ap) = run(
+            &[0],
+            &[3],        // v_pos=3
+            &[1],        // ilen=+1
+            2,           // shift=2
+            &[99u8, 88],
+            &[0i64, 2],
+            &[1, 2, 3, 4, 5, 6],
+            0,
+            5,
+            0,
+            None,
+            false,
+        );
+        // ref_idx=2 after shift, ref[2..3]=[3], allele=[99,88], v_ref_end=4, ref[4..6]=[5,6]
+        assert_eq!(out, vec![3, 99, 88, 5, 6]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 8b: shift partly consumed by allele itself (allele_start_idx < v_len)
+    // shift=4, ref=[1,2,3,4,5,6,7,8], ref_start=0, out_len=4
+    // variant at pos=3, ilen=+1, allele=[99,88] (2 bytes)
+    //   ref_shift_dist=3, shifted+ref_shift_dist+v_len = 0+3+2=5 >= shift=4
+    //   shifted+ref_shift_dist=3 < shift=4 → else branch
+    //   allele_start_idx = 4-0-3 = 1
+    //   allele_start_idx(1) != v_len(2) → ref_idx=v_pos=3, allele=allele[1:]=[88]
+    //   ref_len = v_pos(3) - ref_idx(3) = 0 (no ref before variant)
+    //   allele=[88] writable_length=min(1,4)=1
+    //   ref_idx → v_ref_end=4
+    //   Final: ref[4..8]=[5,6,7,8], out=[88,5,6,7]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn shift_partly_consumed_by_allele() {
+        let (out, _av, _ap) = run(
+            &[0],
+            &[3],
+            &[1],        // ilen=+1, allele 2 bytes
+            4,           // shift=4
+            &[99u8, 88],
+            &[0i64, 2],
+            &[1, 2, 3, 4, 5, 6, 7, 8],
+            0,
+            4,
+            0,
+            None,
+            false,
+        );
+        // allele starts at index 1: [88], then ref[4..8]=[5,6,7,8] → [88,5,6,7]
+        assert_eq!(out, vec![88, 5, 6, 7]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 9: right-pad clause
+    // ref = [1,2,3], ref_start=0, out_len=6, no variants
+    // → ref fills [1,2,3], then pad [0,0,0]
+    // trailing annot_ref_pos = i32::MAX
+    // -------------------------------------------------------------------------
+    #[test]
+    fn right_pad_clause() {
+        let (out, av, ap) = run(
+            &[],
+            &[],
+            &[],
+            0,
+            &[],
+            &[0i64],
+            &[1, 2, 3],
+            0,
+            6,
+            0,
+            None,
+            true,
+        );
+        assert_eq!(out, vec![1, 2, 3, 0, 0, 0]);
+        // ref portion: annot_v_idxs=-1, annot_ref_pos=[0,1,2]
+        assert_eq!(&av[..3], &[-1i32, -1, -1]);
+        assert_eq!(&ap[..3], &[0i32, 1, 2]);
+        // trailing pad: annot_v_idxs=-1, annot_ref_pos=i32::MAX
+        assert_eq!(&av[3..], &[-1i32, -1, -1]);
+        assert_eq!(
+            &ap[3..],
+            &[i32::MAX, i32::MAX, i32::MAX],
+            "trailing pad annot_ref_pos must be i32::MAX"
+        );
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 11: allele_start_idx == v_len → early-continue branch
+    //
+    // Exercises numba _genotypes.py:390-401 / Rust mod.rs:121-131:
+    //   the "else" shift sub-branch where allele_start_idx == v_len, causing
+    //   ref_idx to advance to v_ref_end and the variant to be skipped.
+    //
+    // Hand-derivation:
+    //   ref = [1..8], ref_start=0, shift=4, out_len=4
+    //   SNP at v_pos=3, ilen=0, allele=[88] (v_len=1)
+    //   --- shift handling (shifted=0 < shift=4) ---
+    //   ref_shift_dist = v_pos - ref_idx = 3 - 0 = 3
+    //   check 1: shifted + ref_shift_dist + v_len = 0+3+1 = 4  → NOT < 4, skip
+    //   check 2: shifted + ref_shift_dist = 3                  → NOT >= 4, skip
+    //   else: allele_start_idx = shift - shifted - ref_shift_dist = 4-0-3 = 1
+    //         shifted = 4  (numba:391 / Rust:124)
+    //         allele_start_idx(1) == v_len(1)                  → TRUE
+    //         ref_idx = v_ref_end = 3 - min(0,0) + 1 = 4
+    //         continue  (numba:397-401 / Rust:126-130)
+    //   --- after loop ---
+    //   shifted(4) == shift(4) → no extra advance
+    //   Final fill: ref_idx=4, unfilled=4, writable_ref=min(4,8-4)=4
+    //   out = ref[4..8] = [5,6,7,8]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn allele_start_idx_eq_v_len_continue() {
+        let (out, _av, _ap) = run(
+            &[0],               // v_idxs: only variant 0
+            &[3],               // v_starts: variant 0 at pos 3
+            &[0],               // ilens: SNP, ilen=0
+            4,                  // shift=4
+            &[88u8],            // alt_allele
+            &[0i64, 1],         // alt_offsets
+            &[1, 2, 3, 4, 5, 6, 7, 8],
+            0,                  // ref_start
+            4,                  // out_len
+            0,                  // pad_char
+            None,
+            false,
+        );
+        // allele_start_idx(1) == v_len(1): variant skipped, ref_idx→4
+        // shifted=4 after continue, no further shift; final fills ref[4..8]=[5,6,7,8]
+        assert_eq!(out, vec![5, 6, 7, 8]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 12: skip_variant_not_enough_distance
+    //
+    // Exercises numba _genotypes.py:377-380 / Rust mod.rs:108-112:
+    //   the "not enough distance" branch where shifted + ref_shift_dist + v_len < shift,
+    //   causing the variant to be skipped entirely without advancing ref_idx.
+    //
+    // Hand-derivation:
+    //   ref = [1..15], ref_start=0, shift=10, out_len=3
+    //   SNP at v_pos=3, ilen=0, allele=[77] (v_len=1)
+    //   --- shift handling (shifted=0 < shift=10) ---
+    //   ref_shift_dist = v_pos - ref_idx = 3 - 0 = 3
+    //   check 1: shifted + ref_shift_dist + v_len = 0+3+1 = 4 < 10  → TRUE
+    //            continue  (numba:379-380 / Rust:110-112)
+    //   --- after loop ---
+    //   shifted(0) < shift(10) → ref_idx += 10-0 = 10, min(10,15)=10, shifted=10
+    //   Final fill: ref_idx=10, unfilled=3, writable_ref=min(3,15-10)=3
+    //   out = ref[10..13] = [11,12,13]
+    // -------------------------------------------------------------------------
+    #[test]
+    fn skip_variant_not_enough_distance() {
+        let ref_: Vec<u8> = (1u8..=15).collect();
+        let (out, _av, _ap) = run(
+            &[0],               // v_idxs: only variant 0
+            &[3],               // v_starts: variant 0 at pos 3
+            &[0],               // ilens: SNP, ilen=0
+            10,                 // shift=10
+            &[77u8],            // alt_allele (never used)
+            &[0i64, 1],         // alt_offsets
+            &ref_,
+            0,                  // ref_start
+            3,                  // out_len
+            0,                  // pad_char
+            None,
+            false,
+        );
+        // variant skipped (0+3+1=4 < 10); after loop ref_idx=10; final fills [11,12,13]
+        assert_eq!(out, vec![11, 12, 13]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 13: keep_mask_excludes_variant
+    //
+    // Exercises numba _genotypes.py:351-352 / Rust mod.rs:72-75:
+    //   keep=[false, true] so variant 0 is skipped and variant 1 is applied.
+    //
+    // Hand-derivation:
+    //   ref = [1,2,3,4,5], ref_start=0, shift=0, out_len=5
+    //   variant 0: pos=1, ilen=0, allele=[55]
+    //   variant 1: pos=3, ilen=0, allele=[99]
+    //   keep = [false, true]
+    //   --- v=0: keep[0]=false → continue (skipped entirely) ---
+    //   --- v=1: keep[1]=true → process ---
+    //   ref_len = v_pos(3) - ref_idx(0) = 3 → write ref[0..3]=[1,2,3]
+    //   allele=[99], writable_length=1 → write 99, out_idx=4
+    //   ref_idx = v_ref_end = 3 - min(0,0) + 1 = 4
+    //   Final fill: ref_idx=4, unfilled=1, writable_ref=min(1,5-4)=1
+    //   out[4] = ref[4] = 5
+    //   out = [1,2,3,99,5]
+    //   variant 0 (at pos 1, allele 55) NOT applied; variant 1 IS applied at pos 3.
+    // -------------------------------------------------------------------------
+    #[test]
+    fn keep_mask_excludes_variant() {
+        let (out, av, _ap) = run(
+            &[0, 1],            // v_idxs: variants 0 and 1
+            &[1, 3],            // v_starts: variant 0 at pos 1, variant 1 at pos 3
+            &[0, 0],            // ilens: both SNPs
+            0,                  // shift=0
+            &[55u8, 99],        // alleles: 55 for v0, 99 for v1
+            &[0i64, 1, 2],      // alt_offsets
+            &[1, 2, 3, 4, 5],
+            0,                  // ref_start
+            5,                  // out_len
+            0,                  // pad_char
+            Some(&[false, true]), // keep mask: skip v0, apply v1
+            true,               // annotate
+        );
+        // variant 0 (pos=1, allele=55) excluded by keep mask: ref[1] NOT replaced
+        // variant 1 (pos=3, allele=99) applied: ref[3] replaced by 99
+        assert_eq!(out, vec![1, 2, 3, 99, 5]);
+        // annot_v_idxs: positions 0..3 are ref (-1), position 3 is variant 1, position 4 is ref (-1)
+        assert_eq!(av, vec![-1, -1, -1, 1, -1]);
+    }
+
+    // -------------------------------------------------------------------------
+    // Case 10: annotated vs non-annotated produce identical out bytes
+    // ref = [1,2,3,4,5], ref_start=0, variant at pos=2 (SNP)
+    // -------------------------------------------------------------------------
+    #[test]
+    fn annotated_vs_non_annotated_identical_out() {
+        let params = (
+            &[0i32][..],   // v_idxs
+            &[2i32][..],   // v_starts
+            &[0i32][..],   // ilens
+            0i64,          // shift
+            &[77u8][..],   // alt_alleles
+            &[0i64, 1][..],// alt_offsets
+            &[1u8,2,3,4,5][..], // ref_
+            0i64,          // ref_start
+            5usize,        // out_len
+            0u8,           // pad_char
+        );
+        let (out_annot, _, _) = run(
+            params.0, params.1, params.2, params.3,
+            params.4, params.5, params.6, params.7,
+            params.8, params.9, None, true,
+        );
+        let (out_plain, _, _) = run(
+            params.0, params.1, params.2, params.3,
+            params.4, params.5, params.6, params.7,
+            params.8, params.9, None, false,
+        );
+        assert_eq!(out_annot, out_plain, "annotated and non-annotated must produce identical out bytes");
+    }
+
+    #[test]
+    fn batch_correctness_two_queries() {
+        // Correctness check for the batch driver: 2 queries × 1 haplotype, no variants.
+        // The batch driver is intentionally serial-only: parity is this phase's only gate
+        // (throughput is recorded, not gated); the rayon parallel path is deferred to the
+        // throughput/fusion optimization pass.  The out/annotation buffers are written by
+        // disjoint per-(query,hap) slices, so this loop is rayon-parallelizable later via
+        // the same disjoint-chunk split used in src/reference/mod.rs get_reference.
+        // Expected: each out chunk is just the corresponding ref slice.
+        let reference = b"ACGTACGTACGT";
+        let ref_ = arr1(reference.as_ref());
+        let ref_offsets = arr1(&[0i64, 12]);
+        let v_starts = arr1::<i32>(&[]);
+        let ilens = arr1::<i32>(&[]);
+        let alt_alleles = arr1::<u8>(&[]);
+        let alt_offsets = arr1(&[0i64]);
+        // Two regions: [0,4) and [4,8) on contig 0
+        let regions = ndarray::arr2(&[[0i32, 0, 4], [0, 4, 8]]);
+        let shifts = ndarray::arr2(&[[0i32], [0]]);
+        let geno_offset_idx = ndarray::arr2(&[[0i64], [1]]);
+        let geno_o_starts = arr1(&[0i64, 0]);
+        let geno_o_stops = arr1(&[0i64, 0]);
+        let geno_v_idxs = arr1::<i32>(&[]);
+        let out_offsets = arr1(&[0i64, 4, 8]);
+        let pad_char = b'N';
+
+        let mut out = ndarray::Array1::<u8>::from_elem(8, pad_char);
+        super::reconstruct_haplotypes_from_sparse(
+            out.view_mut(),
+            out_offsets.view(),
+            regions.view(),
+            shifts.view(),
+            geno_offset_idx.view(),
+            geno_o_starts.view(),
+            geno_o_stops.view(),
+            geno_v_idxs.view(),
+            v_starts.view(),
+            ilens.view(),
+            alt_alleles.view(),
+            alt_offsets.view(),
+            ref_.view(),
+            ref_offsets.view(),
+            pad_char,
+            None,
+            None,
+            None,
+            None,
+            false,
+        );
+
+        assert_eq!(&out.as_slice().unwrap()[0..4], b"ACGT", "first region");
+        assert_eq!(&out.as_slice().unwrap()[4..8], b"ACGT", "second region");
+    }
+
+    #[test]
+    fn batch_correctness_with_snp() {
+        // Correctness check for the batch driver with a SNP to exercise the
+        // variant-application path (not just reference-copy).
+        // Reference: "ACGTACGT" (8 bp, contig 0)
+        // Two regions: [0,4) and [4,8).
+        // One SNP at ref position 1 (C→T), present in haplotype 0 of query 0 only.
+        // Expected region 0: "ATGT" (SNP applied), region 1: "ACGT" (no variant).
+        let reference = b"ACGTACGT";
+        let ref_ = arr1(reference.as_ref());
+        let ref_offsets = arr1(&[0i64, 8]);
+
+        // One SNP: position 1, iLen 0 (substitution), alt allele b'T'
+        let v_starts = arr1::<i32>(&[1]);
+        let ilens = arr1::<i32>(&[0]);
+        let alt_alleles = arr1::<u8>(b"T");
+        // alt_offsets: [start_of_allele_0, end_of_allele_0] = [0, 1]
+        let alt_offsets = arr1(&[0i64, 1]);
+
+        // Two queries, one haplotype each
+        let regions = ndarray::arr2(&[[0i32, 0, 4], [0, 4, 8]]);
+        let shifts = ndarray::arr2(&[[0i32], [0]]);
+
+        // Query 0, hap 0: has the SNP at variant index 0
+        // Query 1, hap 0: no variants
+        // geno_offset_idx[query, hap] → index into geno_o_starts/stops
+        let geno_offset_idx = ndarray::arr2(&[[0i64], [1]]);
+        // For query 0 hap 0: variant block spans geno_v_idxs[0..1] → [0]
+        // For query 1 hap 0: empty block (start == stop)
+        let geno_o_starts = arr1(&[0i64, 1]);
+        let geno_o_stops = arr1(&[1i64, 1]);
+        let geno_v_idxs = arr1::<i32>(&[0]); // variant index 0 = the SNP
+
+        let out_offsets = arr1(&[0i64, 4, 8]);
+        let pad_char = b'N';
+
+        let mut out = ndarray::Array1::<u8>::from_elem(8, pad_char);
+        super::reconstruct_haplotypes_from_sparse(
+            out.view_mut(),
+            out_offsets.view(),
+            regions.view(),
+            shifts.view(),
+            geno_offset_idx.view(),
+            geno_o_starts.view(),
+            geno_o_stops.view(),
+            geno_v_idxs.view(),
+            v_starts.view(),
+            ilens.view(),
+            alt_alleles.view(),
+            alt_offsets.view(),
+            ref_.view(),
+            ref_offsets.view(),
+            pad_char,
+            None,
+            None,
+            None,
+            None,
+            false,
+        );
+
+        assert_eq!(&out.as_slice().unwrap()[0..4], b"ATGT", "region 0 with SNP applied");
+        assert_eq!(&out.as_slice().unwrap()[4..8], b"ACGT", "region 1 reference-only");
+    }
+}
diff --git a/src/reference/mod.rs b/src/reference/mod.rs
new file mode 100644
index 00000000..bce3ac04
--- /dev/null
+++ b/src/reference/mod.rs
@@ -0,0 +1,266 @@
+//! Reference sequence assembly cores (pure ndarray). PyO3 lives in `crate::ffi`.
+use ndarray::{Array1, ArrayView1, ArrayView2, ArrayViewMut1};
+use rayon::prelude::*;
+
+/// Copy `arr[start:stop]` into `out`, padding with `pad_val` where the slice
+/// runs past `[0, arr.len())`. Mirrors numba `padded_slice`
+/// (`_dataset/_utils.py`). `out.len()` MUST equal `stop - start` for the
+/// in-bounds case (the caller guarantees this via out_offsets).
+pub fn padded_slice(
+    arr: ArrayView1<u8>,
+    start: i64,
+    stop: i64,
+    pad_val: u8,
+    mut out: ArrayViewMut1<u8>,
+) {
+    if start >= stop {
+        return;
+    }
+    if stop < 0 {
+        out.fill(pad_val);
+        return;
+    }
+    let len = arr.len() as i64;
+    let pad_left = (-start).max(0);
+    let pad_right = (stop - len).max(0);
+    if pad_left == 0 && pad_right == 0 {
+        // out[:] = arr[start:stop]
+        out.assign(&arr.slice(ndarray::s![start as usize..stop as usize]));
+        return;
+    }
+    let out_len = out.len() as i64;
+    if pad_left > 0 && pad_right > 0 {
+        let out_stop = out_len - pad_right;
+        out.slice_mut(ndarray::s![..pad_left as usize]).fill(pad_val);
+        out.slice_mut(ndarray::s![pad_left as usize..out_stop as usize])
+            .assign(&arr);
+        out.slice_mut(ndarray::s![out_stop as usize..]).fill(pad_val);
+    } else if pad_left > 0 {
+        // out[:pad_left] = pad; out[pad_left:] = arr[:stop]
+        out.slice_mut(ndarray::s![..pad_left as usize]).fill(pad_val);
+        out.slice_mut(ndarray::s![pad_left as usize..])
+            .assign(&arr.slice(ndarray::s![..stop as usize]));
+    } else {
+        // pad_right > 0: out[:out_stop] = arr[start:]; out[out_stop:] = pad
+        let out_stop = out_len - pad_right;
+        out.slice_mut(ndarray::s![..out_stop as usize])
+            .assign(&arr.slice(ndarray::s![start as usize..]));
+        out.slice_mut(ndarray::s![out_stop as usize..]).fill(pad_val);
+    }
+}
+
+/// Fetch padded reference rows for each region into one flat buffer.
+/// `regions[i] = (contig_idx, start, end)`. Mirrors numba
+/// `_get_reference_par/_ser` + `_get_reference_row`. Scheduling (rayon vs
+/// serial) does not affect output — out-slices are disjoint.
+pub fn get_reference(
+    regions: ArrayView2<i32>,
+    out_offsets: ArrayView1<i64>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+    parallel: bool,
+    to_rc: Option<ArrayView1<bool>>,
+) -> Array1<u8> {
+    let total = out_offsets[out_offsets.len() - 1] as usize;
+    let mut out = Array1::<u8>::zeros(total);
+    let n = regions.nrows();
+
+    // Build disjoint mutable row slices so we can fill each region independently.
+    let row = |i: usize, dst: &mut [u8]| {
+        let c_idx = regions[[i, 0]] as usize;
+        let start = regions[[i, 1]] as i64;
+        let end = regions[[i, 2]] as i64;
+        let c_s = ref_offsets[c_idx] as usize;
+        let c_e = ref_offsets[c_idx + 1] as usize;
+        let contig = reference.slice(ndarray::s![c_s..c_e]);
+        let mut dst_view = ndarray::ArrayViewMut1::from(dst);
+        padded_slice(contig, start, end, pad_char, dst_view.view_mut());
+    };
+
+    // Partition `out` into per-region chunks by out_offsets, then fill.
+    let bounds: Vec<(usize, usize)> = (0..n)
+        .map(|i| (out_offsets[i] as usize, out_offsets[i + 1] as usize))
+        .collect();
+    let out_slice = out.as_slice_mut().unwrap();
+    if parallel {
+        // split_at_mut chain over sorted disjoint bounds
+        let mut chunks: Vec<&mut [u8]> = Vec::with_capacity(n);
+        let mut rest = out_slice;
+        let mut cursor = 0usize;
+        for &(s, e) in &bounds {
+            let (_, tail) = rest.split_at_mut(s - cursor);
+            let (mid, tail2) = tail.split_at_mut(e - s);
+            chunks.push(mid);
+            rest = tail2;
+            cursor = e;
+        }
+        chunks
+            .into_par_iter()
+            .enumerate()
+            .for_each(|(i, dst)| row(i, dst));
+    } else {
+        for (i, &(s, e)) in bounds.iter().enumerate() {
+            row(i, &mut out_slice[s..e]);
+        }
+    }
+    if let Some(to_rc) = to_rc {
+        debug_assert_eq!(
+            to_rc.len(),
+            out_offsets.len() - 1,
+            "to_rc mask length must equal number of output rows (offsets.len() - 1)"
+        );
+        crate::reverse::rc_flat_rows_inplace(
+            out.as_slice_mut().unwrap(),
+            out_offsets,
+            to_rc,
+        );
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::{arr1, arr2, Array1};
+
+    fn run(arr: &[u8], start: i64, stop: i64, pad: u8) -> Vec<u8> {
+        let a = arr1(arr);
+        let mut out = Array1::<u8>::zeros((stop - start).max(0) as usize);
+        padded_slice(a.view(), start, stop, pad, out.view_mut());
+        out.to_vec()
+    }
+
+    #[test]
+    fn in_bounds() {
+        assert_eq!(run(&[1, 2, 3, 4, 5], 1, 4, 0), vec![2, 3, 4]);
+    }
+    #[test]
+    fn pad_left_only() {
+        assert_eq!(run(&[1, 2, 3], -2, 2, 9), vec![9, 9, 1, 2]);
+    }
+    #[test]
+    fn pad_right_only() {
+        assert_eq!(run(&[1, 2, 3], 1, 5, 9), vec![2, 3, 9, 9]);
+    }
+    #[test]
+    fn pad_both() {
+        assert_eq!(run(&[1, 2], -1, 3, 9), vec![9, 1, 2, 9]);
+    }
+    #[test]
+    fn empty_when_start_ge_stop() {
+        assert_eq!(run(&[1, 2, 3], 2, 2, 9), Vec::<u8>::new());
+    }
+    #[test]
+    fn all_pad_when_stop_negative() {
+        let a = arr1(&[1u8, 2, 3]);
+        let mut out = Array1::<u8>::zeros(3);
+        padded_slice(a.view(), -5, -1, 7, out.view_mut());
+        // stop < 0 → numba returns early after filling pad_val on the whole out
+        assert_eq!(out.to_vec(), vec![7, 7, 7]);
+    }
+
+    // Helper: run get_reference with a flat reference + single contig
+    fn run_get_reference(
+        reference: &[u8],
+        regions: &[[i32; 3]],
+        pad: u8,
+        parallel: bool,
+    ) -> Vec<u8> {
+        let n_contigs = 1usize;
+        let ref_arr = Array1::from_vec(reference.to_vec());
+        let ref_offsets = Array1::from_vec(vec![0i64, reference.len() as i64]);
+        let lengths: Vec<usize> = regions.iter().map(|r| (r[2] - r[1]).max(0) as usize).collect();
+        let out_offsets: Vec<i64> = std::iter::once(0i64)
+            .chain(lengths.iter().scan(0i64, |acc, &l| {
+                *acc += l as i64;
+                Some(*acc)
+            }))
+            .collect();
+        let out_offsets_arr = Array1::from_vec(out_offsets);
+        let n = regions.len();
+        let flat: Vec<i32> = regions.iter().flat_map(|r| r.iter().copied()).collect();
+        let regions_arr = ndarray::Array2::from_shape_vec((n, 3), flat).unwrap();
+        get_reference(
+            regions_arr.view(),
+            out_offsets_arr.view(),
+            ref_arr.view(),
+            ref_offsets.view(),
+            pad,
+            parallel,
+            None,
+        )
+        .to_vec()
+    }
+
+    #[test]
+    fn get_reference_fully_in_bounds() {
+        // region [1,4) on contig [10,20,30,40,50] → [20,30,40]
+        let result = run_get_reference(&[10, 20, 30, 40, 50], &[[0, 1, 4]], 0, false);
+        assert_eq!(result, vec![20, 30, 40]);
+    }
+
+    #[test]
+    fn get_reference_straddling_left_edge() {
+        // region [-2,2) on contig [1,2,3] → pad pad 1 2
+        let result = run_get_reference(&[1, 2, 3], &[[0, -2, 2]], 9, false);
+        assert_eq!(result, vec![9, 9, 1, 2]);
+    }
+
+    #[test]
+    fn get_reference_straddling_right_edge() {
+        // region [1,5) on contig [1,2,3] → 2 3 pad pad
+        let result = run_get_reference(&[1, 2, 3], &[[0, 1, 5]], 9, false);
+        assert_eq!(result, vec![2, 3, 9, 9]);
+    }
+
+    #[test]
+    fn get_reference_two_contigs() {
+        // reference = [10,20] | [30,40,50]; ref_offsets = [0,2,5]
+        // region 0: contig 0, [0,2) → [10,20]
+        // region 1: contig 1, [1,3) → [40,50]
+        let reference = Array1::from_vec(vec![10u8, 20, 30, 40, 50]);
+        let ref_offsets = Array1::from_vec(vec![0i64, 2, 5]);
+        let regions = arr2(&[[0i32, 0, 2], [1, 1, 3]]);
+        let out_offsets = Array1::from_vec(vec![0i64, 2, 4]);
+        let result = get_reference(
+            regions.view(),
+            out_offsets.view(),
+            reference.view(),
+            ref_offsets.view(),
+            0,
+            false,
+            None,
+        );
+        assert_eq!(result.to_vec(), vec![10, 20, 40, 50]);
+    }
+
+    #[test]
+    fn get_reference_parallel_matches_serial() {
+        let reference: Vec<u8> = (0..30).collect();
+        let regions_data = vec![[0i32, -1, 4], [0, 5, 10], [0, 25, 32]];
+        let serial = run_get_reference(&reference, &regions_data, 255, false);
+        let parallel = run_get_reference(&reference, &regions_data, 255, true);
+        assert_eq!(serial, parallel);
+    }
+
+    #[test]
+    fn get_reference_applies_rc_when_masked() {
+        // contig "ACGTAA"; region [0,3) -> forward "ACG" -> revcomp "CGT" (non-palindrome)
+        let reference = ndarray::array![b'A', b'C', b'G', b'T', b'A', b'A'];
+        let ref_offsets = ndarray::array![0i64, 6];
+        let regions = ndarray::array![[0i32, 0, 3]];
+        let out_offsets = ndarray::array![0i64, 3];
+        let to_rc = ndarray::array![true];
+        let out = get_reference(
+            regions.view(),
+            out_offsets.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+            false,
+            Some(to_rc.view()),
+        );
+        assert_eq!(out.to_vec(), b"CGT".to_vec());
+    }
+}
diff --git a/src/reverse.rs b/src/reverse.rs
new file mode 100644
index 00000000..8dea03a2
--- /dev/null
+++ b/src/reverse.rs
@@ -0,0 +1,148 @@
+//! In-place reverse / reverse-complement of masked rows in a flat (data, offsets)
+//! buffer. Used by the read-path kernels to emit negative-strand output already
+//! reverse-complemented, replacing the Python RC post-pass on the rust backend.
+
+use ndarray::ArrayView1;
+
+/// ACGT<->TGCA complement, identity for every other byte. Mirrors
+/// `bytes.maketrans(b"ACGT", b"TGCA")` (python/genvarloader/_ragged.py).
+pub const COMP: [u8; 256] = {
+    let mut t = [0u8; 256];
+    let mut i = 0usize;
+    while i < 256 {
+        t[i] = i as u8;
+        i += 1;
+    }
+    t[b'A' as usize] = b'T';
+    t[b'T' as usize] = b'A';
+    t[b'C' as usize] = b'G';
+    t[b'G' as usize] = b'C';
+    t
+};
+
+/// Reverse element order within each masked row (no complement). Generic over
+/// element width so it serves f32 tracks and i32/i64 annotation arrays.
+pub fn reverse_flat_rows_inplace<T: Copy>(
+    data: &mut [T],
+    offsets: ArrayView1<i64>,
+    to_rc: ArrayView1<bool>,
+) {
+    for i in 0..to_rc.len() {
+        if !to_rc[i] {
+            continue;
+        }
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        data[s..e].reverse();
+    }
+}
+
+/// Reverse a single row of bytes then DNA-complement it in place via the
+/// branchless ACGT↔TGCA arithmetic (identity for every other byte; A/T = XOR
+/// 0x15, C/G = XOR 0x04). `#[inline]` so callers (rc_flat_rows_inplace,
+/// rc_alleles_inplace) inline it back to the prior codegen.
+#[inline]
+pub(crate) fn rc_row(row: &mut [u8]) {
+    row.reverse();
+    for b in row.iter_mut() {
+        let v = *b;
+        let at = (((v == b'A') | (v == b'T')) as u8).wrapping_neg(); // 0xFF if A/T
+        let cg = (((v == b'C') | (v == b'G')) as u8).wrapping_neg(); // 0xFF if C/G
+        *b = v ^ (at & 21) ^ (cg & 4);
+    }
+}
+
+/// Reverse AND complement bytes within each masked row via `rc_row`.
+pub fn rc_flat_rows_inplace(
+    data: &mut [u8],
+    offsets: ArrayView1<i64>,
+    to_rc: ArrayView1<bool>,
+) {
+    for i in 0..to_rc.len() {
+        if !to_rc[i] {
+            continue;
+        }
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        rc_row(&mut data[s..e]);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::array;
+
+    #[test]
+    fn comp_lut_matches_maketrans() {
+        // identity except ACGT<->TGCA uppercase
+        assert_eq!(COMP[b'A' as usize], b'T');
+        assert_eq!(COMP[b'T' as usize], b'A');
+        assert_eq!(COMP[b'C' as usize], b'G');
+        assert_eq!(COMP[b'G' as usize], b'C');
+        assert_eq!(COMP[b'N' as usize], b'N');
+        assert_eq!(COMP[b'a' as usize], b'a'); // lowercase pass-through
+        assert_eq!(COMP[b'c' as usize], b'c');
+        assert_eq!(COMP[b'R' as usize], b'R'); // IUPAC pass-through
+        assert_eq!(COMP[0u8 as usize], 0u8);
+    }
+
+    #[test]
+    fn rc_reverses_and_complements_masked_rows_only() {
+        // two rows: "ACGT" (rc -> "ACGT") and "AACG" (not rc)
+        let mut data = b"ACGTAACG".to_vec();
+        let offsets = array![0i64, 4, 8];
+        let to_rc = array![true, false];
+        rc_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(&data[0..4], b"ACGT"); // revcomp of ACGT is ACGT
+        assert_eq!(&data[4..8], b"AACG"); // untouched
+    }
+
+    #[test]
+    fn rc_handles_odd_length_and_n() {
+        let mut data = b"ACN".to_vec(); // revcomp -> "NGT"
+        let offsets = array![0i64, 3];
+        let to_rc = array![true];
+        rc_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(&data, b"NGT");
+    }
+
+    #[test]
+    fn reverse_only_no_complement_f32() {
+        let mut data = vec![1.0f32, 2.0, 3.0, 9.0];
+        let offsets = array![0i64, 3, 4];
+        let to_rc = array![true, false];
+        reverse_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(data, vec![3.0, 2.0, 1.0, 9.0]);
+    }
+
+    #[test]
+    fn reverse_only_i32_for_annot_arrays() {
+        let mut data = vec![10i32, 11, 12];
+        let offsets = array![0i64, 3];
+        let to_rc = array![true];
+        reverse_flat_rows_inplace(&mut data, offsets.view(), to_rc.view());
+        assert_eq!(data, vec![12, 11, 10]);
+    }
+
+    #[test]
+    fn empty_row_and_all_false_are_noops() {
+        let mut data = b"AC".to_vec();
+        let offsets = array![0i64, 0, 2]; // first row empty
+        rc_flat_rows_inplace(&mut data, offsets.view(), array![true, false].view());
+        assert_eq!(&data, b"AC");
+    }
+
+    /// Exhaustive regression: arithmetic complement must match COMP table for every
+    /// possible byte value 0..=255.  A 1-element row reverses to itself, so this
+    /// isolates the complement pass from the reverse pass.
+    #[test]
+    fn arith_complement_matches_comp_for_all_256_bytes() {
+        for b in 0u8..=255 {
+            let mut row = [b];
+            let off = array![0i64, 1];
+            rc_flat_rows_inplace(&mut row, off.view(), array![true].view());
+            assert_eq!(row[0], COMP[b as usize], "byte {b}");
+        }
+    }
+}
diff --git a/src/tables.rs b/src/tables.rs
index 46bffbb5..bf305deb 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -158,7 +158,9 @@ impl RustTable {
         max_mem: usize,
     ) -> Result<()> {
         std::fs::create_dir_all(out_dir)?;
-        let mut itv_w = BufWriter::new(File::create(out_dir.join("intervals.npy"))?);
+        let mut starts_w = BufWriter::new(File::create(out_dir.join("starts.npy"))?);
+        let mut ends_w = BufWriter::new(File::create(out_dir.join("ends.npy"))?);
+        let mut values_w = BufWriter::new(File::create(out_dir.join("values.npy"))?);
         let mut off_w = BufWriter::new(File::create(out_dir.join("offsets.npy"))?);
 
         let n_regions = chrom_codes.len();
@@ -209,9 +211,9 @@ impl RustTable {
             }
             // write region rows (already in cell-major, start-sorted order)
             for (s, e, v) in &region_rows {
-                itv_w.write_all(&s.to_le_bytes())?;
-                itv_w.write_all(&e.to_le_bytes())?;
-                itv_w.write_all(&v.to_le_bytes())?;
+                starts_w.write_all(&s.to_le_bytes())?;
+                ends_w.write_all(&e.to_le_bytes())?;
+                values_w.write_all(&v.to_le_bytes())?;
             }
             // write per-cell offsets
             for n in per_cell_counts {
@@ -219,7 +221,9 @@ impl RustTable {
                 off_w.write_all(&acc.to_le_bytes())?;
             }
         }
-        itv_w.flush()?;
+        starts_w.flush()?;
+        ends_w.flush()?;
+        values_w.flush()?;
         off_w.flush()?;
         Ok(())
     }
@@ -433,7 +437,9 @@ mod tests {
             .unwrap();
 
         // Oracle: per-contig count -> offsets -> intervals, concatenated in region order.
-        let mut exp_itv: Vec<u8> = Vec::new();
+        let mut exp_starts: Vec<u8> = Vec::new();
+        let mut exp_ends: Vec<u8> = Vec::new();
+        let mut exp_values: Vec<u8> = Vec::new();
         let mut exp_off: Vec<u8> = Vec::new();
         let mut acc = 0i64;
         exp_off.extend_from_slice(&acc.to_le_bytes());
@@ -451,9 +457,9 @@ mod tests {
             let offsets = offsets_from_count(&counts);
             let (coords, vals) = t.intervals_from_offsets(c, cs, ce, &sel, &offsets);
             for i in 0..vals.len() {
-                exp_itv.extend_from_slice(&coords[[i, 0]].to_le_bytes());
-                exp_itv.extend_from_slice(&coords[[i, 1]].to_le_bytes());
-                exp_itv.extend_from_slice(&vals[i].to_le_bytes());
+                exp_starts.extend_from_slice(&coords[[i, 0]].to_le_bytes());
+                exp_ends.extend_from_slice(&coords[[i, 1]].to_le_bytes());
+                exp_values.extend_from_slice(&vals[i].to_le_bytes());
             }
             for k in 0..counts.len() {
                 acc += counts.as_slice().unwrap()[k] as i64;
@@ -461,9 +467,10 @@ mod tests {
             }
             ri = rj;
         }
-        let got_itv = std::fs::read(tmp.join("intervals.npy")).unwrap();
+        assert_eq!(std::fs::read(tmp.join("starts.npy")).unwrap(), exp_starts, "starts mismatch");
+        assert_eq!(std::fs::read(tmp.join("ends.npy")).unwrap(), exp_ends, "ends mismatch");
+        assert_eq!(std::fs::read(tmp.join("values.npy")).unwrap(), exp_values, "values mismatch");
         let got_off = std::fs::read(tmp.join("offsets.npy")).unwrap();
-        assert_eq!(got_itv, exp_itv, "intervals bytes mismatch");
         assert_eq!(got_off, exp_off, "offsets bytes mismatch");
     }
 
diff --git a/src/tracks/mod.rs b/src/tracks/mod.rs
new file mode 100644
index 00000000..a0bfcb0c
--- /dev/null
+++ b/src/tracks/mod.rs
@@ -0,0 +1,2160 @@
+//! Track-realignment PRNG primitives and insertion-fill strategies.
+//!
+//! PRNG functions mirror the numba implementations in
+//! `python/genvarloader/_dataset/_tracks.py` (`_xorshift64`, `_hash4`) exactly.
+//! All arithmetic is on `u64` with wrapping shifts/xors to match numba's
+//! `np.uint64` overflow semantics.
+//!
+//! `apply_insertion_fill` mirrors `_apply_insertion_fill` in the same file
+//! (lines 56-138), statement-by-statement, including float promotion points.
+
+use ndarray::{Array1, ArrayView1, ArrayView2, ArrayViewMut1};
+use rayon::prelude::*;
+
+// Strategy IDs — mirror _insertion_fill.py exactly.
+pub const REPEAT_5P: i64 = 0;
+pub const REPEAT_5P_NORM: i64 = 1;
+pub const CONSTANT: i64 = 2;
+pub const FLANK_SAMPLE: i64 = 3;
+pub const INTERPOLATE: i64 = 4;
+
+/// Single round of xorshift64.
+///
+/// Mirrors numba `_xorshift64` on `np.uint64`:
+/// ```text
+/// x ^= x << 13
+/// x ^= x >> 7
+/// x ^= x << 17
+/// ```
+/// Left shifts use `wrapping_shl` to replicate `np.uint64` truncation-to-64-bits.
+#[inline(always)]
+pub fn xorshift64(mut x: u64) -> u64 {
+    x ^= x.wrapping_shl(13);
+    x ^= x >> 7;
+    x ^= x.wrapping_shl(17);
+    x
+}
+
+/// Hash four `u64` values into one.
+///
+/// Mirrors numba `_hash4`:
+/// ```text
+/// h = a
+/// h = xorshift64(h ^ b)
+/// h = xorshift64(h ^ c)
+/// h = xorshift64(h ^ d)
+/// ```
+#[inline(always)]
+pub fn hash4(a: u64, b: u64, c: u64, d: u64) -> u64 {
+    let mut h = a;
+    h = xorshift64(h ^ b);
+    h = xorshift64(h ^ c);
+    h = xorshift64(h ^ d);
+    h
+}
+
+/// Fill `writable_length` values starting at `out[out_idx]` using the given
+/// insertion-fill strategy.
+///
+/// Mirrors numba `_apply_insertion_fill` (lines 56-138 of `_tracks.py`)
+/// statement-by-statement, including float promotion points:
+///
+/// - `REPEAT_5P_NORM`: numba computes `track[v_rel_pos] / v_len` in **f64**
+///   (`v_len` is int64; np.float32 / np.int64 → float64), then rounds to f32
+///   on store. We compute f32 / f32 directly: this is bit-identical to numba
+///   **only** because IEEE-754 division is double-rounding-safe (f64 mantissa
+///   53 bits ≥ 2·24+2 = 50, verified empirically over 42M cases). Do NOT
+///   generalize this f32-direct shortcut to multiply-add or multi-step
+///   accumulations — those are NOT double-rounding-safe; mirror numba's f64
+///   intermediate there.
+/// - `CONSTANT`: `params[0]` is f64; stored into f32 `out` (cast on store).
+/// - `INTERPOLATE`: all anchor/Lagrange arithmetic in f64 (`xs`, `ys` are f64);
+///   `ys[j] = track[ref_idx]` promotes f32 → f64 on assignment; final `acc`
+///   stored into f32 `out` (cast on store).
+///
+/// # Parameters
+/// - `out`: output track buffer (f32)
+/// - `out_idx`: starting write index within `out`
+/// - `writable_length`: number of positions to write
+/// - `v_len`: total insertion length (v_diff + 1)
+/// - `track`: reference track values (f32)
+/// - `v_rel_pos`: variant position relative to the query region
+/// - `strategy_id`: one of `REPEAT_5P`, `REPEAT_5P_NORM`, `CONSTANT`,
+///   `FLANK_SAMPLE`, `INTERPOLATE`
+/// - `params`: per-strategy parameter slot (f64); `params[0]` = flank_width,
+///   constant value, or interpolation order depending on strategy
+/// - `base_seed`, `query`, `hap`: seed components for `FLANK_SAMPLE`
+pub fn apply_insertion_fill(
+    out: &mut ArrayViewMut1<f32>,
+    out_idx: usize,
+    writable_length: usize,
+    v_len: i64,
+    track: ArrayView1<f32>,
+    v_rel_pos: i64,
+    strategy_id: i64,
+    params: ArrayView1<f64>,
+    base_seed: u64,
+    query: u64,
+    hap: u64,
+) {
+    let track_len = track.len() as i64;
+
+    if strategy_id == REPEAT_5P {
+        // Numba comment: "unreachable from outer kernel (which short-circuits this
+        // strategy before calling). Kept for completeness and direct-helper-call safety."
+        let val = track[v_rel_pos as usize];
+        for i in 0..writable_length {
+            out[out_idx + i] = val;
+        }
+    } else if strategy_id == REPEAT_5P_NORM {
+        // Numba: val = track[v_rel_pos] / v_len  (computed in f64; v_len is int64,
+        // so np.float32/np.int64 → float64), then stored into f32 out.
+        // We divide f32/f32 directly: bit-identical to numba because IEEE-754
+        // division is double-rounding-safe. Do NOT extend this shortcut to
+        // multiply-add or multi-op paths — use f64 intermediates there.
+        let val = track[v_rel_pos as usize] / (v_len as f32);
+        for i in 0..writable_length {
+            out[out_idx + i] = val;
+        }
+    } else if strategy_id == CONSTANT {
+        // Numba: val = params[0] (f64), stored into f32 out on assignment.
+        let val = params[0] as f32;
+        for i in 0..writable_length {
+            out[out_idx + i] = val;
+        }
+    } else if strategy_id == FLANK_SAMPLE {
+        // Numba: width = np.int64(params[0])
+        let width = params[0] as i64;
+        let pool_lo = (v_rel_pos - width).max(0);
+        let pool_hi = (v_rel_pos + width).min(track_len - 1);
+        let pool_size = (pool_hi - pool_lo + 1) as u64;
+        for i in 0..writable_length {
+            // Numba: seed = _hash4(base_seed, np.uint64(query), np.uint64(hap), np.uint64(out_idx + i))
+            let seed = hash4(base_seed, query, hap, (out_idx + i) as u64);
+            // Numba: offset = np.int64(seed % np.uint64(pool_size))
+            let offset = (seed % pool_size) as i64;
+            out[out_idx + i] = track[(pool_lo + offset) as usize];
+        }
+    } else if strategy_id == INTERPOLATE {
+        // Numba: order = np.int64(params[0])
+        let order = params[0] as i64;
+        // k = ceil((order+1)/2)
+        // Numba: k = (order + 1 + 1) // 2
+        let k = (order + 1 + 1) / 2;
+        let n_anchors = (2 * k) as usize;
+
+        // Anchors: xs and ys are f64 (numba: np.empty(..., dtype=np.float64))
+        let mut xs = vec![0.0f64; n_anchors];
+        let mut ys = vec![0.0f64; n_anchors];
+
+        // 5' side: xs[j] = -j, ys[j] = track[max(v_rel_pos - j, 0)]
+        // Numba: xs[j] = -float(j), ys[j] = track[ref_idx]
+        // track[ref_idx] is f32; ys is f64 → f32 promoted to f64 on assignment.
+        for j in 0..k as usize {
+            let ref_idx = (v_rel_pos - j as i64).max(0) as usize;
+            xs[j] = -(j as f64);
+            ys[j] = track[ref_idx] as f64;
+        }
+        // 3' side: xs[k+j] = v_len + j, ys[k+j] = track[min(v_rel_pos+1+j, track_len-1)]
+        // Numba: xs[k + j] = float(v_len) + float(j), ys[k + j] = track[ref_idx]
+        for j in 0..k as usize {
+            let ref_idx = (v_rel_pos + 1 + j as i64).min(track_len - 1) as usize;
+            xs[k as usize + j] = (v_len as f64) + (j as f64);
+            ys[k as usize + j] = track[ref_idx] as f64;
+        }
+
+        // Lagrange interpolation: mirror numba loop nesting exactly.
+        // outer: a over n_anchors; inner: b over n_anchors, skip b==a
+        for i in 0..writable_length {
+            // Numba: x = float(i) — this is the insertion-local coordinate
+            let x = i as f64;
+            // Numba: acc = 0.0 (float64 literal)
+            let mut acc = 0.0f64;
+            for a in 0..n_anchors {
+                // Numba: term = ys[a]
+                let mut term = ys[a];
+                for b in 0..n_anchors {
+                    if b == a {
+                        continue;
+                    }
+                    // Numba: term *= (x - xs[b]) / (xs[a] - xs[b])
+                    term *= (x - xs[b]) / (xs[a] - xs[b]);
+                }
+                // Numba: acc += term
+                acc += term;
+            }
+            // Numba: out[out_idx + i] = acc — f64 acc stored into f32 out
+            out[out_idx + i] = acc as f32;
+        }
+    }
+}
+
+/// Shift and realign a single track to correspond to one haplotype.
+///
+/// Mirrors numba `shift_and_realign_track_sparse` (lines 230-401 of `_tracks.py`)
+/// statement-by-statement.
+///
+/// Three key differences from the haplotype reconstruction kernel:
+/// 1. SNPs (`v_diff == 0`) are SKIPPED — tracks match reference at SNP positions.
+/// 2. Insertions route to `apply_insertion_fill` UNLESS `strategy_id == REPEAT_5P`
+///    (which repeats `track[v_rel_pos]` directly).
+/// 3. Trailing fill pads with `0.0` (NOT a pad_char byte).
+///
+/// # Parameters
+/// - `offset_idx`: index into geno_o_starts/geno_o_stops for this (query, hap) pair
+/// - `geno_v_idxs`: flat variant index array
+/// - `geno_o_starts`, `geno_o_stops`: normalized (2, n) offsets split into two rows
+/// - `v_starts`: variant start positions (absolute genomic coordinates)
+/// - `ilens`: variant insertion-length differences (signed)
+/// - `shift`: total shift for this haplotype
+/// - `track`: reference track values for this query (f32 slice)
+/// - `query_start`: the genomic start of this query region
+/// - `out`: output slice to fill (length = haplotype output length)
+/// - `params`: per-strategy parameter (f64)
+/// - `keep`: optional boolean mask over the variant group for this (query, hap)
+/// - `strategy_id`: insertion-fill strategy
+/// - `base_seed`, `query`, `hap`: seed components for FlankSample strategy
+#[allow(clippy::too_many_arguments)]
+pub fn shift_and_realign_track_sparse(
+    offset_idx: usize,
+    geno_v_idxs: ndarray::ArrayView1<i32>,
+    geno_o_starts: ndarray::ArrayView1<i64>,
+    geno_o_stops: ndarray::ArrayView1<i64>,
+    v_starts: ndarray::ArrayView1<i32>,
+    ilens: ndarray::ArrayView1<i32>,
+    shift: i64,
+    track: ndarray::ArrayView1<f32>,
+    query_start: i64,
+    out: &mut ndarray::ArrayViewMut1<f32>,
+    params: ndarray::ArrayView1<f64>,
+    keep: Option<ndarray::ArrayView1<bool>>,
+    strategy_id: i64,
+    base_seed: u64,
+    query: u64,
+    hap: u64,
+) {
+    // Numba: o_s, o_e = geno_offsets[offset_idx], geno_offsets[offset_idx + 1]  (1-D branch)
+    //        or geno_offsets[:, offset_idx]  (2-D branch — normalized form)
+    // We receive the pre-split (2, n) rows directly.
+    let o_s = geno_o_starts[offset_idx] as usize;
+    let o_e = geno_o_stops[offset_idx] as usize;
+    let variant_idxs = &geno_v_idxs.as_slice().unwrap()[o_s..o_e];
+    let length = out.len();
+    let n_variants = variant_idxs.len();
+
+    if n_variants == 0 {
+        // Numba: out[:] = track[:length]
+        for i in 0..length {
+            out[i] = track[i];
+        }
+        return;
+    }
+
+    // Numba: track_idx = 0; out_idx = 0; shifted = 0
+    let mut track_idx: i64 = 0;
+    let mut out_idx: i64 = 0;
+    let mut shifted: i64 = 0;
+
+    for v in 0..n_variants {
+        // Numba: if keep is not None and not keep[v]: continue
+        if let Some(ref k) = keep {
+            if !k[v] {
+                continue;
+            }
+        }
+
+        let variant = variant_idxs[v] as usize;
+
+        // Numba: v_rel_pos = v_starts[variant] - query_start
+        let v_rel_pos = v_starts[variant] as i64 - query_start;
+        // Numba: v_diff = ilens[variant]
+        let v_diff = ilens[variant] as i64;
+        // Numba: v_rel_end = v_rel_pos - min(0, v_diff) + 1
+        let v_rel_end = v_rel_pos - v_diff.min(0) + 1;
+
+        // Numba: if v_diff < 0 and v_rel_pos < 0 and v_rel_end >= 0:
+        //            track_idx = v_rel_end; continue
+        if v_diff < 0 && v_rel_pos < 0 && v_rel_end >= 0 {
+            track_idx = v_rel_end;
+            continue;
+        }
+
+        // Numba: if v_rel_pos < track_idx: continue  (overlapping variant)
+        if v_rel_pos < track_idx {
+            continue;
+        }
+
+        // Numba: v_len = max(0, v_diff) + 1
+        let mut v_len = v_diff.max(0) + 1;
+
+        // Numba: if shifted < shift:
+        if shifted < shift {
+            let ref_shift_dist = v_rel_pos - track_idx;
+            // Numba: if shifted + ref_shift_dist + v_len < shift: continue
+            if shifted + ref_shift_dist + v_len < shift {
+                continue;
+            } else if shifted + ref_shift_dist >= shift {
+                // Numba: track_idx += shift - shifted; shifted = shift
+                track_idx += shift - shifted;
+                shifted = shift;
+            } else {
+                // ref + (some of) variant is enough to finish shift
+                // Numba: allele_start_idx = shift - shifted - ref_shift_dist; shifted = shift
+                let allele_start_idx = shift - shifted - ref_shift_dist;
+                shifted = shift;
+                // Numba: if allele_start_idx == v_len: track_idx = v_rel_end; continue
+                if allele_start_idx == v_len {
+                    track_idx = v_rel_end;
+                    continue;
+                }
+                // Numba: track_idx = v_rel_pos; v_len -= allele_start_idx
+                track_idx = v_rel_pos;
+                v_len -= allele_start_idx;
+            }
+        }
+
+        // Key difference 1: SNPs skipped for tracks (they match ref)
+        // Numba: if v_diff == 0: continue
+        if v_diff == 0 {
+            continue;
+        }
+
+        // Numba: track_len = v_rel_pos - track_idx
+        let track_len = v_rel_pos - track_idx;
+        // Numba: if out_idx + track_len >= length: break
+        if out_idx + track_len >= length as i64 {
+            break;
+        }
+        // Numba: out[out_idx:out_idx+track_len] = track[track_idx:track_idx+track_len]
+        for i in 0..track_len as usize {
+            out[out_idx as usize + i] = track[track_idx as usize + i];
+        }
+        out_idx += track_len;
+
+        // Numba: writable_length = min(v_len, length - out_idx)
+        let writable_length = (v_len.min(length as i64 - out_idx)) as usize;
+
+        // Key difference 2: insertions route to apply_insertion_fill unless REPEAT_5P
+        // Numba: if v_diff > 0 and strategy_id != _REPEAT_5P:
+        if v_diff > 0 && strategy_id != REPEAT_5P {
+            apply_insertion_fill(
+                out,
+                out_idx as usize,
+                writable_length,
+                v_len,
+                track,
+                v_rel_pos,
+                strategy_id,
+                params,
+                base_seed,
+                query,
+                hap,
+            );
+        } else {
+            // Numba: for i in range(writable_length): out[out_idx + i] = track[v_rel_pos]
+            // Deletions AND Repeat5p insertions: repeat track[v_rel_pos]
+            let val = track[v_rel_pos as usize];
+            for i in 0..writable_length {
+                out[out_idx as usize + i] = val;
+            }
+        }
+        out_idx += writable_length as i64;
+        track_idx = v_rel_end;
+
+        // Numba: if out_idx >= length: break
+        if out_idx >= length as i64 {
+            break;
+        }
+    }
+
+    // Numba: if shifted < shift: track_idx += shift - shifted; ...
+    if shifted < shift {
+        track_idx += shift - shifted;
+        track_idx = track_idx.min(track.len() as i64);
+        // shifted = shift;  (not used after this point)
+    }
+
+    // Key difference 3: trailing fill pads with 0.0 (NOT pad_char)
+    // Numba: unfilled_length = length - out_idx
+    let unfilled_length = length as i64 - out_idx;
+    if unfilled_length > 0 {
+        // When a deletion's v_rel_end runs past the track end, track_idx advances
+        // past track.len() and writable_ref becomes negative. The fixed numba kernel
+        // uses max(0, min(unfilled, len(track)-track_idx)), so writable_ref >= 0 and
+        // out_end_idx = out_idx. Mirror that: clamp out_end_idx to out_idx so the
+        // zero-pad fills exactly out[out_idx..length] without overwriting
+        // already-written positions (mirrors reconstruct/mod.rs:234-239).
+        let writable_ref = unfilled_length.min(track.len() as i64 - track_idx);
+        // Positive: copy track bytes. Zero or negative: track exhausted, no copy.
+        let out_end_idx = if writable_ref > 0 {
+            let oe = out_idx + writable_ref;
+            let re = track_idx + writable_ref;
+            // Numba: out[out_idx:out_end_idx] = track[track_idx:ref_end_idx]
+            for i in 0..writable_ref as usize {
+                out[out_idx as usize + i] = track[track_idx as usize + i];
+            }
+            let _ = re; // ref_end_idx used only to bound the copy above
+            oe
+        } else {
+            // writable_ref <= 0: track exhausted (track_idx at/after track end).
+            // No track bytes remain to copy; zero-pad the entire unfilled tail
+            // out[out_idx..length]. Clamp to out_idx (NOT (out_idx+writable_ref).max(0))
+            // to avoid overwriting already-written positions.
+            out_idx
+        };
+        // Numba: if out_end_idx < length: out[out_end_idx:] = 0
+        if out_end_idx < length as i64 {
+            for i in out_end_idx as usize..length {
+                out[i] = 0.0_f32;
+            }
+        }
+    }
+}
+
+/// Shift and realign tracks for a batch of (query, hap) pairs in place (writes `out`).
+///
+/// Mirrors numba `shift_and_realign_tracks_sparse` (lines 141-228 of `_tracks.py`)
+/// statement-by-statement. Serial-only (rayon deferred to Phase 5, matching Task 5
+/// precedent for initial parity verification).
+///
+/// # Parameters
+/// - `out`: flat output buffer (f32), written in place
+/// - `out_offsets`: ragged offsets into out, shape (n_q * ploidy + 1,)
+/// - `regions`: (n_q, 3) array of (contig_idx, start, end) per query
+/// - `shifts`: (n_q, ploidy) shift per (query, hap)
+/// - `geno_offset_idx`: (n_q, ploidy) indices into geno_o_starts/stops
+/// - `geno_v_idxs`: flat variant index array
+/// - `geno_o_starts`, `geno_o_stops`: normalized (2, n) offsets split into rows
+/// - `v_starts`: variant start positions
+/// - `ilens`: variant ilen differences
+/// - `tracks`: flat reference track buffer (f32), ragged by track_offsets
+/// - `track_offsets`: (n_q + 1,) offsets into tracks (one track per query)
+/// - `params`: per-strategy parameter (f64), shape (1,)
+/// - `keep`, `keep_offsets`: optional keep mask + 1-D offsets
+/// - `strategy_id`, `base_seed`: insertion-fill strategy parameters
+#[allow(clippy::too_many_arguments)]
+pub fn shift_and_realign_tracks_sparse(
+    mut out: ndarray::ArrayViewMut1<f32>,
+    out_offsets: ndarray::ArrayView1<i64>,
+    regions: ndarray::ArrayView2<i32>,
+    shifts: ndarray::ArrayView2<i32>,
+    geno_offset_idx: ndarray::ArrayView2<i64>,
+    geno_v_idxs: ndarray::ArrayView1<i32>,
+    geno_o_starts: ndarray::ArrayView1<i64>,
+    geno_o_stops: ndarray::ArrayView1<i64>,
+    v_starts: ndarray::ArrayView1<i32>,
+    ilens: ndarray::ArrayView1<i32>,
+    tracks: ndarray::ArrayView1<f32>,
+    track_offsets: ndarray::ArrayView1<i64>,
+    params: ndarray::ArrayView1<f64>,
+    keep: Option<ndarray::ArrayView1<bool>>,
+    keep_offsets: Option<ndarray::ArrayView1<i64>>,
+    strategy_id: i64,
+    base_seed: u64,
+    parallel: bool,
+) {
+    // Numba: n_regions, ploidy = geno_offset_idx.shape
+    let n_regions = geno_offset_idx.nrows();
+    let ploidy = geno_offset_idx.ncols();
+    let n_work = n_regions * ploidy;
+
+    // Hoist contiguous raw slices once to eliminate ndarray::do_slice call overhead
+    // in the inner (query, hap) loop.  The prior interval-kernel fix (src/intervals.rs)
+    // applied the same pattern: out.as_slice_mut().unwrap() once, then index [a..b]
+    // directly.  Here we do the same for out, tracks, and keep.
+    // geno_v_idxs already uses .as_slice().unwrap() (inner fn line 240) — same contract.
+    let out_flat = out.as_slice_mut().expect("out must be contiguous (C-order)");
+    let tracks_flat = tracks.as_slice().expect("tracks must be contiguous (C-order)");
+    // Hoist keep flat option once (avoids repeated .as_slice() per hap).
+    let keep_flat: Option<&[bool]> =
+        keep.as_ref().map(|k| k.as_slice().expect("keep must be contiguous (C-order)"));
+
+    if parallel {
+        // Build disjoint per-k mutable output slices using the split_at_mut cursor
+        // idiom (mirrors C1 reconstruct_haplotypes_from_sparse parallel path).
+        let bounds: Vec<(usize, usize)> = (0..n_work)
+            .map(|k| (out_offsets[k] as usize, out_offsets[k + 1] as usize))
+            .collect();
+
+        let mut out_chunks: Vec<&mut [f32]> = Vec::with_capacity(n_work);
+        {
+            let mut rest = &mut out_flat[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &bounds {
+                debug_assert!(
+                    s >= cursor && e >= s,
+                    "out_offsets must be monotonically non-decreasing (got s={s}, e={e}, cursor={cursor})"
+                );
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                out_chunks.push(mid);
+                rest = tail2;
+                cursor = e;
+            }
+        }
+
+        out_chunks
+            .into_par_iter()
+            .enumerate()
+            .for_each(|(k, out_chunk)| {
+                let query = k / ploidy;
+                let hap = k % ploidy;
+
+                let t_s = track_offsets[query] as usize;
+                let t_e = track_offsets[query + 1] as usize;
+                let q_track = ndarray::ArrayView1::from(&tracks_flat[t_s..t_e]);
+                let q_start = regions[[query, 1]] as i64;
+                let o_idx = geno_offset_idx[[query, hap]] as usize;
+                let qh_shift = shifts[[query, hap]] as i64;
+
+                let qh_keep: Option<ndarray::ArrayView1<bool>> =
+                    match (&keep_flat, &keep_offsets) {
+                        (Some(k_flat), Some(ko)) => {
+                            let ks = ko[k] as usize;
+                            let ke = ko[k + 1] as usize;
+                            Some(ndarray::ArrayView1::from(&k_flat[ks..ke]))
+                        }
+                        _ => None,
+                    };
+
+                let mut qh_out = ndarray::ArrayViewMut1::from(out_chunk);
+                shift_and_realign_track_sparse(
+                    o_idx,
+                    geno_v_idxs,
+                    geno_o_starts,
+                    geno_o_stops,
+                    v_starts,
+                    ilens,
+                    qh_shift,
+                    q_track,
+                    q_start,
+                    &mut qh_out,
+                    params,
+                    qh_keep,
+                    strategy_id,
+                    base_seed,
+                    query as u64,
+                    hap as u64,
+                );
+            });
+    } else {
+        // Serial path: Numba: for query in nb.prange(n_regions):  (serial equivalent)
+        for query in 0..n_regions {
+            // Numba: t_s, t_e = track_offsets[query], track_offsets[query + 1]
+            let t_s = track_offsets[query] as usize;
+            let t_e = track_offsets[query + 1] as usize;
+            // Numba: q_track = tracks[t_s:t_e]
+            // ArrayView1::from(&slice) is cheaper than tracks.slice(s![..]) — no do_slice call.
+            let q_track = ndarray::ArrayView1::from(&tracks_flat[t_s..t_e]);
+
+            // Numba: q_start = regions[query, 1]
+            let q_start = regions[[query, 1]] as i64;
+
+            // Numba: for hap in nb.prange(ploidy):  (serial equivalent)
+            for hap in 0..ploidy {
+                // Numba: o_idx = geno_offset_idx[query, hap]
+                let o_idx = geno_offset_idx[[query, hap]] as usize;
+
+                // Numba: k_idx = query * ploidy + hap
+                let k_idx = query * ploidy + hap;
+
+                // Numba: if keep is not None and keep_offsets is not None:
+                //            qh_keep = keep[keep_offsets[k_idx]:keep_offsets[k_idx+1]]
+                // ArrayView1::from(&slice[..]) avoids the do_slice call that
+                // k.slice(s![ks..ke]) would generate.
+                let qh_keep: Option<ndarray::ArrayView1<bool>> =
+                    match (&keep_flat, &keep_offsets) {
+                        (Some(k_flat), Some(ko)) => {
+                            let ks = ko[k_idx] as usize;
+                            let ke = ko[k_idx + 1] as usize;
+                            Some(ndarray::ArrayView1::from(&k_flat[ks..ke]))
+                        }
+                        _ => None,
+                    };
+
+                // Numba: out_s, out_e = out_offsets[k_idx], out_offsets[k_idx + 1]
+                let out_s = out_offsets[k_idx] as usize;
+                let out_e = out_offsets[k_idx + 1] as usize;
+                // Numba: qh_out = out[out_s:out_e]; qh_shifts = shifts[query, hap]
+                // ArrayViewMut1::from(&mut slice[..]) avoids the do_slice call that
+                // out.slice_mut(s![out_s..out_e]) would generate.
+                let mut qh_out = ndarray::ArrayViewMut1::from(&mut out_flat[out_s..out_e]);
+                let qh_shift = shifts[[query, hap]] as i64;
+
+                shift_and_realign_track_sparse(
+                    o_idx,
+                    geno_v_idxs,
+                    geno_o_starts,
+                    geno_o_stops,
+                    v_starts,
+                    ilens,
+                    qh_shift,
+                    q_track,
+                    q_start,
+                    &mut qh_out,
+                    params,
+                    qh_keep,
+                    strategy_id,
+                    base_seed,
+                    query as u64,
+                    hap as u64,
+                );
+            }
+        }
+    }
+}
+
+/// RLE-encode a ragged f32 track buffer into (starts, ends, values, offsets) intervals.
+///
+/// Mirrors numba `tracks_to_intervals` + `_scanned_mask` + `_compact_mask` in
+/// `python/genvarloader/_dataset/_intervals.py` lines 129-220, statement-by-statement.
+///
+/// # Algorithm (matches numba exactly)
+/// Two-pass:
+/// 1. For each query, compute `scanned_mask` (cumulative count of value-change positions)
+///    and store `n_intervals[query] = scanned_mask[-1]`.
+/// 2. Cumsum `n_intervals` into `interval_offsets` (i64, mirrors numba's `.cumsum()`).
+/// 3. Fill pass: for each query, recover run boundaries via `compact_mask`, then write
+///    starts/ends/values into the output arrays at `interval_offsets[query]`.
+///
+/// Key fidelity points:
+/// - `backward_mask[0] = true`, `backward_mask[i] = track[i-1] != track[i]` — exact f32 `!=`
+///   (bit-level, not ordered comparison).
+/// - `scanned_mask` = prefix-sum of `backward_mask` (i64 accumulation).
+/// - 0-value intervals ARE included (no filtering on value == 0.0, matches numba comment).
+/// - `starts` and `ends` are absolute genomic coords: `boundaries + regions[query, 1]`.
+/// - Output dtypes: starts/ends i32, values f32, offsets i64.
+pub fn tracks_to_intervals(
+    regions: ArrayView2<i32>,
+    tracks: ArrayView1<f32>,
+    track_offsets: ArrayView1<i64>,
+    parallel: bool,
+) -> (Array1<i32>, Array1<i32>, Array1<f32>, Array1<i64>) {
+    let n_queries = regions.nrows();
+
+    // --- Pass 1: count intervals per query ---
+    // Numba: n_intervals = np.empty(n_queries, np.int32)
+    // Numba: scanned_masks = np.empty_like(tracks, np.int64)
+    // We allocate a single flat scanned_masks buffer mirroring numba's layout.
+    let total_track_len = tracks.len();
+    let mut scanned_masks = vec![0i64; total_track_len];
+    let mut n_intervals = vec![0i32; n_queries];
+
+    if parallel {
+        // Build disjoint per-query mutable slices of scanned_masks (variable-size
+        // chunks per query) using the split_at_mut cursor idiom (mirrors C1).
+        let track_bounds: Vec<(usize, usize)> = (0..n_queries)
+            .map(|q| (track_offsets[q] as usize, track_offsets[q + 1] as usize))
+            .collect();
+
+        let mut scan_chunks: Vec<&mut [i64]> = Vec::with_capacity(n_queries);
+        {
+            let mut rest = &mut scanned_masks[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &track_bounds {
+                let (_, tail) = rest.split_at_mut(s - cursor);
+                let (mid, tail2) = tail.split_at_mut(e - s);
+                scan_chunks.push(mid);
+                rest = tail2;
+                cursor = e;
+            }
+        }
+
+        let tracks_slice = tracks.as_slice().unwrap();
+        scan_chunks
+            .into_par_iter()
+            .zip(n_intervals.par_iter_mut())
+            .enumerate()
+            .for_each(|(query, (scan, n_int))| {
+                let o_s = track_offsets[query] as usize;
+                let o_e = track_offsets[query + 1] as usize;
+                if o_s == o_e {
+                    *n_int = 0;
+                    return;
+                }
+                let track = &tracks_slice[o_s..o_e];
+                let mut acc: i64 = 0;
+                for i in 0..track.len() {
+                    let bm = if i == 0 {
+                        true
+                    } else {
+                        track[i - 1] != track[i]
+                    };
+                    acc += bm as i64;
+                    scan[i] = acc;
+                }
+                *n_int = scan[track.len() - 1] as i32;
+            });
+    } else {
+        for query in 0..n_queries {
+            let o_s = track_offsets[query] as usize;
+            let o_e = track_offsets[query + 1] as usize;
+            // Numba: if o_s == o_e: n_intervals[query] = 0; continue
+            if o_s == o_e {
+                n_intervals[query] = 0;
+                continue;
+            }
+            let track = &tracks.as_slice().unwrap()[o_s..o_e];
+            let scan = &mut scanned_masks[o_s..o_e];
+            // _scanned_mask: backward_mask[0]=True, backward_mask[i] = track[i-1] != track[i]
+            // cumsum into scan (i64 accumulator)
+            // Numba: out[:] = backward_mask.cumsum()
+            let mut acc: i64 = 0;
+            for i in 0..track.len() {
+                let bm = if i == 0 {
+                    true
+                } else {
+                    // Exact f32 != comparison (bit-level, matches numba)
+                    track[i - 1] != track[i]
+                };
+                acc += bm as i64;
+                scan[i] = acc;
+            }
+            // n_intervals[query] = scanned_backward_mask[-1]
+            n_intervals[query] = scan[track.len() - 1] as i32;
+        }
+    }
+
+    // --- Two-pass cumsum: mirrors numba's n_intervals.cumsum() ---
+    // Numba:
+    //   interval_offsets = np.empty(n_queries + 1, np.int64)
+    //   interval_offsets[0] = 0
+    //   interval_offsets[1:] = n_intervals.cumsum()
+    // (stays sequential — prefix-sum has a data dependency chain)
+    let mut interval_offsets = vec![0i64; n_queries + 1];
+    let mut running: i64 = 0;
+    for q in 0..n_queries {
+        running += n_intervals[q] as i64;
+        interval_offsets[q + 1] = running;
+    }
+    let total_intervals = running as usize;
+
+    let mut all_starts = vec![0i32; total_intervals];
+    let mut all_ends = vec![0i32; total_intervals];
+    let mut all_values = vec![0.0f32; total_intervals];
+
+    // --- Pass 2: fill starts/ends/values ---
+    if parallel {
+        // Build disjoint per-query mutable slices from all_starts/ends/values using
+        // interval_offsets (which have already been computed sequentially above).
+        let itv_bounds: Vec<(usize, usize)> = (0..n_queries)
+            .map(|q| (interval_offsets[q] as usize, interval_offsets[q + 1] as usize))
+            .collect();
+
+        let mut starts_chunks: Vec<&mut [i32]> = Vec::with_capacity(n_queries);
+        let mut ends_chunks: Vec<&mut [i32]> = Vec::with_capacity(n_queries);
+        let mut values_chunks: Vec<&mut [f32]> = Vec::with_capacity(n_queries);
+
+        {
+            let mut rest_s = &mut all_starts[..];
+            let mut rest_e = &mut all_ends[..];
+            let mut rest_v = &mut all_values[..];
+            let mut cursor = 0usize;
+            for &(s, e) in &itv_bounds {
+                let (_, tail_s) = rest_s.split_at_mut(s - cursor);
+                let (mid_s, tail_s2) = tail_s.split_at_mut(e - s);
+                starts_chunks.push(mid_s);
+                rest_s = tail_s2;
+
+                let (_, tail_e) = rest_e.split_at_mut(s - cursor);
+                let (mid_e, tail_e2) = tail_e.split_at_mut(e - s);
+                ends_chunks.push(mid_e);
+                rest_e = tail_e2;
+
+                let (_, tail_v) = rest_v.split_at_mut(s - cursor);
+                let (mid_v, tail_v2) = tail_v.split_at_mut(e - s);
+                values_chunks.push(mid_v);
+                rest_v = tail_v2;
+
+                cursor = e;
+            }
+        }
+
+        let tracks_slice = tracks.as_slice().unwrap();
+        starts_chunks
+            .into_par_iter()
+            .zip(ends_chunks.into_par_iter())
+            .zip(values_chunks.into_par_iter())
+            .enumerate()
+            .for_each(|(query, ((s_chunk, e_chunk), v_chunk))| {
+                let o_s = track_offsets[query] as usize;
+                let o_e = track_offsets[query + 1] as usize;
+                if o_s == o_e {
+                    return;
+                }
+                let track = &tracks_slice[o_s..o_e];
+                let scan = &scanned_masks[o_s..o_e];
+                let n_elems = scan.len();
+                let n_runs = scan[n_elems - 1] as usize;
+
+                let mut compacted = vec![0i32; n_runs + 1];
+                compacted[n_runs] = n_elems as i32;
+                for i in 0..n_elems {
+                    if i == 0 {
+                        compacted[0] = 0;
+                    } else if scan[i] != scan[i - 1] {
+                        compacted[scan[i] as usize - 1] = i as i32;
+                    }
+                }
+
+                let start = regions[[query, 1]];
+                for k in 0..n_runs {
+                    s_chunk[k] = compacted[k] + start;
+                    e_chunk[k] = compacted[k + 1] + start;
+                    v_chunk[k] = track[compacted[k] as usize];
+                }
+            });
+    } else {
+        for query in 0..n_queries {
+            let o_s = track_offsets[query] as usize;
+            let o_e = track_offsets[query + 1] as usize;
+            // Numba: if o_s == o_e: continue
+            if o_s == o_e {
+                continue;
+            }
+            let track = &tracks.as_slice().unwrap()[o_s..o_e];
+            let scan = &scanned_masks[o_s..o_e];
+            let n_elems = scan.len();
+            let n_runs = scan[n_elems - 1] as usize;
+
+            // _compact_mask: recovers run-boundary indices
+            // Numba:
+            //   compacted_backward_mask = np.empty(n_runs + 1, np.int32)
+            //   compacted_backward_mask[-1] = n_elems
+            //   for i in prange(n_elems):
+            //       if i == 0: compacted_backward_mask[0] = 0
+            //       elif scan[i] != scan[i-1]: compacted_backward_mask[scan[i] - 1] = i
+            let mut compacted = vec![0i32; n_runs + 1];
+            compacted[n_runs] = n_elems as i32;
+            for i in 0..n_elems {
+                if i == 0 {
+                    compacted[0] = 0;
+                } else if scan[i] != scan[i - 1] {
+                    compacted[scan[i] as usize - 1] = i as i32;
+                }
+            }
+
+            // values = track[compacted[:-1]]
+            // starts/ends = compacted[:-1] + region_start, compacted[1:] + region_start
+            let s = interval_offsets[query] as usize;
+            let start = regions[[query, 1]]; // region start (absolute genomic coord)
+
+            // Numba: compacted_backward_mask += start  (in-place, then used for starts/ends)
+            // We apply the shift at write time to avoid mutating compacted.
+            let n = n_runs; // == len(values)
+            for k in 0..n {
+                all_starts[s + k] = compacted[k] + start;
+                all_ends[s + k] = compacted[k + 1] + start;
+                all_values[s + k] = track[compacted[k] as usize];
+            }
+        }
+    }
+
+    (
+        Array1::from_vec(all_starts),
+        Array1::from_vec(all_ends),
+        Array1::from_vec(all_values),
+        Array1::from_vec(interval_offsets),
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::Array1;
+
+    /// Expected values hand-derived from the numba algorithm (verified by running
+    /// the Python reference implementation with np.uint64 arithmetic).
+    #[test]
+    fn test_xorshift64_vectors() {
+        // xorshift64(1):
+        //   x=1; x ^= 1<<13=0x2000 → 0x2001
+        //   x ^= 0x2001>>7=0x40   → 0x2041
+        //   x ^= 0x2041<<17=0x408200000 → 0x40822041 = 1082269761
+        assert_eq!(xorshift64(1), 1_082_269_761_u64);
+
+        // xorshift64(2) = 2164539522 (verified via Python np.uint64)
+        assert_eq!(xorshift64(2), 2_164_539_522_u64);
+
+        // xorshift64(42) = 45454805674
+        assert_eq!(xorshift64(42), 45_454_805_674_u64);
+
+        // xorshift64(0xdeadbeef) = 4018790486776397394
+        assert_eq!(xorshift64(0xdeadbeef), 4_018_790_486_776_397_394_u64);
+
+        // xorshift64(u64::MAX) — wrapping behaviour: 2**64-1 = 0xffffffffffffffff
+        // result = 0x3f801fc0 = 1065361344 (verified via Python np.uint64)
+        assert_eq!(xorshift64(u64::MAX), 1_065_361_344_u64);
+    }
+
+    #[test]
+    fn test_hash4_vectors() {
+        // hash4(1,2,3,4) = 11323120931611735037 (verified via Python)
+        assert_eq!(hash4(1, 2, 3, 4), 11_323_120_931_611_735_037_u64);
+
+        // hash4(0,0,0,0): h=0; xorshift64(0)=0 at each step → 0
+        assert_eq!(hash4(0, 0, 0, 0), 0_u64);
+
+        // hash4(0xdeadbeef, 0xcafe, 0xbabe, 1) = 5244362157944750963
+        assert_eq!(
+            hash4(0xdeadbeef, 0xcafe, 0xbabe, 1),
+            5_244_362_157_944_750_963_u64
+        );
+    }
+
+    // ------------------------------------------------------------------ //
+    // apply_insertion_fill tests                                           //
+    // ------------------------------------------------------------------ //
+
+    /// Helper: allocate out, run apply_insertion_fill, return the filled slice.
+    fn run_fill(
+        out_size: usize,
+        out_idx: usize,
+        writable_length: usize,
+        v_len: i64,
+        track: &[f32],
+        v_rel_pos: i64,
+        strategy_id: i64,
+        params: &[f64],
+        base_seed: u64,
+        query: u64,
+        hap: u64,
+    ) -> Vec<f32> {
+        let mut out_arr = Array1::<f32>::zeros(out_size);
+        {
+            let mut out_view = out_arr.view_mut();
+            let track_arr = Array1::from_vec(track.to_vec());
+            let params_arr = Array1::from_vec(params.to_vec());
+            apply_insertion_fill(
+                &mut out_view,
+                out_idx,
+                writable_length,
+                v_len,
+                track_arr.view(),
+                v_rel_pos,
+                strategy_id,
+                params_arr.view(),
+                base_seed,
+                query,
+                hap,
+            );
+        }
+        out_arr.to_vec()
+    }
+
+    /// REPEAT_5P_NORM: val = track[v_rel_pos] / v_len (f32/f32 → f32).
+    ///
+    /// track = [1.0, 6.0, 2.0], v_rel_pos = 1 → track[1] = 6.0f32
+    /// v_len = 3 → val = 6.0f32 / 3f32 = 2.0f32
+    /// writable_length = 3 → out[0..3] = [2.0, 2.0, 2.0]
+    /// sum = 6.0 = track[v_rel_pos] ✓ (sum-preserving)
+    #[test]
+    fn test_repeat_5p_norm() {
+        let track = [1.0f32, 6.0, 2.0];
+        let v_rel_pos = 1i64;
+        let v_len = 3i64;
+        let writable_length = 3;
+
+        // val = 6.0f32 / 3f32 = 2.0f32  (exact in f32)
+        let expected_val = 6.0f32 / 3.0f32;
+        let result = run_fill(
+            writable_length,
+            0,
+            writable_length,
+            v_len,
+            &track,
+            v_rel_pos,
+            REPEAT_5P_NORM,
+            &[0.0],
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result.len(), writable_length);
+        for &v in &result {
+            assert_eq!(v, expected_val, "REPEAT_5P_NORM: expected {expected_val}, got {v}");
+        }
+        // Sum preservation check
+        let sum: f32 = result.iter().sum();
+        assert_eq!(sum, track[v_rel_pos as usize]);
+    }
+
+    /// REPEAT_5P_NORM with non-divisible values: verifies f32 precision.
+    ///
+    /// track = [0.0, 1.0, 0.0], v_rel_pos = 1, v_len = 3
+    /// val = 1.0f32 / 3f32 (not exactly representable)
+    #[test]
+    fn test_repeat_5p_norm_precision() {
+        let track = [0.0f32, 1.0, 0.0];
+        let v_rel_pos = 1i64;
+        let v_len = 3i64;
+        let writable_length = 3;
+
+        let expected_val = 1.0f32 / 3.0f32; // same f32 division as numba
+        let result = run_fill(
+            writable_length,
+            0,
+            writable_length,
+            v_len,
+            &track,
+            v_rel_pos,
+            REPEAT_5P_NORM,
+            &[0.0],
+            0,
+            0,
+            0,
+        );
+        for &v in &result {
+            assert_eq!(v, expected_val);
+        }
+    }
+
+    /// CONSTANT: fills every position with params[0] cast to f32.
+    ///
+    /// params[0] = 3.14 (f64), writable_length = 4
+    /// expected: each position = 3.14f64 as f32 = 3.14f32
+    #[test]
+    fn test_constant() {
+        let track = [0.0f32, 0.0, 0.0, 0.0, 0.0];
+        let result = run_fill(5, 1, 4, 1, &track, 0, CONSTANT, &[3.14f64], 0, 0, 0);
+        let expected = 3.14f64 as f32;
+        for i in 1..5 {
+            assert_eq!(result[i], expected, "CONSTANT at position {i}");
+        }
+        // position 0 should be untouched (still 0)
+        assert_eq!(result[0], 0.0f32);
+    }
+
+    /// CONSTANT with NaN: the default Constant(value=NaN) should write NaN.
+    #[test]
+    fn test_constant_nan() {
+        let track = [0.0f32];
+        let result = run_fill(3, 0, 3, 1, &track, 0, CONSTANT, &[f64::NAN], 0, 0, 0);
+        for &v in &result {
+            assert!(v.is_nan(), "expected NaN, got {v}");
+        }
+    }
+
+    /// FLANK_SAMPLE: deterministic given seed.
+    ///
+    /// Setup: track = [10.0, 20.0, 30.0, 40.0, 50.0], v_rel_pos=2, flank_width=1
+    /// pool: pool_lo = max(0, 2-1)=1, pool_hi = min(4, 2+1)=3, pool_size=3
+    /// pool values: track[1..=3] = [20.0, 30.0, 40.0]
+    ///
+    /// For base_seed=42, query=7, hap=1, out_idx=0, writable_length=4:
+    ///
+    /// Hand-derived using verified hash4:
+    ///   i=0: seed = hash4(42, 7, 1, 0); offset = seed % 3; track[1+offset]
+    ///   i=1: seed = hash4(42, 7, 1, 1); offset = seed % 3; track[1+offset]
+    ///   i=2: seed = hash4(42, 7, 1, 2); offset = seed % 3; track[1+offset]
+    ///   i=3: seed = hash4(42, 7, 1, 3); offset = seed % 3; track[1+offset]
+    ///
+    /// Computed by applying xorshift64 chain:
+    ///   hash4(42, 7, 1, 0) = xorshift64(xorshift64(xorshift64(42^7) ^ 1) ^ 0)
+    ///   We compute all hash values first and derive offsets below.
+    #[test]
+    fn test_flank_sample_deterministic() {
+        let track = [10.0f32, 20.0, 30.0, 40.0, 50.0];
+        let v_rel_pos = 2i64;
+        let flank_width = 1i64; // pool_lo=1, pool_hi=3, pool_size=3
+        let pool_lo = 1i64;
+        let pool_size = 3u64;
+
+        let base_seed = 42u64;
+        let query = 7u64;
+        let hap = 1u64;
+        let out_idx = 0usize;
+        let writable_length = 4;
+
+        // Hand-compute the expected hash values and pool indices:
+        // This uses our verified hash4 function.
+        let expected: Vec<f32> = (0..writable_length)
+            .map(|i| {
+                let seed = hash4(base_seed, query, hap, (out_idx + i) as u64);
+                let offset = (seed % pool_size) as i64;
+                track[(pool_lo + offset) as usize]
+            })
+            .collect();
+
+        let result = run_fill(
+            writable_length,
+            out_idx,
+            writable_length,
+            1,
+            &track,
+            v_rel_pos,
+            FLANK_SAMPLE,
+            &[flank_width as f64],
+            base_seed,
+            query,
+            hap,
+        );
+
+        assert_eq!(result, expected, "FLANK_SAMPLE: result did not match expected");
+
+        // Spot-check the first index by computing hash4 explicitly:
+        // hash4(42, 7, 1, 0):
+        //   h = 42
+        //   h = xorshift64(42 ^ 7) = xorshift64(45) = ?
+        let h0 = xorshift64(42 ^ 7); // xorshift64(45)
+        let h1 = xorshift64(h0 ^ 1);
+        let h2 = xorshift64(h1 ^ 0);
+        let offset0 = (h2 % pool_size) as i64;
+        assert_eq!(
+            result[0],
+            track[(pool_lo + offset0) as usize],
+            "FLANK_SAMPLE spot-check i=0 failed"
+        );
+    }
+
+    /// FLANK_SAMPLE with out_idx > 0: verifies that out_idx+i is used, not just i.
+    #[test]
+    fn test_flank_sample_out_idx_offset() {
+        let track = [10.0f32, 20.0, 30.0, 40.0, 50.0];
+        let v_rel_pos = 2i64;
+        let flank_width = 1i64;
+        let pool_lo = 1i64;
+        let pool_size = 3u64;
+        let base_seed = 100u64;
+        let query = 3u64;
+        let hap = 0u64;
+        let out_idx = 5usize;
+        let writable_length = 3;
+
+        let expected: Vec<f32> = (0..writable_length)
+            .map(|i| {
+                let seed = hash4(base_seed, query, hap, (out_idx + i) as u64);
+                let offset = (seed % pool_size) as i64;
+                track[(pool_lo + offset) as usize]
+            })
+            .collect();
+
+        let mut out_arr = Array1::<f32>::zeros(out_idx + writable_length);
+        {
+            let mut out_view = out_arr.view_mut();
+            let track_arr = Array1::from_vec(track.to_vec());
+            let params_arr = Array1::from_vec(vec![flank_width as f64]);
+            apply_insertion_fill(
+                &mut out_view,
+                out_idx,
+                writable_length,
+                1,
+                track_arr.view(),
+                v_rel_pos,
+                FLANK_SAMPLE,
+                params_arr.view(),
+                base_seed,
+                query,
+                hap,
+            );
+        }
+        let result: Vec<f32> = out_arr.iter().skip(out_idx).cloned().collect();
+        assert_eq!(result, expected, "FLANK_SAMPLE out_idx offset test failed");
+    }
+
+    /// INTERPOLATE order=1 (linear interpolation).
+    ///
+    /// order=1 → k = ceil(2/2) = 1, n_anchors = 2
+    /// track = [0.0, 4.0, 8.0] (indices 0,1,2), v_rel_pos=1, v_len=3
+    ///
+    /// Anchors (5' then 3' side):
+    ///   xs[0] = -0.0 = 0.0, ys[0] = track[max(1-0,0)=1] = 4.0
+    ///   xs[1] = 3.0+0.0 = 3.0, ys[1] = track[min(1+1+0,2)=2] = 8.0
+    ///
+    /// Lagrange at x=0: term_0 = 4.0 * (0-3)/(0-3) = 4.0*(-3/-3) = 4.0*1.0 = 4.0
+    ///                  term_1 = 8.0 * (0-0)/(3-0) = 8.0*0 = 0.0; acc=4.0
+    /// Lagrange at x=1: term_0 = 4.0 * (1-3)/(0-3) = 4.0*(-2/-3) = 4.0*0.6667 = 2.6667
+    ///                  term_1 = 8.0 * (1-0)/(3-0) = 8.0*(1/3) = 2.6667; acc=5.3333
+    /// Lagrange at x=2: term_0 = 4.0 * (2-3)/(0-3) = 4.0*(1/3) = 1.3333
+    ///                  term_1 = 8.0 * (2-0)/(3-0) = 8.0*(2/3) = 5.3333; acc=6.6667
+    ///
+    /// Check endpoints: at x=0 → 4.0 = track[1] ✓; at x=3 → 8.0 = track[2] ✓
+    #[test]
+    fn test_interpolate_order1() {
+        let track = [0.0f32, 4.0, 8.0];
+        let v_rel_pos = 1i64;
+        let v_len = 3i64;
+        let writable_length = 3;
+
+        // Hand-computed Lagrange values (f64 arithmetic, stored to f32):
+        // xs = [0.0, 3.0], ys = [4.0, 8.0]
+        // x=0: acc = 4.0*(0-3)/(0-3) + 8.0*(0-0)/(3-0) = 4.0 + 0.0 = 4.0
+        // x=1: acc = 4.0*(1-3)/(0-3) + 8.0*(1-0)/(3-0) = 4.0*(2/3) + 8.0*(1/3)
+        //           = 8.0/3.0 + 8.0/3.0 = 16.0/3.0
+        // x=2: acc = 4.0*(2-3)/(0-3) + 8.0*(2-0)/(3-0) = 4.0*(1/3) + 8.0*(2/3)
+        //           = 4.0/3.0 + 16.0/3.0 = 20.0/3.0
+        let xs = [0.0f64, 3.0f64];
+        let ys = [4.0f64, 8.0f64];
+        let expected: Vec<f32> = (0..writable_length)
+            .map(|i| {
+                let x = i as f64;
+                let mut acc = 0.0f64;
+                for a in 0..2usize {
+                    let mut term = ys[a];
+                    for b in 0..2usize {
+                        if b == a { continue; }
+                        term *= (x - xs[b]) / (xs[a] - xs[b]);
+                    }
+                    acc += term;
+                }
+                acc as f32
+            })
+            .collect();
+
+        let result = run_fill(
+            writable_length,
+            0,
+            writable_length,
+            v_len,
+            &track,
+            v_rel_pos,
+            INTERPOLATE,
+            &[1.0f64], // order=1
+            0,
+            0,
+            0,
+        );
+
+        assert_eq!(result.len(), writable_length);
+        // Endpoint check: at i=0, result should equal ys[0]=track[v_rel_pos]=4.0
+        assert_eq!(result[0], 4.0f32, "order=1 left endpoint must equal track[v_rel_pos]");
+        for (i, (&got, &exp)) in result.iter().zip(expected.iter()).enumerate() {
+            assert_eq!(got, exp, "INTERPOLATE order=1 at i={i}: got {got}, expected {exp}");
+        }
+    }
+
+    /// INTERPOLATE order=2.
+    ///
+    /// order=2 → k = ceil(3/2) = 2, n_anchors = 4
+    /// track = [1.0, 2.0, 4.0, 8.0, 16.0], v_rel_pos=2, v_len=2
+    ///
+    /// Anchors:
+    ///   5' side (j=0,1):
+    ///     xs[0]=-0.0=0.0, ys[0]=track[max(2-0,0)=2]=4.0
+    ///     xs[1]=-1.0,     ys[1]=track[max(2-1,0)=1]=2.0
+    ///   3' side (j=0,1):
+    ///     xs[2]=2.0+0.0=2.0, ys[2]=track[min(2+1+0,4)=3]=8.0
+    ///     xs[3]=2.0+1.0=3.0, ys[3]=track[min(2+1+1,4)=4]=16.0
+    ///
+    /// Lagrange at x=0,1 hand-computed via the same formula.
+    #[test]
+    fn test_interpolate_order2() {
+        let track = [1.0f32, 2.0, 4.0, 8.0, 16.0];
+        let v_rel_pos = 2i64;
+        let v_len = 2i64;
+        let writable_length = 2;
+
+        // Anchors: xs=[0.0, -1.0, 2.0, 3.0], ys=[4.0, 2.0, 8.0, 16.0]
+        let xs = [0.0f64, -1.0f64, 2.0f64, 3.0f64];
+        let ys = [4.0f64, 2.0f64, 8.0f64, 16.0f64];
+        let n = 4usize;
+
+        let expected: Vec<f32> = (0..writable_length)
+            .map(|i| {
+                let x = i as f64;
+                let mut acc = 0.0f64;
+                for a in 0..n {
+                    let mut term = ys[a];
+                    for b in 0..n {
+                        if b == a { continue; }
+                        term *= (x - xs[b]) / (xs[a] - xs[b]);
+                    }
+                    acc += term;
+                }
+                acc as f32
+            })
+            .collect();
+
+        let result = run_fill(
+            writable_length,
+            0,
+            writable_length,
+            v_len,
+            &track,
+            v_rel_pos,
+            INTERPOLATE,
+            &[2.0f64], // order=2
+            0,
+            0,
+            0,
+        );
+
+        // At x=0, result should equal ys[0] = track[v_rel_pos] = 4.0
+        assert_eq!(result[0], 4.0f32, "order=2 left endpoint must equal track[v_rel_pos]");
+        for (i, (&got, &exp)) in result.iter().zip(expected.iter()).enumerate() {
+            assert_eq!(got, exp, "INTERPOLATE order=2 at i={i}: got {got}, expected {exp}");
+        }
+    }
+
+    /// INTERPOLATE order=3.
+    ///
+    /// order=3 → k = ceil(4/2) = 2, n_anchors = 4 (same as order=2)
+    /// (The numba formula k=(order+1+1)//2 gives k=2 for both order=2 and order=3)
+    /// track = [3.0, 1.0, 5.0, 9.0, 2.0, 6.0], v_rel_pos=2, v_len=4
+    ///
+    /// Anchors:
+    ///   5' side (j=0,1):
+    ///     xs[0]=0.0, ys[0]=track[2]=5.0
+    ///     xs[1]=-1.0, ys[1]=track[1]=1.0
+    ///   3' side (j=0,1):
+    ///     xs[2]=4.0, ys[2]=track[3]=9.0
+    ///     xs[3]=5.0, ys[3]=track[4]=2.0
+    #[test]
+    fn test_interpolate_order3() {
+        let track = [3.0f32, 1.0, 5.0, 9.0, 2.0, 6.0];
+        let v_rel_pos = 2i64;
+        let v_len = 4i64;
+        let writable_length = 4;
+
+        // k=2, n_anchors=4
+        let xs = [0.0f64, -1.0f64, 4.0f64, 5.0f64];
+        let ys = [5.0f64, 1.0f64, 9.0f64, 2.0f64];
+        let n = 4usize;
+
+        let expected: Vec<f32> = (0..writable_length)
+            .map(|i| {
+                let x = i as f64;
+                let mut acc = 0.0f64;
+                for a in 0..n {
+                    let mut term = ys[a];
+                    for b in 0..n {
+                        if b == a { continue; }
+                        term *= (x - xs[b]) / (xs[a] - xs[b]);
+                    }
+                    acc += term;
+                }
+                acc as f32
+            })
+            .collect();
+
+        let result = run_fill(
+            writable_length,
+            0,
+            writable_length,
+            v_len,
+            &track,
+            v_rel_pos,
+            INTERPOLATE,
+            &[3.0f64], // order=3
+            0,
+            0,
+            0,
+        );
+
+        // At x=0, result should equal track[v_rel_pos]=5.0
+        assert_eq!(result[0], 5.0f32, "order=3 left endpoint must equal track[v_rel_pos]");
+        for (i, (&got, &exp)) in result.iter().zip(expected.iter()).enumerate() {
+            assert_eq!(got, exp, "INTERPOLATE order=3 at i={i}: got {got}, expected {exp}");
+        }
+    }
+
+    /// INTERPOLATE: verify that order=1 at x=v_len gives the 3' anchor value.
+    ///
+    /// With track=[2.0, 10.0, 6.0], v_rel_pos=1, v_len=2:
+    ///   xs=[0.0, 2.0], ys=[10.0, 6.0]
+    ///   At x=0: acc = 10.0*(0-2)/(0-2) + 6.0*(0-0)/(2-0) = 10.0 + 0.0 = 10.0 ✓
+    ///   At x=1: acc = 10.0*(1-2)/(0-2) + 6.0*(1-0)/(2-0) = 10.0*0.5 + 6.0*0.5 = 8.0
+    ///   (Note: x=v_len=2 would be exactly 6.0 but writable_length=2 so we test x=0,1)
+    #[test]
+    fn test_interpolate_order1_endpoints() {
+        let track = [2.0f32, 10.0, 6.0];
+        let v_rel_pos = 1i64;
+        let v_len = 2i64;
+
+        // writable_length = v_len = 2, covering x=0,1
+        let result = run_fill(
+            2,
+            0,
+            2,
+            v_len,
+            &track,
+            v_rel_pos,
+            INTERPOLATE,
+            &[1.0f64],
+            0,
+            0,
+            0,
+        );
+
+        // x=0 must equal track[v_rel_pos] = 10.0
+        assert_eq!(result[0], 10.0f32, "left endpoint");
+
+        // x=1: hand-computed
+        // xs=[0.0, 2.0], ys=[10.0, 6.0]
+        // term_0 = 10.0 * (1-2)/(0-2) = 10.0 * 0.5 = 5.0
+        // term_1 = 6.0 * (1-0)/(2-0) = 6.0 * 0.5 = 3.0; acc=8.0
+        let x = 1.0f64;
+        let xs = [0.0f64, 2.0f64];
+        let ys = [10.0f64, 6.0f64];
+        let mut acc = 0.0f64;
+        for a in 0..2 {
+            let mut term = ys[a];
+            for b in 0..2 {
+                if b == a { continue; }
+                term *= (x - xs[b]) / (xs[a] - xs[b]);
+            }
+            acc += term;
+        }
+        assert_eq!(result[1], acc as f32, "midpoint check");
+    }
+
+    /// REPEAT_5P: fills with track[v_rel_pos] directly.
+    #[test]
+    fn test_repeat_5p() {
+        let track = [5.0f32, 11.0, 7.0];
+        let v_rel_pos = 1i64;
+        let result = run_fill(4, 0, 4, 4, &track, v_rel_pos, REPEAT_5P, &[0.0], 0, 0, 0);
+        for &v in &result {
+            assert_eq!(v, 11.0f32, "REPEAT_5P: expected 11.0");
+        }
+    }
+
+    // ================================================================== //
+    // shift_and_realign_track_sparse tests                                //
+    // ================================================================== //
+
+    /// Helper to build the split (2, n) offsets and call `shift_and_realign_track_sparse`.
+    fn run_singular(
+        geno_v_idxs: &[i32],
+        geno_offsets_1d: &[i64], // 1-D (n+1)
+        offset_idx: usize,
+        v_starts: &[i32],
+        ilens: &[i32],
+        shift: i64,
+        track: &[f32],
+        query_start: i64,
+        out_len: usize,
+        params: &[f64],
+        keep: Option<&[bool]>,
+        strategy_id: i64,
+        base_seed: u64,
+        query: u64,
+        hap: u64,
+    ) -> Vec<f32> {
+        use ndarray::Array1;
+        let n = geno_offsets_1d.len() - 1;
+        let o_starts: Vec<i64> = geno_offsets_1d[..n].to_vec();
+        let o_stops: Vec<i64> = geno_offsets_1d[1..].to_vec();
+
+        let gvi_arr = Array1::from_vec(geno_v_idxs.to_vec());
+        let os_arr = Array1::from_vec(o_starts);
+        let oe_arr = Array1::from_vec(o_stops);
+        let vs_arr = Array1::from_vec(v_starts.to_vec());
+        let il_arr = Array1::from_vec(ilens.to_vec());
+        let track_arr = Array1::from_vec(track.to_vec());
+        let params_arr = Array1::from_vec(params.to_vec());
+
+        let mut out_arr = Array1::<f32>::zeros(out_len);
+        {
+            let mut out_view = out_arr.view_mut();
+            let keep_arr_opt = keep.map(|k| Array1::from_vec(k.to_vec()));
+            let keep_view = keep_arr_opt.as_ref().map(|a| a.view());
+            shift_and_realign_track_sparse(
+                offset_idx,
+                gvi_arr.view(),
+                os_arr.view(),
+                oe_arr.view(),
+                vs_arr.view(),
+                il_arr.view(),
+                shift,
+                track_arr.view(),
+                query_start,
+                &mut out_view,
+                params_arr.view(),
+                keep_view,
+                strategy_id,
+                base_seed,
+                query,
+                hap,
+            );
+        }
+        out_arr.to_vec()
+    }
+
+    /// No variants → out = track[:length] (shift must be 0).
+    #[test]
+    fn test_singular_no_variants() {
+        // track = [1.0, 2.0, 3.0, 4.0, 5.0], no variants, out_len = 4
+        let track = [1.0f32, 2.0, 3.0, 4.0, 5.0];
+        let geno_v_idxs: Vec<i32> = vec![];
+        let geno_offsets = vec![0i64, 0]; // one empty group
+        let v_starts: Vec<i32> = vec![];
+        let ilens: Vec<i32> = vec![];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0, // shift
+            &track,
+            0, // query_start
+            4, // out_len
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result, [1.0f32, 2.0, 3.0, 4.0], "no variants: copy track[:length]");
+    }
+
+    /// Deletion: track[v_rel_pos] repeated for writable_length; track advances by
+    /// |v_rel_end|.
+    ///
+    /// Setup:
+    ///   track = [10.0, 20.0, 30.0, 40.0, 50.0], query_start = 0, out_len = 4
+    ///   variant at v_start=1, ilen=-2 → v_rel_pos=1, v_diff=-2, v_rel_end=4
+    ///   v_len = max(0,-2)+1 = 1
+    ///   Expected: track[0..1] = [10.0], then track[1] repeated 1 time = [20.0],
+    ///   then track[4:] = [50.0], padded 0.0 if needed.
+    ///   Actually: out[0] = track[0] = 10.0 (ref up to v_rel_pos=1, track_len=1-0=1)
+    ///             out[1] = track[v_rel_pos=1] = 20.0 (repeated 1 time = v_len=1)
+    ///             track_idx = v_rel_end = 4; out_idx = 2
+    ///             fill rest: track[4:] = [50.0] → out[2] = 50.0; out[3] = 0.0 (pad)
+    #[test]
+    fn test_singular_deletion() {
+        let track = [10.0f32, 20.0, 30.0, 40.0, 50.0];
+        let v_starts = [1i32]; // v_start = 1
+        let ilens = [-2i32]; // deletion of 2 → v_rel_end = 1 - (-2) + 1 = 4... wait
+        // v_rel_end = v_rel_pos - min(0, v_diff) + 1 = 1 - (-2) + 1 = 4
+        // Actually: v_rel_end = 1 - min(0, -2) + 1 = 1 - (-2) + 1 = 4
+        // v_len = max(0, -2) + 1 = 0 + 1 = 1
+        // track up to v_rel_pos=1: track[0..1] = [10.0], out[0] = 10.0
+        // v_len=1 repeated: out[1] = track[1] = 20.0
+        // track_idx = 4; remaining: track[4..5] = [50.0] → out[2] = 50.0
+        // out[3] = 0.0 (trailing pad)
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0,
+            &track,
+            0,
+            4,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result[0], 10.0f32, "ref before deletion");
+        assert_eq!(result[1], 20.0f32, "deletion: track[v_rel_pos] repeated");
+        assert_eq!(result[2], 50.0f32, "ref after deletion (track_idx=4)");
+        assert_eq!(result[3], 0.0f32, "trailing pad = 0.0");
+    }
+
+    /// Deletion whose `v_rel_end` runs past track end — trailing pad starts from out_idx.
+    ///
+    /// When a deletion is so large that `v_rel_end` exceeds `track_len`, `track_idx`
+    /// advances past the end of `track`, making `writable_ref` negative.  The fixed
+    /// kernel clamps `out_end_idx` to `out_idx` (matching the fixed numba kernel's
+    /// `max(0, min(unfilled, len(track)-track_idx))`), so the zero-pad covers exactly
+    /// `out[out_idx..length]` without overwriting already-written positions.
+    ///
+    /// Setup:
+    ///   track = [1.0, 2.0, 3.0, 4.0, 5.0] (track_len=5), query_start=0, out_len=8
+    ///   variant at v_start=3, ilen=-3 → v_rel_pos=3, v_diff=-3, v_rel_end=3-(-3)+1=7
+    ///   v_len = max(0,-3)+1 = 1
+    ///
+    /// Main loop:
+    ///   copy track[0..3] → out[0..3] = [1,2,3]; out_idx=3
+    ///   deletion REPEAT_5P: out[3] = track[3] = 4.0; out_idx=4
+    ///   track_idx = v_rel_end = 7  (past track end = 5!)
+    ///
+    /// Trailing fill (correct):
+    ///   writable_ref = min(4, 5-7) = -2  ← negative, no track bytes remain
+    ///   out_end_idx = out_idx = 4  (NOT (4 + -2).max(0) = 2)
+    ///   out[4..8] = 0.0
+    ///   Final: [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]
+    #[test]
+    fn test_singular_deletion_past_track_end() {
+        // track_len=5, out_len=8, deletion at v_start=3 with ilen=-3
+        let track = [1.0f32, 2.0, 3.0, 4.0, 5.0];
+        let v_starts = [3i32];
+        let ilens = [-3i32]; // v_diff=-3, v_rel_end = 3-(-3)+1 = 7 (past track_len=5)
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0, // shift
+            &track,
+            0, // query_start
+            8, // out_len
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+
+        // out[0..4] from main loop; zero-pad covers out[4..8] from out_idx (not index 2).
+        assert_eq!(result[0], 1.0f32, "ref[0]");
+        assert_eq!(result[1], 2.0f32, "ref[1]");
+        assert_eq!(result[2], 3.0f32, "ref[2] — must NOT be overwritten by zero-pad");
+        assert_eq!(result[3], 4.0f32, "deletion REPEAT_5P value — must NOT be overwritten");
+        assert_eq!(result[4], 0.0f32, "zero-pad[4]");
+        assert_eq!(result[5], 0.0f32, "zero-pad[5]");
+        assert_eq!(result[6], 0.0f32, "zero-pad[6]");
+        assert_eq!(result[7], 0.0f32, "zero-pad[7]");
+    }
+
+    /// Deletion drives track_idx past the track end (overshoot) — trailing pad from out_idx.
+    ///
+    /// Mirrors ``overshoot_ref_past_contig`` from reconstruct/mod.rs.
+    /// When writable_ref <= 0, out_end_idx must be clamped to out_idx so that
+    /// out[out_idx..length] is zero-padded without overwriting already-written positions.
+    ///
+    /// The fixed numba kernel uses ``max(0, min(unfilled, len(track)-track_idx))``,
+    /// giving writable_ref=0 and out_end_idx=out_idx. The Rust kernel must match.
+    ///
+    /// Setup (identical to test_singular_deletion_past_track_end):
+    ///   track=[1,2,3,4,5] (len=5), out_len=8, deletion at v_start=3, ilen=-3
+    ///   v_rel_end=7 (>track_len=5) → track_idx advances past track end
+    ///   After main loop: out[0..4]=[1,2,3,4], out_idx=4, track_idx=7
+    ///
+    /// Trailing fill (correct):
+    ///   writable_ref = min(4, 5-7) = -2  ← negative
+    ///   out_end_idx = out_idx = 4  (NOT (4 + -2).max(0) = 2)
+    ///   out[4..8] = 0.0
+    ///   Expected: [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]
+    #[test]
+    fn overshoot_track_past_end() {
+        let track = [1.0f32, 2.0, 3.0, 4.0, 5.0];
+        let v_starts = [3i32];
+        let ilens = [-3i32];
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0,
+            &track,
+            0,
+            8,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        // out[0..4] from main loop; out[4..8] zero-padded from out_idx (not index 2)
+        assert_eq!(
+            result,
+            [1.0f32, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0],
+            "overshoot: zero-pad must start from out_idx=4, not (out_idx+writable_ref).max(0)=2"
+        );
+    }
+
+    /// SNP (ilen=0) is SKIPPED — the output copies reference track straight through.
+    ///
+    /// Setup: track = [1.0, 2.0, 3.0, 4.0], query_start=0, out_len=4
+    ///   variant at v_start=2, ilen=0 → SNP, should be skipped
+    ///   Expected: out = [1.0, 2.0, 3.0, 4.0] (identical to track, SNP doesn't interrupt)
+    #[test]
+    fn test_singular_snp_skipped() {
+        let track = [1.0f32, 2.0, 3.0, 4.0];
+        let v_starts = [2i32];
+        let ilens = [0i32]; // SNP
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0,
+            &track,
+            0,
+            4,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        // SNP is skipped — output equals track[:length]
+        assert_eq!(result, [1.0f32, 2.0, 3.0, 4.0], "SNP must be skipped for tracks");
+    }
+
+    /// Insertion with REPEAT_5P strategy: repeated track[v_rel_pos].
+    ///
+    /// Setup: track = [5.0, 10.0, 15.0, 20.0, 25.0], query_start=0, out_len=6
+    ///   variant at v_start=1, ilen=+2 → v_rel_pos=1, v_diff=2, v_rel_end=2
+    ///   v_len = max(0,2)+1 = 3
+    ///   REPEAT_5P: repeat track[v_rel_pos=1]=10.0 for writable_length=min(3, 6-1)=3
+    ///   ref before: track[0..1] = [5.0] → out[0]
+    ///   insertion: out[1..4] = [10.0, 10.0, 10.0]
+    ///   track_idx = v_rel_end = 2; remaining: track[2..5] → out[4..6] = [15.0, 20.0]
+    #[test]
+    fn test_singular_insertion_repeat5p() {
+        let track = [5.0f32, 10.0, 15.0, 20.0, 25.0];
+        let v_starts = [1i32];
+        let ilens = [2i32]; // insertion
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0,
+            &track,
+            0,
+            6,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result[0], 5.0f32, "ref before insertion");
+        assert_eq!(result[1], 10.0f32, "insertion REPEAT_5P i=0");
+        assert_eq!(result[2], 10.0f32, "insertion REPEAT_5P i=1");
+        assert_eq!(result[3], 10.0f32, "insertion REPEAT_5P i=2");
+        assert_eq!(result[4], 15.0f32, "ref after insertion (track[2])");
+        assert_eq!(result[5], 20.0f32, "ref after insertion (track[3])");
+    }
+
+    /// Insertion with CONSTANT strategy: fills with params[0].
+    #[test]
+    fn test_singular_insertion_constant() {
+        let track = [5.0f32, 10.0, 15.0, 20.0];
+        let v_starts = [1i32];
+        let ilens = [1i32]; // insertion: v_len = 2
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+        let fill_val = 99.0f64;
+
+        // out_len=5: ref[0..1]=[5.0], ins[1..3]=[99.0,99.0], ref after=track[2..4]
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0,
+            &track,
+            0,
+            5,
+            &[fill_val],
+            None,
+            CONSTANT,
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result[0], 5.0f32, "ref before insertion");
+        assert_eq!(result[1], fill_val as f32, "CONSTANT fill i=0");
+        assert_eq!(result[2], fill_val as f32, "CONSTANT fill i=1");
+        assert_eq!(result[3], 15.0f32, "ref after insertion (track[2])");
+        assert_eq!(result[4], 20.0f32, "ref after insertion (track[3])");
+    }
+
+    /// Shift: when shift > 0, track values are consumed from a later position.
+    ///
+    /// track = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0], shift=2, no variants, out_len=4
+    /// Expected: track[2..6] = [2.0, 3.0, 4.0, 5.0]
+    #[test]
+    fn test_singular_shift_no_variants() {
+        // With no variants, shift > 0 is handled by the post-loop track_idx adjustment.
+        // Numba: if shifted < shift: track_idx += shift - shifted; ...
+        // But the loop is never entered, so shifted stays 0.
+        // Post-loop: track_idx = 0 + shift = 2; writable_ref = min(4, 6-2) = 4
+        let track = [0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
+        let geno_v_idxs: Vec<i32> = vec![];
+        let geno_offsets = vec![0i64, 0]; // empty group
+        let v_starts: Vec<i32> = vec![];
+        let ilens: Vec<i32> = vec![];
+
+        // Note: numba says "guaranteed to have shift = 0" when n_variants == 0,
+        // so this tests the case where the variant list is empty BUT shift is 0.
+        // For non-zero shift with no variants, it's technically undefined (won't be
+        // called in production), but let's verify shift=0 with an offset.
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            0, // shift=0 (no variants path)
+            &track,
+            0,
+            4,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        assert_eq!(result, [0.0f32, 1.0, 2.0, 3.0], "no variants + shift=0: copy track[:4]");
+    }
+
+    /// Shift=2 with one insertion variant: verify shift-through-variant logic.
+    ///
+    /// track=[0,1,2,3,4,5,6], query_start=0, shift=2, out_len=4
+    /// Insertion at v_start=1, ilen=+3 → v_rel_pos=1, v_len=4
+    ///
+    /// ref_shift_dist = 1 - 0 = 1
+    /// shifted + ref_shift_dist + v_len = 0 + 1 + 4 = 5 >= shift=2, so NOT "need more"
+    /// shifted + ref_shift_dist = 0 + 1 = 1 < shift=2, so NOT "can finish without variant"
+    /// allele_start_idx = 2 - 0 - 1 = 1; shifted=2; allele_start_idx(1) != v_len(4)
+    /// track_idx = v_rel_pos = 1; v_len -= 1 → v_len = 3
+    ///
+    /// Then v_diff=3 > 0, strategy=REPEAT_5P: repeat track[v_rel_pos=1]=1.0 for writable=min(3,4)=3
+    /// out[0..3] = [1.0, 1.0, 1.0]; track_idx = v_rel_end = 2; out_idx = 3
+    /// fill rest: track[2:] → out[3] = track[2] = 2.0
+    #[test]
+    fn test_singular_shift_through_insertion() {
+        let track: Vec<f32> = (0..7).map(|x| x as f32).collect();
+        let v_starts = [1i32]; // insertion at pos 1
+        let ilens = [3i32]; // +3 → v_len = 4, v_rel_end = 1 - 0 + 1 = 2
+        let geno_v_idxs = [0i32];
+        let geno_offsets = [0i64, 1];
+
+        let result = run_singular(
+            &geno_v_idxs,
+            &geno_offsets,
+            0,
+            &v_starts,
+            &ilens,
+            2, // shift
+            &track,
+            0,
+            4,
+            &[0.0],
+            None,
+            REPEAT_5P,
+            0,
+            0,
+            0,
+        );
+        // shifted=2, allele_start_idx=1 ≠ v_len=4 → track_idx=1, v_len=3
+        // v_diff=3≠0 and REPEAT_5P: out[0..3] = track[v_rel_pos=1] = 1.0
+        // out[3] = track[2] = 2.0
+        assert_eq!(result[0], 1.0f32, "insertion repeat after shift");
+        assert_eq!(result[1], 1.0f32, "insertion repeat");
+        assert_eq!(result[2], 1.0f32, "insertion repeat");
+        assert_eq!(result[3], 2.0f32, "ref after insertion");
+    }
+
+    // ================================================================== //
+    // shift_and_realign_tracks_sparse (batch) tests                      //
+    // ================================================================== //
+
+    /// Helper for the batch function.
+    fn run_batch(
+        out_len: usize,
+        out_offsets: &[i64],
+        regions: &[[i32; 3]],
+        shifts: &[i32],   // flat, will be reshaped (n_q, ploidy)
+        geno_offset_idx: &[i64], // flat (n_q * ploidy)
+        geno_v_idxs: &[i32],
+        geno_offsets_1d: &[i64],
+        v_starts: &[i32],
+        ilens: &[i32],
+        tracks: &[f32],
+        track_offsets: &[i64],
+        params: &[f64],
+        keep: Option<(&[bool], &[i64])>,
+        strategy_id: i64,
+        base_seed: u64,
+        ploidy: usize,
+        parallel: bool,
+    ) -> Vec<f32> {
+        use ndarray::{Array1, Array2};
+        let n_q = regions.len();
+        // Build (2, n_q*ploidy) offsets
+        let n = geno_offsets_1d.len() - 1;
+        let o_starts: Vec<i64> = geno_offsets_1d[..n].to_vec();
+        let o_stops: Vec<i64> = geno_offsets_1d[1..].to_vec();
+
+        let regions_arr = Array2::from_shape_vec(
+            (n_q, 3),
+            regions.iter().flat_map(|r| r.iter().cloned()).collect(),
+        )
+        .unwrap();
+        let shifts_arr = Array2::from_shape_vec(
+            (n_q, ploidy),
+            shifts.to_vec(),
+        )
+        .unwrap();
+        let goi_arr = Array2::from_shape_vec(
+            (n_q, ploidy),
+            geno_offset_idx.to_vec(),
+        )
+        .unwrap();
+
+        let out_offsets_arr = Array1::from_vec(out_offsets.to_vec());
+        let gvi_arr = Array1::from_vec(geno_v_idxs.to_vec());
+        let os_arr = Array1::from_vec(o_starts);
+        let oe_arr = Array1::from_vec(o_stops);
+        let vs_arr = Array1::from_vec(v_starts.to_vec());
+        let il_arr = Array1::from_vec(ilens.to_vec());
+        let tracks_arr = Array1::from_vec(tracks.to_vec());
+        let to_arr = Array1::from_vec(track_offsets.to_vec());
+        let params_arr = Array1::from_vec(params.to_vec());
+
+        let mut out_arr = Array1::<f32>::zeros(out_len);
+
+        let (keep_arr_opt, keep_off_arr_opt) = if let Some((k, ko)) = keep {
+            (
+                Some(Array1::from_vec(k.to_vec())),
+                Some(Array1::from_vec(ko.to_vec())),
+            )
+        } else {
+            (None, None)
+        };
+
+        shift_and_realign_tracks_sparse(
+            out_arr.view_mut(),
+            out_offsets_arr.view(),
+            regions_arr.view(),
+            shifts_arr.view(),
+            goi_arr.view(),
+            gvi_arr.view(),
+            os_arr.view(),
+            oe_arr.view(),
+            vs_arr.view(),
+            il_arr.view(),
+            tracks_arr.view(),
+            to_arr.view(),
+            params_arr.view(),
+            keep_arr_opt.as_ref().map(|a| a.view()),
+            keep_off_arr_opt.as_ref().map(|a| a.view()),
+            strategy_id,
+            base_seed,
+            parallel,
+        );
+
+        out_arr.to_vec()
+    }
+
+    /// Batch with 1 query, 1 hap, no variants → copies track.
+    #[test]
+    fn test_batch_single_no_variants() {
+        // track = [1.0, 2.0, 3.0, 4.0, 5.0] for query 0
+        let tracks = [1.0f32, 2.0, 3.0, 4.0, 5.0];
+        let regions = [[0i32, 0, 4]]; // length=4
+        let shifts = [0i32];
+        let geno_offset_idx = [0i64]; // (1, 1)
+        let geno_v_idxs: Vec<i32> = vec![];
+        let geno_offsets = [0i64, 0]; // empty group
+        let v_starts: Vec<i32> = vec![];
+        let ilens: Vec<i32> = vec![];
+        let track_offsets = [0i64, 5];
+        let out_offsets = [0i64, 4];
+        let params = [0.0f64];
+
+        let result = run_batch(
+            4,
+            &out_offsets,
+            &regions,
+            &shifts,
+            &geno_offset_idx,
+            &geno_v_idxs,
+            &geno_offsets,
+            &v_starts,
+            &ilens,
+            &tracks,
+            &track_offsets,
+            &params,
+            None,
+            REPEAT_5P,
+            0,
+            1, // ploidy
+            false,
+        );
+        assert_eq!(result, [1.0f32, 2.0, 3.0, 4.0], "batch single: copy track[:4]");
+    }
+
+    /// Batch with 2 queries, 1 hap each, SNPs — must pass through unchanged.
+    #[test]
+    fn test_batch_two_queries_snps() {
+        // query 0: track[0..3] = [1.0, 2.0, 3.0], SNP at pos 1 (skipped) → out=[1,2,3]
+        // query 1: track[3..6] = [4.0, 5.0, 6.0], no variants → out=[4,5,6]
+        let tracks = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
+        let regions = [[0i32, 0, 3], [0, 10, 13]];
+        let shifts = [0i32, 0];
+        let geno_offset_idx = [0i64, 1]; // q0→group0, q1→group1
+        let geno_v_idxs = [0i32]; // query 0 has SNP variant 0
+        let v_starts = [1i32]; // v at pos 1 (within q0 [0,3))
+        let ilens = [0i32]; // SNP → should be skipped
+        let geno_offsets = [0i64, 1, 1]; // group0=[0..1], group1=[1..1]=empty
+        let track_offsets = [0i64, 3, 6];
+        let out_offsets = [0i64, 3, 6];
+        let params = [0.0f64];
+
+        let result = run_batch(
+            6,
+            &out_offsets,
+            &regions,
+            &shifts,
+            &geno_offset_idx,
+            &geno_v_idxs,
+            &geno_offsets,
+            &v_starts,
+            &ilens,
+            &tracks,
+            &track_offsets,
+            &params,
+            None,
+            REPEAT_5P,
+            0,
+            1,
+            false,
+        );
+        // SNP skipped → query 0 output = track[0..3]
+        assert_eq!(result[..3], [1.0f32, 2.0, 3.0], "q0: SNP skipped, track copied");
+        // No variants in q1 → track[3..6]
+        assert_eq!(result[3..], [4.0f32, 5.0, 6.0], "q1: no variants, track copied");
+    }
+
+    // ================================================================== //
+    // tracks_to_intervals tests                                            //
+    // ================================================================== //
+
+    /// Hand-built RLE example with 3 queries:
+    /// - q0: empty (track_offsets[0]==track_offsets[1])  → 0 intervals
+    /// - q1: all-constant [5.0, 5.0, 5.0] at region [0, 10, 13] → 1 interval [10,13) val=5.0
+    /// - q2: two runs [1.0, 1.0, 2.0, 2.0, 2.0] at region [0, 20, 25] → 2 intervals
+    ///         [20,22) val=1.0  and  [22,25) val=2.0
+    ///
+    /// Expected offsets: [0, 0, 1, 3]
+    #[test]
+    fn test_tracks_to_intervals_hand_built() {
+        use super::tracks_to_intervals;
+        use ndarray::{Array1, Array2};
+
+        // regions: (n_queries, 3) — (contig_idx, start, end)
+        let regions_data = vec![
+            0i32, 0, 0,   // q0: empty length
+            0i32, 10, 13, // q1: [10, 13), length 3
+            0i32, 20, 25, // q2: [20, 25), length 5
+        ];
+        let regions = Array2::from_shape_vec((3, 3), regions_data).unwrap();
+
+        // tracks: q0 empty, q1 = [5,5,5], q2 = [1,1,2,2,2]
+        let tracks_data = vec![5.0f32, 5.0, 5.0, 1.0, 1.0, 2.0, 2.0, 2.0];
+        let tracks = Array1::from_vec(tracks_data);
+
+        // track_offsets: [0, 0, 3, 8]
+        let track_offsets = Array1::from_vec(vec![0i64, 0, 3, 8]);
+
+        let (starts, ends, values, offsets) =
+            tracks_to_intervals(regions.view(), tracks.view(), track_offsets.view(), false);
+
+        // offsets: [0, 0, 1, 3]
+        assert_eq!(offsets.as_slice().unwrap(), &[0i64, 0, 1, 3], "offsets mismatch");
+
+        // Total intervals = 3
+        assert_eq!(starts.len(), 3);
+        assert_eq!(ends.len(), 3);
+        assert_eq!(values.len(), 3);
+
+        // q1: interval 0 → [10, 13), val=5.0
+        assert_eq!(starts[0], 10i32, "q1 start");
+        assert_eq!(ends[0], 13i32, "q1 end");
+        assert_eq!(values[0], 5.0f32, "q1 value");
+
+        // q2: interval 1 → [20, 22), val=1.0
+        assert_eq!(starts[1], 20i32, "q2[0] start");
+        assert_eq!(ends[1], 22i32, "q2[0] end");
+        assert_eq!(values[1], 1.0f32, "q2[0] value");
+
+        // q2: interval 2 → [22, 25), val=2.0
+        assert_eq!(starts[2], 22i32, "q2[1] start");
+        assert_eq!(ends[2], 25i32, "q2[1] end");
+        assert_eq!(values[2], 2.0f32, "q2[1] value");
+    }
+
+    /// All-constant single query: exactly 1 interval covering full range.
+    #[test]
+    fn test_tracks_to_intervals_all_constant() {
+        use super::tracks_to_intervals;
+        use ndarray::{Array1, Array2};
+
+        let regions = Array2::from_shape_vec((1, 3), vec![0i32, 100, 107]).unwrap();
+        let tracks = Array1::from_vec(vec![3.14f32; 7]);
+        let track_offsets = Array1::from_vec(vec![0i64, 7]);
+
+        let (starts, ends, values, offsets) =
+            tracks_to_intervals(regions.view(), tracks.view(), track_offsets.view(), false);
+
+        assert_eq!(offsets.as_slice().unwrap(), &[0i64, 1]);
+        assert_eq!(starts.len(), 1);
+        assert_eq!(starts[0], 100i32);
+        assert_eq!(ends[0], 107i32);
+        assert_eq!(values[0], 3.14f32);
+    }
+
+    /// Empty query: track_offsets[0] == track_offsets[1] → 0 intervals, no panic.
+    #[test]
+    fn test_tracks_to_intervals_empty_query() {
+        use super::tracks_to_intervals;
+        use ndarray::{Array1, Array2};
+
+        let regions = Array2::from_shape_vec((1, 3), vec![0i32, 50, 50]).unwrap();
+        let tracks = Array1::from_vec(vec![]);
+        let track_offsets = Array1::from_vec(vec![0i64, 0]);
+
+        let (starts, ends, values, offsets) =
+            tracks_to_intervals(regions.view(), tracks.view(), track_offsets.view(), false);
+
+        assert_eq!(offsets.as_slice().unwrap(), &[0i64, 0]);
+        assert_eq!(starts.len(), 0);
+        assert_eq!(ends.len(), 0);
+        assert_eq!(values.len(), 0);
+    }
+
+    /// Zero-value intervals ARE included (not filtered).
+    #[test]
+    fn test_tracks_to_intervals_zero_value_included() {
+        use super::tracks_to_intervals;
+        use ndarray::{Array1, Array2};
+
+        // track = [0.0, 0.0, 1.0, 0.0] → 3 intervals: [0,2)=0.0, [2,3)=1.0, [3,4)=0.0
+        let regions = Array2::from_shape_vec((1, 3), vec![0i32, 0, 4]).unwrap();
+        let tracks = Array1::from_vec(vec![0.0f32, 0.0, 1.0, 0.0]);
+        let track_offsets = Array1::from_vec(vec![0i64, 4]);
+
+        let (starts, ends, values, offsets) =
+            tracks_to_intervals(regions.view(), tracks.view(), track_offsets.view(), false);
+
+        assert_eq!(offsets.as_slice().unwrap(), &[0i64, 3]);
+        assert_eq!(starts.len(), 3, "must have 3 intervals including zero-value ones");
+        assert_eq!(values[0], 0.0f32, "first interval is zero-value");
+        assert_eq!(starts[0], 0i32);
+        assert_eq!(ends[0], 2i32);
+        assert_eq!(values[1], 1.0f32);
+        assert_eq!(values[2], 0.0f32, "third interval is zero-value");
+        assert_eq!(starts[2], 3i32);
+        assert_eq!(ends[2], 4i32);
+    }
+}
diff --git a/src/variants/mod.rs b/src/variants/mod.rs
new file mode 100644
index 00000000..1a871d6f
--- /dev/null
+++ b/src/variants/mod.rs
@@ -0,0 +1,513 @@
+//! Flat variant gather/fill cores (pure ndarray). PyO3 lives in `crate::ffi`.
+pub mod windows;
+use ndarray::{Array1, ArrayView1};
+
+/// Generic per-row gather core. `T: Copy` — no num-traits needed.
+fn gather_rows_impl<T: Copy>(
+    geno_offset_idx: ArrayView1<i64>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    data: ArrayView1<T>,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = geno_offset_idx.len();
+    let mut out_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let goi = geno_offset_idx[i] as usize;
+        out_offsets[i + 1] = out_offsets[i] + (o_stops[goi] - o_starts[goi]);
+    }
+    let total = out_offsets[n_rows] as usize;
+    let mut v: Vec<T> = Vec::with_capacity(total);
+    for i in 0..n_rows {
+        let goi = geno_offset_idx[i] as usize;
+        let s = o_starts[goi] as usize;
+        let e = o_stops[goi] as usize;
+        for k in s..e {
+            v.push(data[k]);
+        }
+    }
+    (Array1::from_vec(v), out_offsets)
+}
+
+/// Per-row i32 gather (variant indices). Mirrors numba `_gather_v_idxs` / `_ss`.
+pub fn gather_rows_i32(
+    geno_offset_idx: ArrayView1<i64>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    data: ArrayView1<i32>,
+) -> (Array1<i32>, Array1<i64>) {
+    gather_rows_impl(geno_offset_idx, o_starts, o_stops, data)
+}
+
+/// Per-row f32 gather (dosage values). Preserves float32 dtype exactly.
+pub fn gather_rows_f32(
+    geno_offset_idx: ArrayView1<i64>,
+    o_starts: ArrayView1<i64>,
+    o_stops: ArrayView1<i64>,
+    data: ArrayView1<f32>,
+) -> (Array1<f32>, Array1<i64>) {
+    gather_rows_impl(geno_offset_idx, o_starts, o_stops, data)
+}
+
+/// Gather variable-length allele bytestrings. Mirrors numba `_gather_alleles`.
+pub fn gather_alleles(
+    v_idxs: ArrayView1<i32>,
+    allele_bytes: ArrayView1<u8>,
+    allele_offsets: ArrayView1<i64>,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = v_idxs.len();
+    let mut seq_offsets = Array1::<i64>::zeros(n + 1);
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        seq_offsets[i + 1] = seq_offsets[i] + (allele_offsets[v + 1] - allele_offsets[v]);
+    }
+    let total = seq_offsets[n] as usize;
+    let mut data = Array1::<u8>::zeros(total);
+    let mut dst = 0usize;
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        let s = allele_offsets[v] as usize;
+        let e = allele_offsets[v + 1] as usize;
+        for k in s..e {
+            data[dst] = allele_bytes[k];
+            dst += 1;
+        }
+    }
+    (data, seq_offsets)
+}
+
+/// Reverse-complement the alleles of mask-selected `(b*p)` rows, in place.
+///
+/// `byte_data`   contiguous allele bytes (mutated in place)
+/// `seq_offsets` per-allele byte boundaries (len n_alleles + 1)
+/// `var_offsets` per-(b*p)-row allele boundaries (len n_rows + 1)
+/// `to_rc_row`   per-(b*p)-row bool mask (len n_rows)
+///
+/// Single fused pass: for each masked `(b*p)` row, reverse-complements each of
+/// its alleles directly via `reverse::rc_row`. `var_offsets` partition the
+/// alleles by row (contiguous, disjoint), so this RCs exactly the alleles the
+/// old per-allele-mask delegation did, in the same order — byte-identical —
+/// without the intermediate `Vec<bool>` alloc or the second full-allele scan.
+pub fn rc_alleles_inplace(
+    byte_data: &mut [u8],
+    seq_offsets: ndarray::ArrayView1<i64>,
+    var_offsets: ndarray::ArrayView1<i64>,
+    to_rc_row: ndarray::ArrayView1<bool>,
+) {
+    for g in 0..to_rc_row.len() {
+        if !to_rc_row[g] {
+            continue;
+        }
+        let a0 = var_offsets[g] as usize;
+        let a1 = var_offsets[g + 1] as usize;
+        for a in a0..a1 {
+            let s = seq_offsets[a] as usize;
+            let e = seq_offsets[a + 1] as usize;
+            crate::reverse::rc_row(&mut byte_data[s..e]);
+        }
+    }
+}
+
+/// Generic compact-keep core. Drops values where `keep[j]` is false and
+/// rebuilds row offsets. No `num_traits` dependency — uses `Vec<T>`.
+fn compact_keep_impl<T: Copy>(
+    values: ArrayView1<T>,
+    row_offsets: ArrayView1<i64>,
+    keep: ArrayView1<bool>,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = row_offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    let mut n_keep: i64 = 0;
+    for i in 0..n_rows {
+        for j in row_offsets[i] as usize..row_offsets[i + 1] as usize {
+            if keep[j] {
+                n_keep += 1;
+            }
+        }
+        new_offsets[i + 1] = n_keep;
+    }
+    let mut new_v: Vec<T> = Vec::with_capacity(n_keep as usize);
+    for j in 0..values.len() {
+        if keep[j] {
+            new_v.push(values[j]);
+        }
+    }
+    (Array1::from_vec(new_v), new_offsets)
+}
+
+/// Compact i32 values (variant indices). Mirrors numba `_compact_keep`.
+pub fn compact_keep_i32(
+    values: ArrayView1<i32>,
+    row_offsets: ArrayView1<i64>,
+    keep: ArrayView1<bool>,
+) -> (Array1<i32>, Array1<i64>) {
+    compact_keep_impl(values, row_offsets, keep)
+}
+
+/// Compact f32 values (dosage). Preserves float32 bit-pattern exactly.
+pub fn compact_keep_f32(
+    values: ArrayView1<f32>,
+    row_offsets: ArrayView1<i64>,
+    keep: ArrayView1<bool>,
+) -> (Array1<f32>, Array1<i64>) {
+    compact_keep_impl(values, row_offsets, keep)
+}
+
+/// Generic fill-empty-scalar core. Each empty row gets one `fill` element;
+/// non-empty rows copy through unchanged. No `num_traits` needed — `from_elem`.
+fn fill_empty_scalar_impl<T: Copy>(
+    data: ArrayView1<T>,
+    offsets: ArrayView1<i64>,
+    fill: T,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let ln = offsets[i + 1] - offsets[i];
+        new_offsets[i + 1] = new_offsets[i] + if ln > 0 { ln } else { 1 };
+    }
+    let total = new_offsets[n_rows] as usize;
+    // Pre-fill with `fill` so empty-row slots are already correct; copy non-empty.
+    let mut new_data = Array1::<T>::from_elem(total, fill);
+    for i in 0..n_rows {
+        let s = offsets[i] as usize;
+        let e = offsets[i + 1] as usize;
+        let mut d = new_offsets[i] as usize;
+        if e != s {
+            for k in s..e {
+                new_data[d] = data[k];
+                d += 1;
+            }
+        }
+    }
+    (new_data, new_offsets)
+}
+
+/// Fill-empty-scalar for i32 data (variant start / ilen). Mirrors numba `_fill_empty_scalar`.
+pub fn fill_empty_scalar_i32(
+    data: ArrayView1<i32>,
+    offsets: ArrayView1<i64>,
+    fill: i32,
+) -> (Array1<i32>, Array1<i64>) {
+    fill_empty_scalar_impl(data, offsets, fill)
+}
+
+/// Fill-empty-scalar for f32 data (dosage). Mirrors numba `_fill_empty_scalar`.
+pub fn fill_empty_scalar_f32(
+    data: ArrayView1<f32>,
+    offsets: ArrayView1<i64>,
+    fill: f32,
+) -> (Array1<f32>, Array1<i64>) {
+    fill_empty_scalar_impl(data, offsets, fill)
+}
+
+/// Generic fill-empty-fixed core. Each empty row gets `inner` copies of `fill`;
+/// non-empty rows copy their `n_var * inner` elements through.
+fn fill_empty_fixed_impl<T: Copy>(
+    data: ArrayView1<T>,
+    offsets: ArrayView1<i64>,
+    inner: i64,
+    fill: T,
+) -> (Array1<T>, Array1<i64>) {
+    let n_rows = offsets.len() - 1;
+    let mut new_offsets = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let nv = offsets[i + 1] - offsets[i];
+        new_offsets[i + 1] = new_offsets[i] + if nv > 0 { nv } else { 1 };
+    }
+    let total_vars = new_offsets[n_rows] as usize;
+    let inner_u = inner as usize;
+    let mut new_data = Array1::<T>::from_elem(total_vars * inner_u, fill);
+    let mut dptr = 0usize;
+    for i in 0..n_rows {
+        let vs = offsets[i] as usize;
+        let ve = offsets[i + 1] as usize;
+        if ve == vs {
+            dptr += inner_u; // already filled by from_elem
+        } else {
+            for k in vs * inner_u..ve * inner_u {
+                new_data[dptr] = data[k];
+                dptr += 1;
+            }
+        }
+    }
+    (new_data, new_offsets)
+}
+
+/// Fill-empty-fixed for i32 data (flank_tokens). Mirrors numba `_fill_empty_fixed`.
+pub fn fill_empty_fixed_i32(
+    data: ArrayView1<i32>,
+    offsets: ArrayView1<i64>,
+    inner: i64,
+    fill: i32,
+) -> (Array1<i32>, Array1<i64>) {
+    fill_empty_fixed_impl(data, offsets, inner, fill)
+}
+
+/// Fill-empty-fixed for f32 data. Mirrors numba `_fill_empty_fixed`.
+pub fn fill_empty_fixed_f32(
+    data: ArrayView1<f32>,
+    offsets: ArrayView1<i64>,
+    inner: i64,
+    fill: f32,
+) -> (Array1<f32>, Array1<i64>) {
+    fill_empty_fixed_impl(data, offsets, inner, fill)
+}
+
+/// Generic two-level dummy-fill for allele/token bytestrings. Mirrors numba `_fill_empty_seq`.
+/// Empty variant-rows receive one dummy allele/token sequence of `dummy` elements.
+/// Returns `(new_data, new_var_offsets, new_seq_offsets)`.
+fn fill_empty_seq_impl<T: Copy>(
+    data: ArrayView1<T>,
+    var_offsets: ArrayView1<i64>,
+    seq_offsets: ArrayView1<i64>,
+    dummy: ArrayView1<T>,
+) -> (Array1<T>, Array1<i64>, Array1<i64>) {
+    let n_rows = var_offsets.len() - 1;
+    let l = dummy.len() as i64;
+    let mut new_var = Array1::<i64>::zeros(n_rows + 1);
+    for i in 0..n_rows {
+        let nv = var_offsets[i + 1] - var_offsets[i];
+        new_var[i + 1] = new_var[i] + if nv > 0 { nv } else { 1 };
+    }
+    let total_vars = new_var[n_rows] as usize;
+    let mut new_seq = Array1::<i64>::zeros(total_vars + 1);
+    let mut vptr = 0usize;
+    for i in 0..n_rows {
+        let vs = var_offsets[i] as usize;
+        let ve = var_offsets[i + 1] as usize;
+        if ve == vs {
+            new_seq[vptr + 1] = new_seq[vptr] + l;
+            vptr += 1;
+        } else {
+            for v in vs..ve {
+                let vlen = seq_offsets[v + 1] - seq_offsets[v];
+                new_seq[vptr + 1] = new_seq[vptr] + vlen;
+                vptr += 1;
+            }
+        }
+    }
+    let total = new_seq[total_vars] as usize;
+    let mut new_data: Vec<T> = Vec::with_capacity(total);
+    for i in 0..n_rows {
+        let vs = var_offsets[i] as usize;
+        let ve = var_offsets[i + 1] as usize;
+        if ve == vs {
+            for k in 0..dummy.len() {
+                new_data.push(dummy[k]);
+            }
+        } else {
+            for v in vs..ve {
+                let bs = seq_offsets[v] as usize;
+                let be = seq_offsets[v + 1] as usize;
+                for k in bs..be {
+                    new_data.push(data[k]);
+                }
+            }
+        }
+    }
+    (Array1::from_vec(new_data), new_var, new_seq)
+}
+
+/// Two-level dummy-fill for allele bytestrings (uint8). Mirrors numba `_fill_empty_seq`.
+pub fn fill_empty_seq_u8(
+    data: ArrayView1<u8>,
+    var_offsets: ArrayView1<i64>,
+    seq_offsets: ArrayView1<i64>,
+    dummy: ArrayView1<u8>,
+) -> (Array1<u8>, Array1<i64>, Array1<i64>) {
+    fill_empty_seq_impl(data, var_offsets, seq_offsets, dummy)
+}
+
+/// Two-level dummy-fill for token windows (int32). Mirrors numba `_fill_empty_seq`.
+pub fn fill_empty_seq_i32(
+    data: ArrayView1<i32>,
+    var_offsets: ArrayView1<i64>,
+    seq_offsets: ArrayView1<i64>,
+    dummy: ArrayView1<i32>,
+) -> (Array1<i32>, Array1<i64>, Array1<i64>) {
+    fill_empty_seq_impl(data, var_offsets, seq_offsets, dummy)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::arr1;
+
+    #[test]
+    fn test_gather_rows_basic() {
+        // 2 rows selecting offset groups 1 then 0.
+        let goi = arr1(&[1i64, 0]);
+        let o_starts = arr1(&[0i64, 2]);
+        let o_stops = arr1(&[2i64, 5]);
+        let data = arr1(&[10i32, 11, 12, 13, 14]);
+        let (v, off) = gather_rows_i32(goi.view(), o_starts.view(), o_stops.view(), data.view());
+        assert_eq!(v.to_vec(), vec![12, 13, 14, 10, 11]);
+        assert_eq!(off.to_vec(), vec![0, 3, 5]);
+    }
+
+    #[test]
+    fn test_gather_rows_f32() {
+        // Exact binary float32 values must be preserved — no rounding.
+        let goi = arr1(&[0i64]);
+        let o_starts = arr1(&[0i64]);
+        let o_stops = arr1(&[2i64]);
+        let data = arr1(&[0.25f32, 0.75f32]);
+        let (v, off) = gather_rows_f32(goi.view(), o_starts.view(), o_stops.view(), data.view());
+        assert_eq!(v.to_vec(), vec![0.25f32, 0.75f32]);
+        assert_eq!(off.to_vec(), vec![0i64, 2]);
+    }
+
+    #[test]
+    fn test_gather_alleles_basic() {
+        // alleles: v0="AC"(65,67), v1="G"(71). gather [1,0,1].
+        let v_idxs = arr1(&[1i32, 0, 1]);
+        let bytes = arr1(&[65u8, 67, 71]);
+        let offs = arr1(&[0i64, 2, 3]);
+        let (data, seq) = gather_alleles(v_idxs.view(), bytes.view(), offs.view());
+        assert_eq!(data.to_vec(), vec![71, 65, 67, 71]);
+        assert_eq!(seq.to_vec(), vec![0, 1, 3, 4]);
+    }
+
+    #[test]
+    fn test_compact_keep_i32() {
+        // 2 rows: [10, 11 | 12]; keep [T, F, T] → [10 | 12], offsets [0, 1, 2].
+        let vals = arr1(&[10i32, 11, 12]);
+        let off = arr1(&[0i64, 2, 3]);
+        let keep = arr1(&[true, false, true]);
+        let (v, o) = compact_keep_i32(vals.view(), off.view(), keep.view());
+        assert_eq!(v.to_vec(), vec![10, 12]);
+        assert_eq!(o.to_vec(), vec![0, 1, 2]);
+    }
+
+    #[test]
+    fn test_compact_keep_f32() {
+        // 1 row: [0.25, 0.75, 0.5]; keep [T, F, T] → [0.25, 0.5], offsets [0, 2].
+        let vals = arr1(&[0.25f32, 0.75f32, 0.5f32]);
+        let off = arr1(&[0i64, 3]);
+        let keep = arr1(&[true, false, true]);
+        let (v, o) = compact_keep_f32(vals.view(), off.view(), keep.view());
+        assert_eq!(v.to_vec(), vec![0.25f32, 0.5f32]);
+        assert_eq!(o.to_vec(), vec![0i64, 2]);
+    }
+
+    #[test]
+    fn test_fill_empty_scalar_i32() {
+        // 3 rows: offsets [0,2,2,3] — middle row is empty.
+        // Non-empty rows: [10,11] and [20]. Empty row gets one fill (99).
+        let data = arr1(&[10i32, 11, 20]);
+        let offsets = arr1(&[0i64, 2, 2, 3]);
+        let (v, o) = fill_empty_scalar_i32(data.view(), offsets.view(), 99);
+        assert_eq!(v.to_vec(), vec![10, 11, 99, 20]);
+        assert_eq!(o.to_vec(), vec![0i64, 2, 3, 4]);
+    }
+
+    #[test]
+    fn test_fill_empty_scalar_f32() {
+        // 2 rows: offsets [0,1,1] — second row is empty. fill = -1.0.
+        let data = arr1(&[0.5f32]);
+        let offsets = arr1(&[0i64, 1, 1]);
+        let (v, o) = fill_empty_scalar_f32(data.view(), offsets.view(), -1.0f32);
+        assert_eq!(v.to_vec(), vec![0.5f32, -1.0f32]);
+        assert_eq!(o.to_vec(), vec![0i64, 1, 2]);
+    }
+
+    #[test]
+    fn test_fill_empty_fixed_i32() {
+        // 3 rows: offsets [0,2,2,3], inner=2 — middle row empty → 2 copies of fill.
+        // data = [10,11, 12,13, 20,21] (2 per variant for rows 0 and 2).
+        let data = arr1(&[10i32, 11, 12, 13, 20, 21]);
+        let offsets = arr1(&[0i64, 2, 2, 3]);
+        let (v, o) = fill_empty_fixed_i32(data.view(), offsets.view(), 2, 7);
+        // Row 0: 2 vars * 2 inner = 4 elems [10,11,12,13]
+        // Row 1: empty → 1 dummy var * 2 inner = 2 elems [7,7]
+        // Row 2: 1 var * 2 inner = 2 elems [20,21]
+        assert_eq!(v.to_vec(), vec![10, 11, 12, 13, 7, 7, 20, 21]);
+        assert_eq!(o.to_vec(), vec![0i64, 2, 3, 4]);
+    }
+
+    #[test]
+    fn test_fill_empty_fixed_f32() {
+        // 2 rows: offsets [0,1,1], inner=3 — second row empty.
+        let data = arr1(&[1.0f32, 2.0, 3.0]);
+        let offsets = arr1(&[0i64, 1, 1]);
+        let (v, o) = fill_empty_fixed_f32(data.view(), offsets.view(), 3, 0.0f32);
+        assert_eq!(v.to_vec(), vec![1.0f32, 2.0, 3.0, 0.0, 0.0, 0.0]);
+        assert_eq!(o.to_vec(), vec![0i64, 1, 2]);
+    }
+
+    #[test]
+    fn test_fill_empty_seq_u8() {
+        // 3 rows: var_offsets [0,1,1,2] — middle row (row 1) is empty.
+        // Row 0: 1 variant with bytes [65,67] ("AC").
+        // Row 1: empty → gets dummy [78] ("N"), length 1.
+        // Row 2: 1 variant with bytes [71] ("G").
+        // seq_offsets: [0,2,3] (lengths: 2,1).
+        let data = arr1(&[65u8, 67, 71]);
+        let var_offsets = arr1(&[0i64, 1, 1, 2]);
+        let seq_offsets = arr1(&[0i64, 2, 3]);
+        let dummy = arr1(&[78u8]); // "N"
+        let (nd, nvar, nseq) =
+            fill_empty_seq_u8(data.view(), var_offsets.view(), seq_offsets.view(), dummy.view());
+        // new_var: row 0 has 1 var, row 1 empty→1 dummy, row 2 has 1 var → [0,1,2,3]
+        assert_eq!(nvar.to_vec(), vec![0i64, 1, 2, 3]);
+        // new_seq: var0 len=2, dummy len=1, var2 len=1 → [0,2,3,4]
+        assert_eq!(nseq.to_vec(), vec![0i64, 2, 3, 4]);
+        // new_data: [65,67] (row0), [78] (dummy), [71] (row2)
+        assert_eq!(nd.to_vec(), vec![65u8, 67, 78, 71]);
+    }
+
+    #[test]
+    fn test_fill_empty_seq_i32() {
+        // 2 rows: var_offsets [0,0,2] — first row (row 0) is empty.
+        // Row 0: empty → gets dummy token [999i32], length 1.
+        // Row 1: 2 variants: tokens [10,20] and [30,40,50].
+        // seq_offsets: [0,2,5].
+        let data = arr1(&[10i32, 20, 30, 40, 50]);
+        let var_offsets = arr1(&[0i64, 0, 2]);
+        let seq_offsets = arr1(&[0i64, 2, 5]);
+        let dummy = arr1(&[999i32]);
+        let (nd, nvar, nseq) =
+            fill_empty_seq_i32(data.view(), var_offsets.view(), seq_offsets.view(), dummy.view());
+        // new_var: row 0 empty→1, row 1 has 2 → [0,1,3]
+        assert_eq!(nvar.to_vec(), vec![0i64, 1, 3]);
+        // new_seq: dummy len=1, var0 len=2, var1 len=3 → [0,1,3,6]
+        assert_eq!(nseq.to_vec(), vec![0i64, 1, 3, 6]);
+        // new_data: [999] (dummy), [10,20] (var0), [30,40,50] (var1)
+        assert_eq!(nd.to_vec(), vec![999i32, 10, 20, 30, 40, 50]);
+    }
+
+    #[test]
+    fn rc_alleles_rcs_only_masked_rows() {
+        // 2 rows. row0 (masked) has 2 alleles: "AC","G". row1 (unmasked): "TT".
+        // seq_offsets delimit alleles: [0,2,3,5]; var_offsets delimit rows: [0,2,3].
+        let mut data = b"ACGTT".to_vec();
+        let seq_offsets = ndarray::array![0i64, 2, 3, 5];
+        let var_offsets = ndarray::array![0i64, 2, 3];
+        let to_rc_row = ndarray::array![true, false];
+        rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+        // row0: "AC"->"GT", "G"->"C"; row1 "TT" untouched.
+        assert_eq!(&data, b"GTCTT");
+    }
+
+    #[test]
+    fn rc_alleles_all_false_is_noop() {
+        let mut data = b"ACG".to_vec();
+        let seq_offsets = ndarray::array![0i64, 1, 3];
+        let var_offsets = ndarray::array![0i64, 2];
+        let to_rc_row = ndarray::array![false];
+        rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+        assert_eq!(&data, b"ACG");
+    }
+
+    #[test]
+    fn rc_alleles_handles_empty_allele_and_n() {
+        // 1 masked row, 2 alleles: "" (empty) and "ACN".
+        let mut data = b"ACN".to_vec();
+        let seq_offsets = ndarray::array![0i64, 0, 3];
+        let var_offsets = ndarray::array![0i64, 2];
+        let to_rc_row = ndarray::array![true];
+        rc_alleles_inplace(&mut data, seq_offsets.view(), var_offsets.view(), to_rc_row.view());
+        // "" stays ""; "ACN" -> revcomp -> "NGT".
+        assert_eq!(&data, b"NGT");
+    }
+}
diff --git a/src/variants/windows.rs b/src/variants/windows.rs
new file mode 100644
index 00000000..7ea986d3
--- /dev/null
+++ b/src/variants/windows.rs
@@ -0,0 +1,545 @@
+//! Variant-windows / variants flat-buffer assembly cores (pure ndarray).
+//! PyO3 lives in `crate::ffi`. Mirrors the Python helpers in
+//! `_dataset/_flat_flanks.py` (`tokenize_alleles`, `_slice_flanks`,
+//! `_assemble_alt_windows`, `compute_*`) — byte-identical by construction.
+use ndarray::{Array1, Array2, ArrayView1};
+
+/// Apply a 256-entry byte->token lookup table. `out[i] = lut[bytes[i]]`.
+/// Mirrors numpy `lut[bytes]`. `Tok` is the token dtype (u8 or i32).
+pub fn tokenize<Tok: Copy>(bytes: ArrayView1<u8>, lut: ArrayView1<Tok>) -> Array1<Tok> {
+    let bytes_s = bytes.as_slice().expect("tokenize: bytes must be contiguous");
+    let lut_s = lut.as_slice().expect("tokenize: lut must be contiguous");
+    // One upfront assertion lets the compiler prove every `b as usize` (< 256) is
+    // in-bounds for lut_s, eliminating the per-element bounds check.
+    assert!(lut_s.len() >= 256, "tokenize: lut must have >= 256 entries");
+    // Using raw slices instead of ArrayView1 removes the per-element ndarray stride
+    // multiply (imul rax, stride) that appeared in the indexed loop. collect() uses
+    // TrustedLen and pre-allocates, removing the per-element Vec capacity check.
+    let out: Vec<Tok> = bytes_s.iter().map(|&b| lut_s[b as usize]).collect();
+    Array1::from_vec(out)
+}
+
+/// Derive per-variant (f5, f3) fixed-`flank_len` flanks from a contiguous
+/// per-variant window read `[start-L, end+L)`. `f5` = first `L` bytes of each
+/// row, `f3` = last `L`. Both returned flat `(n*L,)`, variant-major. Mirrors
+/// `_slice_flanks` (`f5 = data[rw_off[:-1,None]+cols]`,
+/// `f3 = data[rw_off[1:,None]-L+cols]`).
+pub fn slice_flanks(
+    data: ArrayView1<u8>,
+    rw_off: ArrayView1<i64>,
+    flank_len: usize,
+) -> (Array1<u8>, Array1<u8>) {
+    let n = rw_off.len() - 1;
+    // Hoist contiguous slices upfront: eliminates the per-element ndarray stride
+    // multiply (imul) and bounds check (cmp/jae) that appeared in both inner
+    // k-loops. Using raw &[u8]/&[i64] lets LLVM see the loop as a plain copy.
+    let data_s = data.as_slice().expect("slice_flanks: data must be contiguous");
+    let rw_off_s = rw_off.as_slice().expect("slice_flanks: rw_off must be contiguous");
+    let mut f5: Vec<u8> = Vec::with_capacity(n * flank_len);
+    let mut f3: Vec<u8> = Vec::with_capacity(n * flank_len);
+    for i in 0..n {
+        let s = rw_off_s[i] as usize;
+        let e = rw_off_s[i + 1] as usize;
+        // extend_from_slice replaces flank_len individual push calls with a
+        // single slice-bounds check + memcpy, removing the per-byte capacity
+        // check and enabling vectorisation.
+        f5.extend_from_slice(&data_s[s..s + flank_len]);
+        f3.extend_from_slice(&data_s[e - flank_len..e]);
+    }
+    (Array1::from_vec(f5), Array1::from_vec(f3))
+}
+
+/// Concatenate `flank5 . alt . flank3` per variant into a flat byte buffer.
+/// `f5`/`f3` are `(n*flank_len,)` variant-major. Mirrors numba
+/// `_assemble_alt_windows`. Returns `(out_bytes, out_offsets)`.
+pub fn assemble_alt_window(
+    f5: ArrayView1<u8>,
+    f3: ArrayView1<u8>,
+    alt_data: ArrayView1<u8>,
+    alt_seq_off: ArrayView1<i64>,
+    flank_len: usize,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = alt_seq_off.len() - 1;
+    // Hoist contiguous slices upfront: eliminates per-element ndarray stride
+    // multiply (imul) and bounds checks (cmp/jae) in both the offset-build loop
+    // and the assembly loop. Raw &[T] lets LLVM see the inner copies as plain
+    // memcpy, matching the slice_flanks pattern already applied to this file.
+    let f5_s = f5.as_slice().expect("assemble_alt_window: f5 must be contiguous");
+    let f3_s = f3.as_slice().expect("assemble_alt_window: f3 must be contiguous");
+    let alt_data_s =
+        alt_data.as_slice().expect("assemble_alt_window: alt_data must be contiguous");
+    let alt_seq_off_s =
+        alt_seq_off.as_slice().expect("assemble_alt_window: alt_seq_off must be contiguous");
+
+    let mut out_off: Vec<i64> = Vec::with_capacity(n + 1);
+    out_off.push(0);
+    for i in 0..n {
+        let alt_len = alt_seq_off_s[i + 1] - alt_seq_off_s[i];
+        out_off.push(out_off[i] + 2 * flank_len as i64 + alt_len);
+    }
+    let total = out_off[n] as usize;
+    let mut out: Vec<u8> = Vec::with_capacity(total);
+    for i in 0..n {
+        // extend_from_slice: single bounds check + memcpy, not per-byte push.
+        out.extend_from_slice(&f5_s[i * flank_len..(i + 1) * flank_len]);
+        let a = alt_seq_off_s[i] as usize;
+        let b = alt_seq_off_s[i + 1] as usize;
+        out.extend_from_slice(&alt_data_s[a..b]);
+        out.extend_from_slice(&f3_s[i * flank_len..(i + 1) * flank_len]);
+    }
+    (Array1::from_vec(out), Array1::from_vec(out_off))
+}
+
+/// Fetch the per-variant reference window `[start-L, end+L)` into one flat
+/// buffer, with `ends = starts - min(ilen, 0) + 1`. Returns `(data, rw_off)`
+/// where `rw_off` are per-variant byte boundaries (len `n+1`). Reuses
+/// `reference::get_reference`'s padded core (absolute-coordinate OOB padding).
+/// Mirrors `reference.fetch(v_contigs, starts-L, ends+L)`.
+pub fn fetch_windows(
+    v_contigs: ArrayView1<i32>,
+    starts_v: ArrayView1<i32>,
+    ilens_v: ArrayView1<i32>,
+    flank_len: i64,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> (Array1<u8>, Array1<i64>) {
+    let n = starts_v.len();
+    let mut regions = Array2::<i32>::zeros((n, 3));
+    let mut rw_off = Array1::<i64>::zeros(n + 1);
+    for i in 0..n {
+        let start = starts_v[i] as i64;
+        let ilen = ilens_v[i] as i64;
+        let end = start - ilen.min(0) + 1;
+        let rstart = start - flank_len;
+        let rend = end + flank_len;
+        regions[[i, 0]] = v_contigs[i];
+        regions[[i, 1]] = rstart as i32;
+        regions[[i, 2]] = rend as i32;
+        rw_off[i + 1] = rw_off[i] + (rend - rstart);
+    }
+    let data = crate::reference::get_reference(
+        regions.view(),
+        rw_off.view(),
+        reference,
+        ref_offsets,
+        pad_char,
+        false, // serial: disjoint output already; this is per-variant fanout
+        None,  // to_rc: window/flank fetch is always forward; strand RC handled elsewhere
+    );
+    (data, rw_off)
+}
+
+/// Assembled flat buffers returned by the mode orchestrators. `byte_bufs` carry
+/// raw allele bytes (u8); `tok_bufs` carry LUT-applied tokens (`Tok`). Each
+/// tuple is `(field_name, data, seq_offsets)`.
+pub struct VariantBufs<Tok> {
+    pub byte_bufs: Vec<(&'static str, Array1<u8>, Array1<i64>)>,
+    pub tok_bufs: Vec<(&'static str, Array1<Tok>, Array1<i64>)>,
+}
+
+/// Gather per-selected-variant `start`/`ilen` from the GLOBAL arrays via `v_idxs`.
+fn gather_starts_ilens(
+    v_idxs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+) -> (Array1<i32>, Array1<i32>) {
+    let n = v_idxs.len();
+    let mut s = Array1::<i32>::zeros(n);
+    let mut il = Array1::<i32>::zeros(n);
+    for i in 0..n {
+        let v = v_idxs[i] as usize;
+        s[i] = v_starts[v];
+        il[i] = ilens[v];
+    }
+    (s, il)
+}
+
+/// Plain-`variants` assembly tail: raw alt bytes (always), raw ref bytes
+/// (optional), `flank_tokens` ride-along (optional). Mirrors the variants tail
+/// of `get_variants_flat` (gather_alleles + compute_flank_tokens).
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_variants_mode<Tok: Copy>(
+    v_idxs: ArrayView1<i32>,
+    row_offsets: ArrayView1<i64>,
+    alt_global: ArrayView1<u8>,
+    alt_off_global: ArrayView1<i64>,
+    ref_global: Option<ArrayView1<u8>>,
+    ref_off_global: Option<ArrayView1<i64>>,
+    want_flank: bool,
+    flank_len: i64,
+    lut: Option<ArrayView1<Tok>>,
+    v_contigs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> VariantBufs<Tok> {
+    let mut byte_bufs = Vec::new();
+    let mut tok_bufs = Vec::new();
+
+    let (alt_data, alt_seq_off) =
+        crate::variants::gather_alleles(v_idxs, alt_global, alt_off_global);
+    byte_bufs.push(("alt", alt_data, alt_seq_off));
+
+    if let (Some(rg), Some(ro)) = (ref_global, ref_off_global) {
+        let (ref_data, ref_seq_off) = crate::variants::gather_alleles(v_idxs, rg, ro);
+        byte_bufs.push(("ref", ref_data, ref_seq_off));
+    }
+
+    if want_flank {
+        let lut = lut.expect("flank tokens requested but no token LUT supplied");
+        let (starts_v, ilens_v) = gather_starts_ilens(v_idxs, v_starts, ilens);
+        let (rw_data, rw_off) = fetch_windows(
+            v_contigs, starts_v.view(), ilens_v.view(), flank_len, reference, ref_offsets,
+            pad_char,
+        );
+        let l = flank_len as usize;
+        let (f5, f3) = slice_flanks(rw_data.view(), rw_off.view(), l);
+        // Concatenate [f5 | f3] per variant (2L tokens, variant-major), tokenize.
+        let n = f5.len() / l;
+        let mut flank_bytes: Vec<u8> = Vec::with_capacity(n * 2 * l);
+        for i in 0..n {
+            for k in 0..l {
+                flank_bytes.push(f5[i * l + k]);
+            }
+            for k in 0..l {
+                flank_bytes.push(f3[i * l + k]);
+            }
+        }
+        let fb = Array1::from_vec(flank_bytes);
+        let tok = tokenize(fb.view(), lut);
+        // flank_tokens offsets are the variant-level row_offsets (fixed 2L inner
+        // axis carried separately Python-side as a trailing regular dim).
+        tok_bufs.push(("flank_tokens", tok, row_offsets.to_owned()));
+    }
+
+    VariantBufs { byte_bufs, tok_bufs }
+}
+
+/// `variant-windows` assembly tail. `ref_mode`/`alt_mode`: 1 = flanked window
+/// (`[start-L,end+L)` for ref; `flank5.alt.flank3` for alt), 2 = bare tokenized
+/// allele. Produces only token buffers (scalar fields are handled Python-side).
+/// Mirrors the windows branch of `get_variants_flat` (incl. the single fused
+/// fetch shared by ref_window + alt_window).
+#[allow(clippy::too_many_arguments)]
+pub fn assemble_windows_mode<Tok: Copy>(
+    v_idxs: ArrayView1<i32>,
+    _row_offsets: ArrayView1<i64>,
+    ref_mode: i64,
+    alt_mode: i64,
+    alt_global: ArrayView1<u8>,
+    alt_off_global: ArrayView1<i64>,
+    ref_global: Option<ArrayView1<u8>>,
+    ref_off_global: Option<ArrayView1<i64>>,
+    flank_len: i64,
+    lut: ArrayView1<Tok>,
+    v_contigs: ArrayView1<i32>,
+    v_starts: ArrayView1<i32>,
+    ilens: ArrayView1<i32>,
+    reference: ArrayView1<u8>,
+    ref_offsets: ArrayView1<i64>,
+    pad_char: u8,
+) -> VariantBufs<Tok> {
+    let mut tok_bufs = Vec::new();
+    let l = flank_len as usize;
+
+    // alt alleles are always gathered (needed for alt window or bare alt).
+    let (alt_data, alt_seq_off) =
+        crate::variants::gather_alleles(v_idxs, alt_global, alt_off_global);
+
+    // One fused fetch if either side needs a window read.
+    let need_fetch = ref_mode == 1 || alt_mode == 1;
+    let fetched = if need_fetch {
+        let (starts_v, ilens_v) = gather_starts_ilens(v_idxs, v_starts, ilens);
+        Some(fetch_windows(
+            v_contigs, starts_v.view(), ilens_v.view(), flank_len, reference, ref_offsets,
+            pad_char,
+        ))
+    } else {
+        None
+    };
+
+    // ref side (ordered first to match Python field insertion order).
+    if ref_mode == 1 {
+        let (rw_data, rw_off) = fetched.as_ref().expect("ref window needs a fetch");
+        let tok = tokenize(rw_data.view(), lut);
+        tok_bufs.push(("ref_window", tok, rw_off.clone()));
+    } else if ref_mode == 2 {
+        let rg = ref_global.expect("bare ref allele needs ref byte buffer");
+        let ro = ref_off_global.expect("bare ref allele needs ref offsets");
+        let (ref_data, ref_seq_off) = crate::variants::gather_alleles(v_idxs, rg, ro);
+        let tok = tokenize(ref_data.view(), lut);
+        tok_bufs.push(("ref", tok, ref_seq_off));
+    }
+
+    // alt side.
+    if alt_mode == 1 {
+        let (rw_data, rw_off) = fetched.as_ref().expect("alt window needs a fetch");
+        let (f5, f3) = slice_flanks(rw_data.view(), rw_off.view(), l);
+        let (alt_bytes, alt_off) = assemble_alt_window(
+            f5.view(),
+            f3.view(),
+            alt_data.view(),
+            alt_seq_off.view(),
+            l,
+        );
+        let tok = tokenize(alt_bytes.view(), lut);
+        tok_bufs.push(("alt_window", tok, alt_off));
+    } else if alt_mode == 2 {
+        let tok = tokenize(alt_data.view(), lut);
+        tok_bufs.push(("alt", tok, alt_seq_off));
+    }
+
+    VariantBufs { byte_bufs: Vec::new(), tok_bufs }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ndarray::arr1;
+
+    #[test]
+    fn test_tokenize_u8() {
+        // lut maps byte 65('A')->0, 67('C')->1, everything else->9 (unknown).
+        let mut lut = vec![9u8; 256];
+        lut[65] = 0;
+        lut[67] = 1;
+        let lut = Array1::from_vec(lut);
+        let bytes = arr1(&[65u8, 67, 78]); // A, C, N(unknown)
+        let out = tokenize(bytes.view(), lut.view());
+        assert_eq!(out.to_vec(), vec![0u8, 1, 9]);
+    }
+
+    #[test]
+    fn test_tokenize_i32() {
+        // i32 tokens (alphabet larger than 255 forces i32 in Python).
+        let mut lut = vec![999i32; 256];
+        lut[71] = 300; // 'G' -> 300
+        let lut = Array1::from_vec(lut);
+        let bytes = arr1(&[71u8, 84]); // G, T(unknown)
+        let out = tokenize(bytes.view(), lut.view());
+        assert_eq!(out.to_vec(), vec![300i32, 999]);
+    }
+
+    #[test]
+    fn test_slice_flanks() {
+        // 2 variants, L=2. var0 window=[1,2,3,4,5] (len 5), var1=[6,7,8,9] (len 4).
+        // rw_off = [0, 5, 9].
+        let data = arr1(&[1u8, 2, 3, 4, 5, 6, 7, 8, 9]);
+        let rw_off = arr1(&[0i64, 5, 9]);
+        let (f5, f3) = slice_flanks(data.view(), rw_off.view(), 2);
+        // f5: first 2 of each = [1,2 | 6,7]; f3: last 2 of each = [4,5 | 8,9]
+        assert_eq!(f5.to_vec(), vec![1u8, 2, 6, 7]);
+        assert_eq!(f3.to_vec(), vec![4u8, 5, 8, 9]);
+    }
+
+    #[test]
+    fn test_assemble_alt_window() {
+        // L=1. f5=[10|20], f3=[11|21]. alt: var0="A"(65), var1="CG"(67,71).
+        let f5 = arr1(&[10u8, 20]);
+        let f3 = arr1(&[11u8, 21]);
+        let alt_data = arr1(&[65u8, 67, 71]);
+        let alt_seq_off = arr1(&[0i64, 1, 3]);
+        let (out, off) = assemble_alt_window(
+            f5.view(),
+            f3.view(),
+            alt_data.view(),
+            alt_seq_off.view(),
+            1,
+        );
+        // var0: 10, 65, 11  (2*1 + 1 = 3 bytes)
+        // var1: 20, 67,71, 21  (2*1 + 2 = 4 bytes)
+        assert_eq!(out.to_vec(), vec![10u8, 65, 11, 20, 67, 71, 21]);
+        assert_eq!(off.to_vec(), vec![0i64, 3, 7]);
+    }
+
+    #[test]
+    fn test_fetch_windows() {
+        use ndarray::Array1 as A1;
+        // Single contig reference: bytes 0..20.
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        // 1 variant, contig 0, start=5, ilen=0 (SNP) → end = 5 - 0 + 1 = 6.
+        // L=2 → read [start-L, end+L) = [3, 8) → bytes [3,4,5,6,7].
+        let v_contigs = arr1(&[0i32]);
+        let starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let (data, rw_off) = fetch_windows(
+            v_contigs.view(),
+            starts.view(),
+            ilens.view(),
+            2,
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        assert_eq!(data.to_vec(), vec![3u8, 4, 5, 6, 7]);
+        assert_eq!(rw_off.to_vec(), vec![0i64, 5]);
+    }
+
+    #[test]
+    fn test_fetch_windows_deletion_widens() {
+        use ndarray::Array1 as A1;
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        // ilen=-2 (2bp deletion) → end = start - (-2) + 1 = start + 3.
+        // start=5, L=1 → read [4, 9) → bytes [4,5,6,7,8] (len 5).
+        let v_contigs = arr1(&[0i32]);
+        let starts = arr1(&[5i32]);
+        let ilens = arr1(&[-2i32]);
+        let (data, rw_off) = fetch_windows(
+            v_contigs.view(),
+            starts.view(),
+            ilens.view(),
+            1,
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        assert_eq!(data.to_vec(), vec![4u8, 5, 6, 7, 8]);
+        assert_eq!(rw_off.to_vec(), vec![0i64, 5]);
+    }
+
+    #[test]
+    fn test_assemble_windows_mode_both_windows() {
+        use ndarray::Array1 as A1;
+        // Global alt alleles: v0="A"(65). offsets [0,1].
+        let alt_global = arr1(&[65u8]);
+        let alt_off = arr1(&[0i64, 1]);
+        let v_idxs = arr1(&[0i32]);
+        let row_offsets = arr1(&[0i64, 1]);
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let v_contigs = arr1(&[0i32]);
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect()); // identity
+
+        let bufs = assemble_windows_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            1, // ref_mode = window
+            1, // alt_mode = window
+            alt_global.view(),
+            alt_off.view(),
+            None,
+            None,
+            1, // flank_len
+            lut.view(),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        // SNP start=5 ilen=0 → end=6; read [4,7) = [4,5,6]. L=1.
+        // ref_window tokens (identity) = [4,5,6], off [0,3].
+        // alt_window = f5[4] . alt[65] . f3[6] = [4,65,6], off [0,3].
+        assert_eq!(bufs.byte_bufs.len(), 0);
+        let names: Vec<&str> = bufs.tok_bufs.iter().map(|t| t.0).collect();
+        assert_eq!(names, vec!["ref_window", "alt_window"]);
+        assert_eq!(bufs.tok_bufs[0].1.to_vec(), vec![4u8, 5, 6]);
+        assert_eq!(bufs.tok_bufs[0].2.to_vec(), vec![0i64, 3]);
+        assert_eq!(bufs.tok_bufs[1].1.to_vec(), vec![4u8, 65, 6]);
+        assert_eq!(bufs.tok_bufs[1].2.to_vec(), vec![0i64, 3]);
+    }
+
+    #[test]
+    fn test_assemble_windows_mode_bare_alleles() {
+        use ndarray::Array1 as A1;
+        // alt v0="AC"(65,67); ref v0="G"(71).
+        let alt_global = arr1(&[65u8, 67]);
+        let alt_off = arr1(&[0i64, 2]);
+        let ref_global = arr1(&[71u8]);
+        let ref_off = arr1(&[0i64, 1]);
+        let v_idxs = arr1(&[0i32]);
+        let row_offsets = arr1(&[0i64, 1]);
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32]);
+        let ilens = arr1(&[0i32]);
+        let v_contigs = arr1(&[0i32]);
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect());
+
+        let bufs = assemble_windows_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            2, // ref_mode = allele (bare)
+            2, // alt_mode = allele (bare)
+            alt_global.view(),
+            alt_off.view(),
+            Some(ref_global.view()),
+            Some(ref_off.view()),
+            1,
+            lut.view(),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        let names: Vec<&str> = bufs.tok_bufs.iter().map(|t| t.0).collect();
+        assert_eq!(names, vec!["ref", "alt"]);
+        // bare ref tokens = [71], off [0,1]; bare alt tokens = [65,67], off [0,2].
+        assert_eq!(bufs.tok_bufs[0].1.to_vec(), vec![71u8]);
+        assert_eq!(bufs.tok_bufs[0].2.to_vec(), vec![0i64, 1]);
+        assert_eq!(bufs.tok_bufs[1].1.to_vec(), vec![65u8, 67]);
+        assert_eq!(bufs.tok_bufs[1].2.to_vec(), vec![0i64, 2]);
+    }
+
+    #[test]
+    fn test_assemble_variants_mode_alt_and_flank() {
+        use ndarray::Array1 as A1;
+        // Global alleles: v0="A"(65), v1="CG"(67,71). offsets [0,1,3].
+        let alt_global = arr1(&[65u8, 67, 71]);
+        let alt_off = arr1(&[0i64, 1, 3]);
+        // Select v_idxs [1, 0] in one row.
+        let v_idxs = arr1(&[1i32, 0]);
+        let row_offsets = arr1(&[0i64, 2]);
+        // Reference 0..20, single contig. v_starts/ilens are GLOBAL (indexed by v_idx).
+        let reference: A1<u8> = A1::from_vec((0u8..20).collect());
+        let ref_offsets = arr1(&[0i64, 20]);
+        let v_starts = arr1(&[5i32, 8]); // global per-variant
+        let ilens = arr1(&[0i32, 0]);
+        let v_contigs = arr1(&[0i32, 0]); // per-selected-variant contig
+        // L=1, token LUT: identity-ish u8 (byte value -> itself for the test).
+        let lut: A1<u8> = A1::from_vec((0u8..=255).collect());
+
+        let bufs = assemble_variants_mode::<u8>(
+            v_idxs.view(),
+            row_offsets.view(),
+            alt_global.view(),
+            alt_off.view(),
+            None, // no ref alleles
+            None,
+            true, // want_flank
+            1,    // flank_len
+            Some(lut.view()),
+            v_contigs.view(),
+            v_starts.view(),
+            ilens.view(),
+            reference.view(),
+            ref_offsets.view(),
+            b'N',
+        );
+        // byte_bufs: only "alt". v_idxs [1,0] → "CG" then "A" → [67,71,65], off [0,2,3].
+        assert_eq!(bufs.byte_bufs.len(), 1);
+        let (name, data, off) = &bufs.byte_bufs[0];
+        assert_eq!(*name, "alt");
+        assert_eq!(data.to_vec(), vec![67u8, 71, 65]);
+        assert_eq!(off.to_vec(), vec![0i64, 2, 3]);
+        // tok_bufs: only "flank_tokens". Each variant: [f5(1) | f3(1)] = 2 tokens.
+        // var0 = v_idx 1: start=8, ilen=0 → end=9, read [7,10) = [7,8,9]; f5=[7], f3=[9].
+        // var1 = v_idx 0: start=5, ilen=0 → end=6, read [4,7) = [4,5,6]; f5=[4], f3=[6].
+        // tokens (identity lut) = [7,9, 4,6]; offsets = row_offsets [0,2].
+        assert_eq!(bufs.tok_bufs.len(), 1);
+        let (tname, tdata, toff) = &bufs.tok_bufs[0];
+        assert_eq!(*tname, "flank_tokens");
+        assert_eq!(tdata.to_vec(), vec![7u8, 9, 4, 6]);
+        assert_eq!(toff.to_vec(), vec![0i64, 2]);
+    }
+}
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
index 69c995eb..e6d31e18 100644
--- a/tests/benchmarks/conftest.py
+++ b/tests/benchmarks/conftest.py
@@ -15,7 +15,7 @@
 
 import genvarloader as gvl
 from genvarloader._dataset import _haps, _reconstruct, _tracks
-from tests.benchmarks._capture import capture_first_call
+from tests.benchmarks._capture import CapturedCall, capture_first_call
 from tests.benchmarks._indices import batch_indices
 
 DATA = Path(__file__).resolve().parent / "data"
@@ -44,6 +44,8 @@ def _batch_indices(ds, n: int):
 def captured_haplotypes(bench_dataset):
     ds = bench_dataset.with_seqs("haplotypes").with_len(SEQLEN)
     r, s = _batch_indices(ds, BATCH)
+    # Capture the rust reconstruct_haplotypes_from_sparse call by temporarily
+    # wrapping the module-level attribute so capture_first_call can intercept it.
     recon = capture_first_call(
         targets=[(_haps, "reconstruct_haplotypes_from_sparse")],
         thunk=lambda: ds[r, s],
@@ -78,14 +80,34 @@ def captured_intervals_to_tracks(bench_dataset):
 def captured_realign_tracks(bench_dataset):
     # shift_and_realign_tracks_sparse only fires on the haplotype+tracks path
     # (_reconstruct.py); the tracks-only path (_tracks.py) never realigns.
+    #
+    # The rust path calls _shift_and_realign_tracks_sparse_rust_wrapper, which
+    # is not a module-level attribute accessible via capture_first_call's setattr
+    # trick.  Instead, we patch _reconstruct._shift_and_realign_tracks_sparse_rust_wrapper
+    # directly with a recording wrapper so the exact callable the benchmark
+    # replays is captured.
     ds = (
         bench_dataset.with_seqs("haplotypes").with_tracks("read-depth").with_len(SEQLEN)
     )
     r, s = _batch_indices(ds, BATCH)
-    return capture_first_call(
-        targets=[(_reconstruct, "shift_and_realign_tracks_sparse")],
-        thunk=lambda: ds[r, s],
-    )
+    original = _reconstruct._shift_and_realign_tracks_sparse_rust_wrapper
+    captured: list[CapturedCall] = []
+
+    def recorder(*args, **kwargs):
+        if not captured:
+            captured.append(CapturedCall(args=args, kwargs=dict(kwargs)))
+        return original(*args, **kwargs)
+
+    _reconstruct._shift_and_realign_tracks_sparse_rust_wrapper = recorder
+    try:
+        ds[r, s]
+    finally:
+        _reconstruct._shift_and_realign_tracks_sparse_rust_wrapper = original
+    if not captured:
+        raise RuntimeError(
+            "shift_and_realign_tracks_sparse was never called while running the thunk"
+        )
+    return captured[0]
 
 
 # NOTE: a ``captured_germline_ccfs`` fixture was intentionally dropped. The
diff --git a/tests/benchmarks/profiling/profile.py b/tests/benchmarks/profiling/profile.py
index b565d2f5..ed12a9f3 100644
--- a/tests/benchmarks/profiling/profile.py
+++ b/tests/benchmarks/profiling/profile.py
@@ -33,20 +33,55 @@
 def build(ds, mode: str):
     if mode == "haplotypes":
         return ds.with_seqs("haplotypes").with_len(SEQLEN)
+    if mode == "annotated":
+        return ds.with_seqs("annotated").with_len(SEQLEN)
     if mode == "tracks":
+        # tracks-only: no sequences (the cheapest path; per-batch fixed cost dominates).
         return ds.with_seqs(None).with_tracks("read-depth").with_len(SEQLEN)
+    if mode == "tracks-seqs":
+        # haplotypes + re-aligned tracks together.
+        return ds.with_seqs("haplotypes").with_tracks("read-depth").with_len(SEQLEN)
     if mode == "variants":
         # Variants are ragged by definition (allele lengths vary), so they are
         # queried variable-length — `with_len` only makes sense for the seq/track
         # outputs, which this mode doesn't request.
         return ds.with_seqs("variants")
+    if mode == "variant-windows":
+        # Tokenized per-variant ref/alt windows (flat-only; needs a reference).
+        import seqpro as sp
+
+        import genvarloader as gvl
+
+        return (
+            ds.with_tracks(False)
+            .with_output_format("flat")
+            .with_seqs(
+                "variant-windows",
+                gvl.VarWindowOpt(
+                    flank_length=128,
+                    token_alphabet=sp.DNA.alphabet.encode(),
+                    unknown_token=len(sp.DNA),
+                    ref="window",
+                    alt="window",
+                ),
+            )
+        )
     raise SystemExit(f"unknown mode {mode!r}")
 
 
 def main() -> None:
     p = argparse.ArgumentParser()
     p.add_argument(
-        "--mode", choices=["haplotypes", "tracks", "variants"], required=True
+        "--mode",
+        choices=[
+            "haplotypes",
+            "annotated",
+            "tracks",
+            "tracks-seqs",
+            "variants",
+            "variant-windows",
+        ],
+        required=True,
     )
     p.add_argument("--n-batches", type=int, default=N_BATCHES)
     args = p.parse_args()
diff --git a/tests/benchmarks/profiling/profile_write_realistic.py b/tests/benchmarks/profiling/profile_write_realistic.py
new file mode 100644
index 00000000..1e79202a
--- /dev/null
+++ b/tests/benchmarks/profiling/profile_write_realistic.py
@@ -0,0 +1,119 @@
+"""Time gvl.write() and a real per-sample BigWigs gvl.update() on the chr22_geuv corpus.
+
+Exercises the full Rust write path (genoray sparse genotypes + Rust bigWig
+streaming writer). Prep (sample choice, plink2 slice) runs untimed; only the
+gvl.write / gvl.update call is measured.
+
+Usage (needs /carter sources or GVL_BENCH_SOURCE bundle):
+    pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op write
+    pixi run -e dev python tests/benchmarks/profiling/profile_write_realistic.py --op update
+
+Peak RSS:
+    NUMBA_NUM_THREADS=1 .pixi/envs/dev/bin/memray run -o w.bin \\
+        tests/benchmarks/profiling/profile_write_realistic.py --op write
+    .pixi/envs/dev/bin/memray stats w.bin
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+import polars as pl
+
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from tests.benchmarks.data import build_realistic as br  # noqa: E402
+
+CORPUS_TAG = "chr22_geuv"
+
+
+def _resolve_bigwig_paths(samples: list[str]) -> dict[str, str]:
+    """Resolve per-sample chr22 bigWig paths exactly as build_realistic.build_dataset."""
+    smap = pl.read_csv(br.SAMPLE_MAP)
+    paths: dict[str, str] = {}
+    for sample, full_path in smap.select("sample", "path").iter_rows():
+        if sample not in samples:
+            continue
+        bw = br.BW_CHR22_DIR / Path(full_path).name
+        if not bw.exists():
+            raise SystemExit(f"Missing chr22 bigwig for {sample}: {bw}")
+        paths[sample] = str(bw)
+    assert set(paths) == set(samples), set(samples) - set(paths)
+    return paths
+
+
+def _prep() -> tuple[list[str], Path, Path, dict[str, str]]:
+    """Untimed prep: choose samples, build regions BED, slice + filter PGEN, resolve bigwigs."""
+    samples = br.choose_samples()
+    bed_path = br.copy_regions()
+    pgen = br.slice_pgen(samples, bed_path)
+    pgen = br.drop_unsupported_variants(pgen)
+    paths = _resolve_bigwig_paths(samples)
+    return samples, pgen, bed_path, paths
+
+
+def run_write(out: Path) -> float:
+    import genvarloader as gvl
+    from genoray import PGEN
+
+    samples, pgen, bed_path, paths = _prep()
+    tracks = gvl.BigWigs("read-depth", paths)
+    t0 = time.perf_counter()
+    gvl.write(
+        path=out,
+        bed=bed_path,
+        variants=PGEN(pgen),
+        tracks=tracks,
+        samples=samples,
+        overwrite=True,
+        extend_to_length=False,
+    )
+    return time.perf_counter() - t0
+
+
+def run_update(out: Path) -> tuple[float, str]:
+    import genvarloader as gvl
+    from genoray import PGEN
+
+    samples, pgen, bed_path, paths = _prep()
+    # Build a base dataset (untimed) to update.
+    gvl.write(
+        path=out,
+        bed=bed_path,
+        variants=PGEN(pgen),
+        tracks=gvl.BigWigs("read-depth", paths),
+        samples=samples,
+        overwrite=True,
+        extend_to_length=False,
+    )
+    # Timed: add a SECOND per-sample BigWigs track via update (Rust bigWig writer).
+    add = gvl.BigWigs("read-depth-2", paths)
+    t0 = time.perf_counter()
+    gvl.update(out, tracks=add, max_mem="4g")
+    wall = time.perf_counter() - t0
+    return wall, f"track=read-depth-2 samples={len(samples)}"
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--op", choices=["write", "update"], required=True)
+    args = p.parse_args()
+
+    with tempfile.TemporaryDirectory(dir=str(_REPO_ROOT)) as tmp:
+        out = Path(tmp) / "chr22_geuv_bench.gvl"
+        if args.op == "write":
+            wall = run_write(out)
+            print(f"op=write corpus={CORPUS_TAG} wall={wall:.3f}s")
+        else:
+            wall, info = run_update(out)
+            print(f"op=update corpus={CORPUS_TAG} wall={wall:.3f}s ({info})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/benchmarks/test_e2e.py b/tests/benchmarks/test_e2e.py
index bd1e1e29..7b20ad50 100644
--- a/tests/benchmarks/test_e2e.py
+++ b/tests/benchmarks/test_e2e.py
@@ -4,16 +4,32 @@
 
 from __future__ import annotations
 
+import pytest
+
 from tests.benchmarks._indices import batch_indices
 
 SEQLEN = 16384
 BATCH = 32
 
+# Fold ITERATIONS calls into each timed sample so per-batch OS-scheduler jitter on
+# the shared HPC node averages out. Without this the fast tracks-only path (~1.5 ms)
+# is noise-dominated: a single ~0.5 ms scheduler hiccup is ~30% of one call but only
+# ~3% of a 10-call sample. pedantic divides the round time by ``iterations``, so the
+# reported figure stays per-``ds[r, s]`` (directly comparable across paths/backends).
+ROUNDS = 50
+ITERATIONS = 10
+WARMUP_ROUNDS = 5
+
 
 def _bench_indexing(benchmark, ds):
     r, s = batch_indices(ds.shape[0], ds.shape[1], BATCH)
-    ds[r, s]  # warmup (JIT link, caches)
-    result = benchmark(lambda: ds[r, s])
+    ds[r, s]  # warmup (JIT link, caches) before the timed rounds
+    result = benchmark.pedantic(
+        lambda: ds[r, s],
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+        warmup_rounds=WARMUP_ROUNDS,
+    )
     assert result is not None
 
 
@@ -27,6 +43,13 @@ def test_e2e_annotated(benchmark, bench_dataset):
     _bench_indexing(benchmark, ds)
 
 
+@pytest.mark.xfail(
+    strict=False,
+    reason=(
+        "pre-existing Phase 2: _FlatVariants has no to_fixed for with_len on variants; "
+        "predates Phase 3"
+    ),
+)
 def test_e2e_variants(benchmark, bench_dataset):
     ds = bench_dataset.with_seqs("variants").with_len(SEQLEN)
     _bench_indexing(benchmark, ds)
diff --git a/tests/benchmarks/test_micro.py b/tests/benchmarks/test_micro.py
index 42288dbb..4b306977 100644
--- a/tests/benchmarks/test_micro.py
+++ b/tests/benchmarks/test_micro.py
@@ -4,13 +4,16 @@
 from __future__ import annotations
 
 import numpy as np
+import pytest
 
 from genvarloader._dataset._genotypes import (
     get_diffs_sparse,
     reconstruct_haplotypes_from_sparse,
 )
 from genvarloader._dataset._intervals import intervals_to_tracks
-from genvarloader._dataset._tracks import shift_and_realign_tracks_sparse
+from genvarloader._dataset._tracks import (
+    _shift_and_realign_tracks_sparse_rust_wrapper as shift_and_realign_tracks_sparse,
+)
 
 
 def _warm_and_run(benchmark, fn, captured):
@@ -35,6 +38,9 @@ def test_get_diffs_sparse(benchmark, captured_diffs):
     assert result.size > 0
 
 
+@pytest.mark.skip(
+    reason="kernel fused into rust (W3/W5); micro-benchmark pending redesign — W6"
+)
 def test_reconstruct_haplotypes_from_sparse(benchmark, captured_haplotypes):
     # returns None; writes into the preallocated `out` buffer
     _warm_and_run(benchmark, reconstruct_haplotypes_from_sparse, captured_haplotypes)
@@ -42,6 +48,9 @@ def test_reconstruct_haplotypes_from_sparse(benchmark, captured_haplotypes):
     assert out is not None and np.asarray(out).size > 0
 
 
+@pytest.mark.skip(
+    reason="kernel fused into rust (W3/W5); micro-benchmark pending redesign — W6"
+)
 def test_intervals_to_tracks(benchmark, captured_intervals_to_tracks):
     # returns None; writes into the preallocated `out` buffer
     _warm_and_run(benchmark, intervals_to_tracks, captured_intervals_to_tracks)
@@ -49,6 +58,9 @@ def test_intervals_to_tracks(benchmark, captured_intervals_to_tracks):
     assert out is not None and np.asarray(out).size > 0
 
 
+@pytest.mark.skip(
+    reason="kernel fused into rust (W3/W5); micro-benchmark pending redesign — W6"
+)
 def test_shift_and_realign_tracks_sparse(benchmark, captured_realign_tracks):
     # returns None; writes into the preallocated `out` buffer
     _warm_and_run(benchmark, shift_and_realign_tracks_sparse, captured_realign_tracks)
diff --git a/tests/dataset/test_flat_flanks.py b/tests/dataset/test_flat_flanks.py
index 929a3336..65732a90 100644
--- a/tests/dataset/test_flat_flanks.py
+++ b/tests/dataset/test_flat_flanks.py
@@ -707,18 +707,24 @@ def test_dummy_variant_windows_fill_empty_region_all_unk(snap_dataset):
 
 
 def test_variant_windows_single_fetch_per_decode(snap_dataset, monkeypatch):
-    """ref=window, alt=window decode must call Reference.fetch exactly once."""
-    import genvarloader._dataset._reference as refmod
+    """Both-window decode must invoke the assemble_variant_buffers kernel exactly once.
+
+    The single fused fetch+assemble invariant moved into the kernel in Target 7
+    (reference read now lives inside the Rust/numba kernel rather than Python
+    Reference.fetch), so we assert the dispatched kernel fires exactly once per
+    both-window decode.
+    """
+    import genvarloader._dataset._flat_variants as _fv
     from genvarloader._dataset._flat_variants import VarWindowOpt
 
     calls = {"n": 0}
-    orig = refmod.Reference.fetch
+    real_fn = _fv._assemble_variant_buffers_rust
 
-    def spy(self, *a, **k):
+    def spy(*a, **k):
         calls["n"] += 1
-        return orig(self, *a, **k)
+        return real_fn(*a, **k)
 
-    monkeypatch.setattr(refmod.Reference, "fetch", spy)
+    monkeypatch.setattr(_fv, "_assemble_variant_buffers_rust", spy)
 
     ds = (
         snap_dataset.with_tracks(False)
@@ -732,7 +738,7 @@ def spy(self, *a, **k):
     out = ds[[0, 1, 2], [0, 1, 2]]
     assert out.ref_window is not None and out.alt_window is not None
     assert calls["n"] == 1, (
-        f"expected 1 reference.fetch for both-window decode, got {calls['n']}"
+        f"expected 1 assemble_variant_buffers kernel call for both-window decode, got {calls['n']}"
     )
 
 
diff --git a/tests/dataset/test_open.py b/tests/dataset/test_open.py
index 90d8886b..a3fa6438 100644
--- a/tests/dataset/test_open.py
+++ b/tests/dataset/test_open.py
@@ -30,6 +30,7 @@ def _write_minimal_metadata(path: Path, *, ploidy: int | None = None) -> None:
         "max_jitter": 0,
         "ploidy": ploidy,
         "version": None,
+        "format_version": "2.0.0",
         "svar_link": None,
     }
     (path / "metadata.json").write_text(json.dumps(meta))
diff --git a/tests/dataset/test_query_spliced.py b/tests/dataset/test_query_spliced.py
new file mode 100644
index 00000000..3cd082b2
--- /dev/null
+++ b/tests/dataset/test_query_spliced.py
@@ -0,0 +1,11 @@
+import inspect
+
+from genvarloader._dataset import _query
+
+
+def test_spliced_has_no_dead_variant_guard():
+    src = inspect.getsource(_query._getitem_spliced)
+    assert "_VARIANT_TYPES_S" not in src, (
+        "spliced variant RC guard is unreachable (spliced variants are rejected "
+        "upstream) and must be removed"
+    )
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 00000000..7cde533f
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,46 @@
+"""Shared fixtures for tests/integration/."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pyBigWig
+import pytest
+
+import genvarloader as gvl
+
+
+@pytest.fixture
+def track_dataset_path(source_bed, vcf_dir, tmp_path) -> Path:
+    """A freshly-written 2.0 dataset (phased VCF + one BigWig 'cov' track),
+    yielded as a writable path so tests may downgrade/migrate it in place.
+
+    Mirrors tests/dataset/conftest.py::snap_dataset but yields a path (not an
+    opened Dataset) and is function-scoped so each test gets a mutable copy.
+    """
+    from genoray import VCF
+
+    samples = ["s0", "s1", "s2"]
+    contig_sizes = [("chr1", 2_000_000), ("chr2", 2_000_000)]
+    bw_paths: dict[str, str] = {}
+    for i, s in enumerate(samples):
+        p = tmp_path / f"{s}.bw"
+        with pyBigWig.open(str(p), "w") as bw:
+            bw.addHeader(contig_sizes, maxZooms=0)
+            v = float(i + 1)
+            bw.addEntries(
+                ["chr1", "chr1", "chr2", "chr2"],
+                [499_990, 1_010_686, 17_320, 1_234_560],
+                ends=[500_030, 1_010_706, 17_340, 1_234_580],
+                values=[v, v, v, v],
+            )
+        bw_paths[s] = str(p)
+    out = tmp_path / "ds.gvl"
+    gvl.write(
+        path=out,
+        bed=source_bed,
+        variants=VCF(vcf_dir / "filtered_source.vcf.gz"),
+        tracks=gvl.BigWigs("cov", bw_paths),
+        max_jitter=2,
+    )
+    return out
diff --git a/tests/integration/dataset/__init__.py b/tests/integration/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/dataset/test_write_tracks_e2e.py b/tests/integration/dataset/test_write_tracks_e2e.py
index ba3305bb..72b29d6c 100644
--- a/tests/integration/dataset/test_write_tracks_e2e.py
+++ b/tests/integration/dataset/test_write_tracks_e2e.py
@@ -36,22 +36,20 @@ def test_write_with_table_only_roundtrip(tmp_path):
     out = tmp_path / "ds.gvl"
     gvl.write(path=out, bed=bed, tracks=table)
 
-    # Sanity: the dataset directory has the expected per-track folder.
-    assert (out / "intervals" / "signal" / "intervals.npy").exists()
-    assert (out / "intervals" / "signal" / "offsets.npy").exists()
+    # Sanity: the dataset directory has the expected per-track SoA files.
+    sig_dir = out / "intervals" / "signal"
+    for name in ("starts.npy", "ends.npy", "values.npy", "offsets.npy"):
+        assert (sig_dir / name).exists()
 
     # Read intervals back and confirm values round-trip.
-    INTERVAL_DTYPE = np.dtype(
-        [("start", np.int32), ("end", np.int32), ("value", np.float32)],
-        align=True,
-    )
-    arr = np.memmap(
-        out / "intervals" / "signal" / "intervals.npy", dtype=INTERVAL_DTYPE, mode="r"
-    )
+    starts = np.memmap(sig_dir / "starts.npy", dtype=np.int32, mode="r")
+    ends = np.memmap(sig_dir / "ends.npy", dtype=np.int32, mode="r")
+    values = np.memmap(sig_dir / "values.npy", dtype=np.float32, mode="r")
     # Both samples + both regions should produce 4 intervals total.
-    assert arr.shape[0] == 4
-    values = sorted(float(v) for v in arr["value"])
-    assert values == [1.0, 2.0, 3.0, 4.0]
+    assert len(starts) == 4
+    assert len(ends) == 4
+    assert len(values) == 4
+    assert sorted(float(v) for v in values) == [1.0, 2.0, 3.0, 4.0]
 
 
 def test_write_with_mixed_bigwigs_and_table(tmp_path, bigwig_dir: Path):
@@ -87,8 +85,10 @@ def test_write_with_mixed_bigwigs_and_table(tmp_path, bigwig_dir: Path):
     out = tmp_path / "mixed.gvl"
     gvl.write(path=out, bed=bed, tracks=[bw, table])
 
-    assert (out / "intervals" / "bw_signal" / "intervals.npy").exists()
-    assert (out / "intervals" / "tab_signal" / "intervals.npy").exists()
+    for track_name in ("bw_signal", "tab_signal"):
+        track_dir = out / "intervals" / track_name
+        for name in ("starts.npy", "ends.npy", "values.npy", "offsets.npy"):
+            assert (track_dir / name).exists()
 
 
 def test_write_with_variants_and_tracks(tmp_path, vcf_dir: Path):
@@ -121,8 +121,9 @@ def test_write_with_variants_and_tracks(tmp_path, vcf_dir: Path):
     gvl.write(path=out, bed=bed, variants=vcf, tracks=table)
 
     assert (out / "genotypes").is_dir()
-    assert (out / "intervals" / "signal" / "intervals.npy").exists()
-    assert (out / "intervals" / "signal" / "offsets.npy").exists()
+    sig_dir = out / "intervals" / "signal"
+    for name in ("starts.npy", "ends.npy", "values.npy", "offsets.npy"):
+        assert (sig_dir / name).exists()
 
     import json
 
diff --git a/tests/integration/test_format_2_soa.py b/tests/integration/test_format_2_soa.py
new file mode 100644
index 00000000..59822b60
--- /dev/null
+++ b/tests/integration/test_format_2_soa.py
@@ -0,0 +1,42 @@
+"""Format 2.0 stores track intervals as struct-of-arrays (Task 1)."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._dataset._write import DATASET_FORMAT_VERSION
+
+
+def test_dataset_version_is_2(track_dataset_path):
+    assert str(DATASET_FORMAT_VERSION) == "2.0.0"
+    meta = json.loads((track_dataset_path / "metadata.json").read_text())
+    assert meta["format_version"] == "2.0.0"
+
+
+def test_soa_files_present_and_aos_absent(track_dataset_path):
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "starts.npy").exists()
+    assert (track_dir / "ends.npy").exists()
+    assert (track_dir / "values.npy").exists()
+    assert (track_dir / "offsets.npy").exists()
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_soa_files_contiguous_and_typed(track_dataset_path):
+    track_dir = track_dataset_path / "intervals" / "cov"
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="r")
+    ends = np.memmap(track_dir / "ends.npy", dtype=np.int32, mode="r")
+    values = np.memmap(track_dir / "values.npy", dtype=np.float32, mode="r")
+    assert starts.flags["C_CONTIGUOUS"]
+    assert ends.flags["C_CONTIGUOUS"]
+    assert values.flags["C_CONTIGUOUS"]
+    assert len(starts) == len(ends) == len(values)
+
+
+def test_reads_back(track_dataset_path, reference):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")
+    out = ds[0, 0]
+    assert out is not None
diff --git a/tests/integration/test_format_version_gate.py b/tests/integration/test_format_version_gate.py
new file mode 100644
index 00000000..e4e4a4e7
--- /dev/null
+++ b/tests/integration/test_format_version_gate.py
@@ -0,0 +1,46 @@
+"""Open-time format_version gate (Task 2)."""
+
+from __future__ import annotations
+
+import json
+import shutil
+
+import pytest
+
+import genvarloader as gvl
+
+
+def _set_version(path, version):
+    meta_path = path / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = version
+    meta_path.write_text(json.dumps(raw))
+
+
+def test_old_major_raises_migrate_hint(track_dataset_path, reference):
+    _set_version(track_dataset_path, "1.0.0")
+    with pytest.raises(ValueError, match="migrate"):
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+
+
+def test_none_version_raises_migrate_hint(track_dataset_path, reference, tmp_path):
+    dst = tmp_path / "noneversion.gvl"
+    shutil.copytree(track_dataset_path, dst)
+    meta_path = dst / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = None
+    meta_path.write_text(json.dumps(raw))
+    with pytest.raises(ValueError, match="migrate"):
+        gvl.Dataset.open(dst, reference=reference)
+
+
+def test_future_major_raises_upgrade_hint(track_dataset_path, reference):
+    _set_version(track_dataset_path, "3.0.0")
+    with pytest.raises(ValueError, match="[Uu]pgrade"):
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+
+
+def test_current_major_opens(track_dataset_path, reference):
+    # written fresh at 2.0.0 by the fixture
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference)
+    assert ds is not None
diff --git a/tests/integration/test_haps_ffi_cache.py b/tests/integration/test_haps_ffi_cache.py
new file mode 100644
index 00000000..e89c77ec
--- /dev/null
+++ b/tests/integration/test_haps_ffi_cache.py
@@ -0,0 +1,41 @@
+"""Haps caches FFI-ready sub-linear arrays once (Task 5)."""
+
+from __future__ import annotations
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._dataset._haps import Haps
+
+
+def _haps(track_dataset_path, reference) -> Haps:
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs(
+        "haplotypes"
+    )
+    seqs = ds._seqs
+    assert isinstance(seqs, Haps)
+    return seqs
+
+
+def test_ffi_static_cached(track_dataset_path, reference):
+    haps = _haps(track_dataset_path, reference)
+    first = haps.ffi_static
+    second = haps.ffi_static
+    assert first is second  # cached, computed once
+
+
+def test_ffi_static_contiguous_and_typed(track_dataset_path, reference):
+    s = _haps(track_dataset_path, reference).ffi_static
+    assert s.v_starts.dtype == np.int32 and s.v_starts.flags["C_CONTIGUOUS"]
+    assert s.ilens.dtype == np.int32 and s.ilens.flags["C_CONTIGUOUS"]
+    assert s.alt_alleles.dtype == np.uint8 and s.alt_alleles.flags["C_CONTIGUOUS"]
+    assert s.alt_offsets.dtype == np.int64 and s.alt_offsets.flags["C_CONTIGUOUS"]
+    assert s.ref is not None and s.ref.dtype == np.uint8 and s.ref.flags["C_CONTIGUOUS"]
+    assert s.ref_offsets is not None and s.ref_offsets.dtype == np.int64
+
+
+def test_ffi_static_v_starts_matches_source(track_dataset_path, reference):
+    haps = _haps(track_dataset_path, reference)
+    np.testing.assert_array_equal(
+        haps.ffi_static.v_starts, np.asarray(haps.variants.start, np.int32)
+    )
diff --git a/tests/integration/test_migrate.py b/tests/integration/test_migrate.py
new file mode 100644
index 00000000..64be1c58
--- /dev/null
+++ b/tests/integration/test_migrate.py
@@ -0,0 +1,126 @@
+"""gvl.migrate: 1.x AoS -> 2.0 SoA round-trip, idempotency, crash-safety (Task 3)."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+
+import genvarloader as gvl
+from genvarloader._ragged import INTERVAL_DTYPE
+
+
+def _track_dirs(path):
+    for base in ("intervals", "annot_intervals"):
+        d = path / base
+        if d.is_dir():
+            for child in sorted(d.iterdir()):
+                if child.is_dir():
+                    yield child
+
+
+def _downgrade_to_aos(path):
+    """Rewrite a fresh 2.0 SoA dataset back to a 1.x AoS dataset in place."""
+    for d in _track_dirs(path):
+        starts = np.memmap(d / "starts.npy", dtype=np.int32, mode="r")
+        ends = np.memmap(d / "ends.npy", dtype=np.int32, mode="r")
+        values = np.memmap(d / "values.npy", dtype=np.float32, mode="r")
+        rec = np.empty(len(starts), dtype=INTERVAL_DTYPE)
+        rec["start"] = starts
+        rec["end"] = ends
+        rec["value"] = values
+        out = np.memmap(
+            d / "intervals.npy", dtype=INTERVAL_DTYPE, mode="w+", shape=rec.shape
+        )
+        out[:] = rec
+        out.flush()
+        del starts, ends, values, out
+        (d / "starts.npy").unlink()
+        (d / "ends.npy").unlink()
+        (d / "values.npy").unlink()
+    meta_path = path / "metadata.json"
+    raw = json.loads(meta_path.read_text())
+    raw["format_version"] = "1.0.0"
+    meta_path.write_text(json.dumps(raw))
+
+
+def _read_track_values(ds):
+    """Return the raw realigned track float values for region 0, sample 0.
+
+    With both seqs and tracks active, [0, 0] returns a 2-tuple (seq, tracks).
+    We take the last element (tracks), which is a Ragged[float32] / RaggedTracks,
+    and return its flat data buffer for byte-identical comparison.
+    """
+    result = ds.with_tracks("cov")[0, 0]
+    # When both seqs and tracks are active the result is a 2-tuple; take tracks.
+    trk = result[-1] if isinstance(result, tuple) else result
+    return trk.data.copy()
+
+
+def test_round_trip_byte_identical(track_dataset_path, reference):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference)
+    before = _read_track_values(ds)
+
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)
+
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "starts.npy").exists()
+    assert (track_dir / "ends.npy").exists()
+    assert (track_dir / "values.npy").exists()
+    assert not (track_dir / "intervals.npy").exists()
+    assert (
+        json.loads((track_dataset_path / "metadata.json").read_text())["format_version"]
+        == "2.0.0"
+    )
+
+    after = gvl.Dataset.open(track_dataset_path, reference=reference)
+    np.testing.assert_array_equal(_read_track_values(after), before)
+
+
+def test_idempotent(track_dataset_path):
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)
+    gvl.migrate(track_dataset_path)  # second run is a no-op, must not raise
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_resumable_after_interrupt_before_metadata_bump(track_dataset_path):
+    """Crash after SoA written but before metadata bump: still 1.x, re-runnable."""
+    _downgrade_to_aos(track_dataset_path)
+    # Simulate partial migration: write SoA, leave AoS + 1.x metadata.
+    from genvarloader._dataset._migrate import _migrate_track
+
+    for d in _track_dirs(track_dataset_path):
+        _migrate_track(d)
+    meta = json.loads((track_dataset_path / "metadata.json").read_text())
+    assert meta["format_version"] == "1.0.0"  # not bumped yet
+    track_dir = track_dataset_path / "intervals" / "cov"
+    assert (track_dir / "intervals.npy").exists()  # AoS still present
+
+    gvl.migrate(track_dataset_path)  # completes the migration
+    assert (
+        json.loads((track_dataset_path / "metadata.json").read_text())["format_version"]
+        == "2.0.0"
+    )
+    assert not (track_dir / "intervals.npy").exists()
+
+
+def test_cleans_leftover_aos_after_interrupt_before_delete(track_dataset_path):
+    """Crash after metadata bump but before AoS delete: re-run removes AoS."""
+    _downgrade_to_aos(track_dataset_path)
+    gvl.migrate(track_dataset_path)  # full migration -> SoA + 2.0 metadata
+    track_dir = track_dataset_path / "intervals" / "cov"
+    # Re-introduce a leftover AoS file (as if delete was interrupted).
+    starts = np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="r")
+    rec = np.zeros(len(starts), dtype=INTERVAL_DTYPE)
+    out = np.memmap(
+        track_dir / "intervals.npy", dtype=INTERVAL_DTYPE, mode="w+", shape=rec.shape
+    )
+    out[:] = rec
+    out.flush()
+    del starts, out
+
+    gvl.migrate(track_dataset_path)  # idempotent cleanup
+    assert not (track_dir / "intervals.npy").exists()
diff --git a/tests/integration/test_scale_guard.py b/tests/integration/test_scale_guard.py
new file mode 100644
index 00000000..28898c63
--- /dev/null
+++ b/tests/integration/test_scale_guard.py
@@ -0,0 +1,80 @@
+"""Scale-guard: no per-batch copy materializes a memmap on the read path (Task 4).
+
+Mirrors the py-spy diagnostic that found the defect: monkeypatch
+np.ascontiguousarray over one ds[r, s] and assert zero copies whose source
+.base is an np.memmap.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+
+
+@pytest.fixture
+def _no_memmap_copies(monkeypatch):
+    real = np.ascontiguousarray
+    offenders: list[str] = []
+
+    def spy(a, dtype=None, *args, **kwargs):
+        arr = np.asarray(a)
+        base = getattr(arr, "base", None)
+        if isinstance(base, np.memmap) or isinstance(arr, np.memmap):
+            # A copy would be forced iff non-contiguous or dtype-mismatched.
+            would_copy = (not arr.flags["C_CONTIGUOUS"]) or (
+                dtype is not None and arr.dtype != np.dtype(dtype)
+            )
+            if would_copy:
+                offenders.append(f"{getattr(arr, 'shape', None)} {arr.dtype}->{dtype}")
+        return real(a, dtype, *args, **kwargs)
+
+    monkeypatch.setattr(np, "ascontiguousarray", spy)
+    return offenders
+
+
+def test_tracks_only_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_tracks("cov")
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_haps_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs(
+        "haplotypes"
+    )
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_annotated_no_memmap_copy(track_dataset_path, reference, _no_memmap_copies):
+    ds = gvl.Dataset.open(track_dataset_path, reference=reference).with_seqs(
+        "annotated"
+    )
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_haps_and_tracks_no_memmap_copy(
+    track_dataset_path, reference, _no_memmap_copies
+):
+    ds = (
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+        .with_seqs("haplotypes")
+        .with_tracks("cov")
+    )
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
+
+
+def test_annotated_and_tracks_no_memmap_copy(
+    track_dataset_path, reference, _no_memmap_copies
+):
+    ds = (
+        gvl.Dataset.open(track_dataset_path, reference=reference)
+        .with_seqs("annotated")
+        .with_tracks("cov")
+    )
+    _ = ds[0, 0]
+    assert _no_memmap_copies == [], f"sample-scale memmap copies: {_no_memmap_copies}"
diff --git a/tests/integration/test_write_parallel.py b/tests/integration/test_write_parallel.py
index 2bb4f636..3d5a09e7 100644
--- a/tests/integration/test_write_parallel.py
+++ b/tests/integration/test_write_parallel.py
@@ -60,9 +60,28 @@ def annot_bw(tmp_path: Path) -> Path:
 # ---------------------------------------------------------------------------
 
 
-def _load_intervals(ds_path: Path, subdir: str, name: str) -> np.ndarray:
-    """Load intervals.npy from ``ds_path/<subdir>/<name>/intervals.npy``."""
-    return np.array(np.memmap(ds_path / subdir / name / "intervals.npy", mode="r"))
+def _load_intervals(ds_path: Path, subdir: str, name: str) -> dict[str, np.ndarray]:
+    """Load SoA interval arrays from ``ds_path/<subdir>/<name>/``.
+
+    Returns a dict with keys ``starts``, ``ends``, ``values``, ``offsets``
+    containing the raw memmapped arrays for starts.npy, ends.npy, values.npy,
+    and offsets.npy respectively.  Callers compare all four arrays so that
+    the parallel and sequential write paths are verified to be byte-identical
+    across every SoA file.
+    """
+    track_dir = ds_path / subdir / name
+    return {
+        "starts": np.array(
+            np.memmap(track_dir / "starts.npy", dtype=np.int32, mode="r")
+        ),
+        "ends": np.array(np.memmap(track_dir / "ends.npy", dtype=np.int32, mode="r")),
+        "values": np.array(
+            np.memmap(track_dir / "values.npy", dtype=np.float32, mode="r")
+        ),
+        "offsets": np.array(
+            np.memmap(track_dir / "offsets.npy", dtype=np.int64, mode="r")
+        ),
+    }
 
 
 # ---------------------------------------------------------------------------
@@ -99,18 +118,20 @@ def test_parallel_write_matches_sequential(
     vcf3 = VCF(vcf_dir / "filtered_source.vcf.gz")
     gvl.write(c_dir, BED, variants=vcf3, annot_tracks={"ann": annot_bw})
 
-    # --- compare track bytes ---
+    # --- compare track bytes (starts, ends, values, offsets) ---
     a_track = _load_intervals(a_dir, "intervals", "signal")
     b_track = _load_intervals(b_dir, "intervals", "signal")
-    assert np.array_equal(a_track, b_track), (
-        f"Track intervals differ between parallel (a) and sequential (b):\n"
-        f"a={a_track}\nb={b_track}"
-    )
+    for arr_name in ("starts", "ends", "values", "offsets"):
+        assert np.array_equal(a_track[arr_name], b_track[arr_name]), (
+            f"Track {arr_name}.npy differs between parallel (a) and sequential (b):\n"
+            f"a={a_track[arr_name]}\nb={b_track[arr_name]}"
+        )
 
-    # --- compare annot bytes ---
+    # --- compare annot bytes (starts, ends, values, offsets) ---
     a_annot = _load_intervals(a_dir, "annot_intervals", "ann")
     c_annot = _load_intervals(c_dir, "annot_intervals", "ann")
-    assert np.array_equal(a_annot, c_annot), (
-        f"Annot intervals differ between parallel (a) and sequential (c):\n"
-        f"a={a_annot}\nc={c_annot}"
-    )
+    for arr_name in ("starts", "ends", "values", "offsets"):
+        assert np.array_equal(a_annot[arr_name], c_annot[arr_name]), (
+            f"Annot {arr_name}.npy differs between parallel (a) and sequential (c):\n"
+            f"a={a_annot[arr_name]}\nc={c_annot[arr_name]}"
+        )
diff --git a/tests/parity/_fixtures.py b/tests/parity/_fixtures.py
index 1153ccd5..0b7759db 100644
--- a/tests/parity/_fixtures.py
+++ b/tests/parity/_fixtures.py
@@ -4,9 +4,87 @@
 
 from pathlib import Path
 
+import numpy as np
+import pyBigWig
+
 import genvarloader as gvl
 from tests._bigwig_corpus import DEFAULT_CONTIGS, make_regions, make_synthetic_bigwigs
 
+# Contigs used by the session-level synthetic case (build_case / conftest).
+# These match _SESSION_CONTIGS in tests/_builders/case.py.
+_SESSION_CONTIGS = {"chr1": 1_300_000, "chr2": 1_300_000}
+_SESSION_SAMPLES = ["s0", "s1", "s2"]
+
+
+# Contigs and samples for the jittered-track fixture (§242 regression coverage).
+_JITTER_CONTIGS = {"chr21": 200_000, "chr22": 150_000}
+_JITTER_SAMPLES = ["s0", "s1", "s2"]
+# Constant BigWig signal value per sample: s0→1.0, s1→2.0, s2→3.0.
+# Hand-computable: for any region [start, end), sample j yields [j+1.0] * (end-start).
+_JITTER_SIGNAL_PER_SAMPLE: dict[str, float] = {
+    s: float(i + 1) for i, s in enumerate(_JITTER_SAMPLES)
+}
+
+
+def build_track_dataset_jittered(work_dir: Path, max_jitter: int) -> Path:
+    """Write a track-only GVL dataset with ``max_jitter > 0`` for #242 parity coverage.
+
+    Signal design
+    -------------
+    Each sample has a SINGLE constant BigWig interval covering the ENTIRE contig
+    (s0=1.0, s1=2.0, s2=3.0).  Any read window is fully covered, so the expected
+    track over any region [start, end) with jitter=0 is just the per-sample constant
+    repeated for ``(end - start)`` positions — trivially hand-computable.
+
+    #242 condition
+    --------------
+    ``gvl.write`` clips BigWig intervals to the jitter-EXPANDED window
+    ``[chromStart - max_jitter, chromEnd + max_jitter]``, so the stored interval
+    start is ``chromStart - max_jitter < chromStart``.  ``Dataset.open`` queries
+    at the ORIGINAL ``chromStart``.  This means ``itv.start < query_start`` — the
+    exact boundary condition that PR #244 fixed in both kernels.
+
+    Regions are placed well inside contig bounds so the expanded write window
+    ``[chromStart - max_jitter, chromEnd + max_jitter]`` never underflows (all
+    chromStarts ≥ 1000, so expanded start ≥ 996 ≥ 0 for max_jitter ≤ 1000).
+    """
+    import polars as pl
+
+    work_dir = Path(work_dir)
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    bw_dir = work_dir / "bw"
+    bw_dir.mkdir(exist_ok=True)
+
+    header = [(c, length) for c, length in _JITTER_CONTIGS.items()]
+    sample_to_bw: dict[str, str] = {}
+    for sample, value in _JITTER_SIGNAL_PER_SAMPLE.items():
+        bw_path = bw_dir / f"{sample}.bw"
+        with pyBigWig.open(str(bw_path), "w") as bw:
+            bw.addHeader(header, maxZooms=0)
+            for contig, length in _JITTER_CONTIGS.items():
+                # Single interval covering the entire contig → constant signal everywhere.
+                bw.addEntries([contig], [0], ends=[int(length)], values=[float(value)])
+        sample_to_bw[sample] = str(bw_path)
+
+    track = gvl.BigWigs("signal", sample_to_bw)
+
+    # Three regions spanning two contigs, already in natural sort order
+    # (chr21 before chr22, ascending chromStart within contig).  This keeps
+    # regions.npy and input_regions.arrow in the same row order so the
+    # r_idx_map alignment in the test is trivially [0, 1, 2].
+    bed = pl.DataFrame(
+        {
+            "chrom": ["chr21", "chr21", "chr22"],
+            "chromStart": [1000, 5000, 1000],
+            "chromEnd": [1020, 5020, 1020],
+        }
+    )
+
+    out = work_dir / "jittered_ds.gvl"
+    gvl.write(path=out, bed=bed, tracks=track, max_jitter=max_jitter, overwrite=True)
+    return out
+
 
 def build_track_dataset(work_dir: Path) -> Path:
     """Write a small track-only GVL dataset and return its path.
@@ -30,3 +108,174 @@ def build_track_dataset(work_dir: Path) -> Path:
     out = work_dir / "ds.gvl"
     gvl.write(path=out, bed=bed, tracks=track, overwrite=True)
     return out
+
+
+def _make_session_bigwigs(bw_dir: Path, seed: int = 42) -> dict[str, str]:
+    """Write one BigWig per session sample over the session contigs.
+
+    Uses dense, non-overlapping intervals with density=0.05 (one interval
+    every ~20 bp on average) so that synthetic regions of width ~200–2000 bp
+    reliably contain multiple non-zero values.  The function is deterministic
+    given `seed` so repeated calls produce identical files.
+
+    Returns a mapping {sample_name: str(bw_path)}.
+    """
+    bw_dir.mkdir(parents=True, exist_ok=True)
+    header = [(c, length) for c, length in _SESSION_CONTIGS.items()]
+    paths: dict[str, str] = {}
+    for i, sample in enumerate(_SESSION_SAMPLES):
+        rng = np.random.default_rng(seed + i)
+        path = bw_dir / f"{sample}.bw"
+        with pyBigWig.open(str(path), "w") as bw:
+            bw.addHeader(header, maxZooms=0)
+            for contig, length in _SESSION_CONTIGS.items():
+                # ~5 % density → one interval per ~20 bp
+                n = max(2, int(length * 0.05))
+                starts = np.unique(rng.integers(0, length - 1, size=n).astype(np.int64))
+                starts.sort()
+                ends = np.empty_like(starts)
+                ends[:-1] = starts[1:]
+                ends[-1] = min(int(starts[-1]) + 1, length)
+                keep = ends > starts
+                starts, ends = starts[keep], ends[keep]
+                values = rng.standard_normal(len(starts)).astype(np.float32)
+                bw.addEntries(
+                    [contig] * len(starts),
+                    [int(s) for s in starts],
+                    ends=[int(e) for e in ends],
+                    values=[float(v) for v in values],
+                )
+        paths[sample] = str(path)
+    return paths
+
+
+def build_strand_mixed_dataset(work_dir: Path, svar_path: Path) -> Path:
+    """Write a variants+tracks GVL dataset with mixed + and − strand regions.
+
+    Strand layout (index → region → strand):
+      0: chr1:1010685-1010705  strand=+1  (overlaps GAGA→G deletion on chr1)
+      1: chr1:1110686-1110706  strand=−1  (non-vacuity anchor: GAATGTAAGACGCAGCGTGC)
+      2: chr1:1210686-1210706  strand=+1
+      3: chr2:14360-14380      strand=−1
+      4: chr2:1110686-1110706  strand=+1
+
+    Region 1 (the first -strand region) carries a non-palindromic reference
+    sequence so the non-vacuity assertion in
+    ``test_negative_strand_actually_reverse_complements`` reliably fires.
+
+    ``max_jitter=0`` is used here for the simplest deterministic geometry (no
+    jitter expansion, so stored interval starts equal query starts).  The #242
+    boundary condition (stored interval starts preceding the query start) was
+    fixed in both ``intervals_to_tracks`` kernels via the left-clip
+    ``s = max(itv.start - query_start, 0)`` (PR #244; #242 CLOSED).
+    End-to-end max_jitter>0 parity is covered by
+    ``test_tracks_max_jitter_intervals_parity_and_oracle``.
+    """
+    from genoray import SparseVar
+    import polars as pl
+
+    work_dir = Path(work_dir)
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    bw_dir = work_dir / "bw"
+    sample_to_bw = _make_session_bigwigs(bw_dir, seed=42)
+    track = gvl.BigWigs("signal", sample_to_bw)
+    sv = SparseVar(svar_path)
+
+    bed = pl.DataFrame(
+        {
+            "chrom": ["chr1", "chr1", "chr1", "chr2", "chr2"],
+            "chromStart": [1010685, 1110686, 1210686, 14360, 1110686],
+            "chromEnd": [1010705, 1110706, 1210706, 14380, 1110706],
+            "strand": ["+", "-", "+", "-", "+"],
+        }
+    )
+
+    out = work_dir / "strand_ds.gvl"
+    gvl.write(
+        path=out,
+        bed=bed,
+        variants=sv,
+        tracks=track,
+        max_jitter=0,
+        overwrite=True,
+    )
+    return out
+
+
+def build_haps_tracks_dataset(work_dir: Path, svar_path: Path) -> Path:
+    """Write a variants+tracks GVL dataset and return its path.
+
+    Uses the caller-supplied SparseVar file (which must cover chr1/chr2
+    with samples s0/s1/s2, as produced by the session-level build_case
+    fixture).  Synthetic BigWig tracks are written with matching samples
+    and contigs.  The dataset is written with **max_jitter=0** for the
+    simplest deterministic geometry: no jitter expansion, so stored
+    interval starts equal the query starts.  This keeps the fixture
+    focused on what it exists to test — variants (including indels) that
+    trigger ``shift_and_realign_tracks_sparse``.
+
+    #242 / PR #244
+    --------------
+    The boundary condition where stored interval starts precede the query
+    start (``itv.start < query_start``) was root-caused and fixed in both
+    ``intervals_to_tracks`` kernels via the left-clip
+    ``s = max(itv.start - query_start, 0)`` (PR #244; #242 CLOSED).
+    ``max_jitter=0`` here is retained only for the simplest deterministic
+    geometry, not because of any live panic or contract violation.
+    End-to-end max_jitter>0 parity is covered by
+    ``test_tracks_max_jitter_intervals_parity_and_oracle``.
+
+    Returns the path to the written dataset directory.
+    """
+    from genoray import SparseVar
+    import polars as pl
+
+    work_dir = Path(work_dir)
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    # Build BigWigs for the three session samples over chr1/chr2.
+    bw_dir = work_dir / "bw"
+    sample_to_bw = _make_session_bigwigs(bw_dir, seed=42)
+    track = gvl.BigWigs("signal", sample_to_bw)
+
+    # Derive regions from the SparseVar file: one short region per indel
+    # so that we are guaranteed to have indel-bearing regions (which are
+    # needed to exercise the realignment kernel).  Width=200 is wide enough
+    # to overlap several BigWig intervals at density=0.05.
+    sv = SparseVar(svar_path)
+    bed = pl.DataFrame(
+        {
+            "chrom": ["chr1", "chr1", "chr1", "chr2", "chr2"],
+            "chromStart": [
+                1010685,  # overlaps GAGA→G deletion on chr1
+                1110686,  # overlaps A→TTT insertion on chr1
+                1210686,  # overlaps C→G SNP on chr1 (mixed indels)
+                14360,  # overlaps chr2 SNP region
+                1110686,  # chr2 G→A/T multiallelic (indel neighbours)
+            ],
+            "chromEnd": [
+                1010705,
+                1110706,
+                1210706,
+                14380,
+                1110706,
+            ],
+        }
+    )
+
+    out = work_dir / "ds.gvl"
+    # max_jitter=0: simplest deterministic geometry (no jitter expansion).
+    # #242 is fixed via the intervals_to_tracks left-clip (PR #244, #242 CLOSED);
+    # max_jitter=0 here keeps interval starts == query starts for straightforward
+    # indel-realignment testing. See test_tracks_max_jitter_intervals_parity_and_oracle
+    # for max_jitter>0 end-to-end parity coverage.
+    gvl.write(
+        path=out,
+        bed=bed,
+        variants=sv,
+        tracks=track,
+        max_jitter=0,
+        overwrite=True,
+    )
+    return out
diff --git a/tests/parity/_golden.py b/tests/parity/_golden.py
new file mode 100644
index 00000000..4033c39a
--- /dev/null
+++ b/tests/parity/_golden.py
@@ -0,0 +1,436 @@
+# tests/parity/_golden.py
+"""Frozen-golden snapshot + replay for the parity suite.
+
+Goldens are generated from the RUST implementation and cross-checked against
+the numba oracle at generation time (see generate_goldens.py). Replay imports
+rust callables DIRECTLY — never via _dispatch — so these tests survive the
+numba/dispatch deletion in Stage B.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+
+import numpy as np
+from hypothesis import HealthCheck, Phase, given, settings
+
+GOLDEN_DIR = Path(__file__).parent / "golden"
+
+
+def collect_examples(strategy, n: int) -> list:
+    """Deterministically draw ``n`` examples from a hypothesis strategy.
+
+    Derandomized + no database + generate-only phase ⇒ stable across runs for a
+    fixed hypothesis version. Inputs are frozen INTO the golden, so the replay
+    test never re-runs hypothesis.
+    """
+    out: list = []
+
+    @settings(
+        max_examples=n,
+        derandomize=True,
+        database=None,
+        phases=[Phase.generate],
+        suppress_health_check=list(HealthCheck),
+        deadline=None,
+    )
+    @given(strategy)
+    def _collect(ex):
+        if len(out) < n:
+            out.append(ex)
+
+    _collect()
+    return out
+
+
+def save_golden(name: str, cases: list) -> None:
+    GOLDEN_DIR.mkdir(parents=True, exist_ok=True)
+    np.savez_compressed(GOLDEN_DIR / f"{name}.npz", cases=np.array(cases, dtype=object))
+
+
+def load_golden(name: str) -> list:
+    data = np.load(GOLDEN_DIR / f"{name}.npz", allow_pickle=True)
+    return list(data["cases"])
+
+
+# --- direct rust-callable table -------------------------------------------------
+# Each entry MUST equal the `rust=` argument of the matching register(...) call in
+# production. Verify each against the dispatch map before trusting it.
+def _build_rust_kernels() -> dict[str, Callable]:
+    from genvarloader import genvarloader as _ext  # compiled extension
+
+    # Kernels whose registered rust= is a Python wrapper (not a bare FFI function):
+    # import the same wrapper the register() call used.
+    from genvarloader._dataset._reference import (
+        _get_reference_rust,  # wraps _ext.get_reference; normalises dtypes + int(pad_char)
+    )
+    from genvarloader._dataset._tracks import (
+        _shift_and_realign_tracks_sparse_rust_wrapper,  # wraps _ext.shift_and_realign_tracks_sparse
+    )
+
+    from genvarloader._dataset._flat_variants import (
+        _assemble_variant_buffers_rust,  # Python wrapper: routes to u8/i32 by lut dtype
+        _rc_alleles_rust,  # Python wrapper: asserts contiguous uint8 then calls ext
+    )
+
+    # Shim for reconstruct_haplotypes_from_sparse: the FFI now requires `parallel`
+    # but existing replay_inplace callers don't pass it. Default to False (serial)
+    # so existing golden replays are byte-identical to the pre-C1 implementation.
+    # The rayon-equivalence test explicitly passes parallel=True to exercise the
+    # parallel branch.
+    _rhfs_raw = _ext.reconstruct_haplotypes_from_sparse
+
+    def _reconstruct_haplotypes_from_sparse_shim(
+        *args, parallel: bool = False, **kwargs
+    ):
+        return _rhfs_raw(*args, parallel=parallel, **kwargs)
+
+    # Shim for tracks_to_intervals: FFI now requires `parallel` but existing
+    # replay_tuple callers don't pass it. Default to False (serial) so existing
+    # golden replays stay byte-identical. The rayon-equivalence test explicitly
+    # passes parallel=True/False to exercise both branches.
+    _tti_raw = _ext.tracks_to_intervals
+
+    def _tracks_to_intervals_shim(*args, parallel: bool = False, **kwargs):
+        return _tti_raw(*args, parallel=parallel, **kwargs)
+
+    # Shim for intervals_to_tracks: FFI now requires `parallel` but existing
+    # replay_inplace callers don't pass it. Default to False (serial) so
+    # existing golden replays stay byte-identical. The rayon-equivalence test
+    # explicitly passes parallel=True/False to exercise both branches.
+    _itt_raw = _ext.intervals_to_tracks
+
+    def _intervals_to_tracks_shim(*args, parallel: bool = False, **kwargs):
+        return _itt_raw(*args, parallel=parallel, **kwargs)
+
+    # Shim for get_diffs_sparse: FFI now requires `parallel` but existing
+    # replay_tuple callers don't pass it. Default to False (serial) so existing
+    # golden replays stay byte-identical. The rayon-equivalence test explicitly
+    # passes parallel=True/False to exercise both branches.
+    _gds_raw = _ext.get_diffs_sparse
+
+    def _get_diffs_sparse_shim(*args, parallel: bool = False, **kwargs):
+        return _gds_raw(*args, parallel=parallel, **kwargs)
+
+    table: dict[str, Callable] = {
+        "intervals_to_tracks": _intervals_to_tracks_shim,
+        "tracks_to_intervals": _tracks_to_intervals_shim,
+        "get_diffs_sparse": _get_diffs_sparse_shim,
+        "choose_exonic_variants": _ext.choose_exonic_variants,
+        "gather_alleles": _ext.gather_alleles,
+        "gather_rows_i32": _ext.gather_rows_i32,
+        "gather_rows_f32": _ext.gather_rows_f32,
+        "compact_keep_i32": _ext.compact_keep_i32,
+        "compact_keep_f32": _ext.compact_keep_f32,
+        "fill_empty_scalar_i32": _ext.fill_empty_scalar_i32,
+        "fill_empty_scalar_f32": _ext.fill_empty_scalar_f32,
+        "fill_empty_fixed_i32": _ext.fill_empty_fixed_i32,
+        "fill_empty_fixed_f32": _ext.fill_empty_fixed_f32,
+        "fill_empty_seq_u8": _ext.fill_empty_seq_u8,
+        "fill_empty_seq_i32": _ext.fill_empty_seq_i32,
+        # These registered rust= callables are Python wrappers, NOT bare FFI functions.
+        # Using the wrapper ensures correct input normalisation (dtypes, int casts, etc.)
+        # and keeps RUST_KERNELS in sync with the dispatch table.
+        "get_reference": _get_reference_rust,
+        "shift_and_realign_tracks_sparse": _shift_and_realign_tracks_sparse_rust_wrapper,
+        # Shim adds `parallel=False` default so existing replay_inplace callers
+        # (which don't pass parallel) continue to work unchanged.
+        "reconstruct_haplotypes_from_sparse": _reconstruct_haplotypes_from_sparse_shim,
+        # rc_alleles: registered rust= is _rc_alleles_rust (wrapper); use wrapper here.
+        "rc_alleles": _rc_alleles_rust,
+        # assemble_variant_buffers: registered rust= is _assemble_variant_buffers_rust
+        # (dtype-selecting shim: routes to u8/i32 monomorphization by lut dtype).
+        "assemble_variant_buffers": _assemble_variant_buffers_rust,
+    }
+    return table
+
+
+RUST_KERNELS: dict[str, Callable] = _build_rust_kernels()
+
+
+def _eq(name: str, i: int, got, exp) -> None:
+    got = np.asarray(got)
+    exp = np.asarray(exp)
+    assert got.dtype == exp.dtype, f"{name}[{i}]: dtype {got.dtype} != {exp.dtype}"
+    assert got.shape == exp.shape, f"{name}[{i}]: shape {got.shape} != {exp.shape}"
+    np.testing.assert_array_equal(got, exp, err_msg=f"{name}[{i}] value mismatch")
+
+
+def replay_return(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        _eq(f"{name}#{ci}", 0, fn(*inputs), golden)
+
+
+def replay_tuple(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        got = fn(*inputs)
+        got = got if isinstance(got, tuple) else (got,)
+        gold = golden if isinstance(golden, tuple) else (golden,)
+        assert len(got) == len(gold), (
+            f"{name}#{ci}: tuple len {len(got)} != {len(gold)}"
+        )
+        for j, (a, b) in enumerate(zip(got, gold)):
+            _eq(f"{name}#{ci}", j, a, b)
+
+
+def replay_inplace(
+    name: str, cases: list, out_factory: Callable, out_index: int
+) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        out = out_factory(inputs)
+        args = list(inputs)
+        args.insert(out_index, out)
+        fn(*args)
+        _eq(f"{name}#{ci}", 0, out, golden)
+
+
+def replay_dict(name: str, cases: list) -> None:
+    fn = RUST_KERNELS[name]
+    for ci, (inputs, golden) in enumerate(cases):
+        got = fn(*inputs)
+        assert set(got) == set(golden), f"{name}#{ci}: keys {set(got)} != {set(golden)}"
+        for k in sorted(golden):
+            _eq(
+                f"{name}#{ci}:{k}.data",
+                0,
+                np.asarray(got[k][0]),
+                np.asarray(golden[k][0]),
+            )
+            _eq(
+                f"{name}#{ci}:{k}.off",
+                1,
+                np.asarray(got[k][1], np.int64),
+                np.asarray(golden[k][1], np.int64),
+            )
+
+
+# ---------------------------------------------------------------------------
+# Dataset-level output serialization (flatten + compare)
+# ---------------------------------------------------------------------------
+
+
+def flatten_output(out):
+    """Serialize a Dataset.__getitem__ result to a dict of arrays for golden storage.
+
+    Handles:
+      - seqpro.rag.Ragged         → {"kind":"ragged", "data":..., "offsets":...}
+      - RaggedAnnotatedHaps        → {"kind":"annot", "haps_data":..., ...}
+      - RaggedVariants             → {"kind":"ragged_variants", "field_names":[...], "fields":{...}}
+      - _FlatVariantWindows        → {"kind":"flat_variant_windows", "windows":{...}}
+      - plain ndarray              → {"kind":"array", "data":...}
+      - tuple thereof              → {"kind":"tuple", "items":[...]}
+    """
+    from seqpro.rag import Ragged
+    from genvarloader._ragged import RaggedAnnotatedHaps
+
+    # Lazily import to avoid circular imports at module level
+    try:
+        from genvarloader._dataset._rag_variants import (
+            RaggedVariants as _RaggedVariants,
+        )
+    except Exception:
+        _RaggedVariants = None
+
+    try:
+        from genvarloader._dataset._flat_variants import _FlatVariantWindows as _FVW
+    except Exception:
+        _FVW = None
+
+    # RaggedAnnotatedHaps must come before Ragged (it's a subclass of Ragged)
+    if isinstance(out, RaggedAnnotatedHaps):
+        return {
+            "kind": "annot",
+            "haps_data": np.asarray(out.haps.data),
+            "haps_offsets": np.asarray(out.haps.offsets, np.int64),
+            "var_idxs_data": np.asarray(out.var_idxs.data),
+            "var_idxs_offsets": np.asarray(out.var_idxs.offsets, np.int64),
+            "ref_coords_data": np.asarray(out.ref_coords.data),
+            "ref_coords_offsets": np.asarray(out.ref_coords.offsets, np.int64),
+        }
+
+    # RaggedVariants must come before Ragged (it's a subclass)
+    if _RaggedVariants is not None and isinstance(out, _RaggedVariants):
+        flat_fields: dict = {}
+        for fname in out.fields:
+            f = out[fname]
+            is_str = bool(getattr(f, "is_string", False))
+            flat_fields[fname] = {
+                "is_string": is_str,
+                "data": np.asarray(f.data, dtype="S1")
+                if is_str
+                else np.asarray(f.data),
+                "offsets": np.asarray(f.offsets, np.int64),
+            }
+        return {
+            "kind": "ragged_variants",
+            "field_names": list(out.fields),
+            "fields": flat_fields,
+        }
+
+    if _FVW is not None and isinstance(out, _FVW):
+        flat_wins: dict = {}
+        for wname in ("ref_window", "alt_window", "ref", "alt"):
+            w = getattr(out, wname, None)
+            if w is not None:
+                flat_wins[wname] = {
+                    "data": np.asarray(w.data),
+                    "seq_offsets": np.asarray(w.seq_offsets, np.int64),
+                    "var_offsets": np.asarray(w.var_offsets, np.int64),
+                }
+        return {"kind": "flat_variant_windows", "windows": flat_wins}
+
+    if isinstance(out, Ragged):
+        return {
+            "kind": "ragged",
+            "data": np.asarray(out.data),
+            "offsets": np.asarray(out.offsets, np.int64),
+        }
+
+    if isinstance(out, tuple):
+        return {"kind": "tuple", "items": [flatten_output(o) for o in out]}
+
+    return {"kind": "array", "data": np.asarray(out)}
+
+
+def _assert_flat_eq(got_flat, exp_flat, name: str) -> None:
+    """Recursively assert two flattened dicts are byte-identical."""
+    got_kind = (
+        got_flat["kind"] if isinstance(got_flat, dict) else type(got_flat).__name__
+    )
+    exp_kind = (
+        exp_flat["kind"] if isinstance(exp_flat, dict) else type(exp_flat).__name__
+    )
+    assert got_kind == exp_kind, f"{name}: kind {got_kind!r} != {exp_kind!r}"
+    kind = got_flat["kind"]
+
+    if kind == "ragged":
+        _eq(name + ".data", 0, got_flat["data"], exp_flat["data"])
+        _eq(name + ".offsets", 0, got_flat["offsets"], exp_flat["offsets"])
+
+    elif kind == "annot":
+        for key in (
+            "haps_data",
+            "haps_offsets",
+            "var_idxs_data",
+            "var_idxs_offsets",
+            "ref_coords_data",
+            "ref_coords_offsets",
+        ):
+            _eq(f"{name}.{key}", 0, got_flat[key], exp_flat[key])
+
+    elif kind == "array":
+        _eq(name + ".data", 0, got_flat["data"], exp_flat["data"])
+
+    elif kind == "tuple":
+        gi, ei = got_flat["items"], exp_flat["items"]
+        assert len(gi) == len(ei), f"{name}: tuple len {len(gi)} != {len(ei)}"
+        for i, (g, e) in enumerate(zip(gi, ei)):
+            _assert_flat_eq(g, e, f"{name}[{i}]")
+
+    elif kind == "ragged_variants":
+        gf, ef = got_flat["fields"], exp_flat["fields"]
+        assert set(gf) == set(ef), f"{name}: field names {set(gf)} != {set(ef)}"
+        for fname in ef:
+            g, e = gf[fname], ef[fname]
+            assert g["is_string"] == e["is_string"], (
+                f"{name}.{fname}: is_string mismatch"
+            )
+            _eq(f"{name}.{fname}.data", 0, g["data"], e["data"])
+            _eq(f"{name}.{fname}.offsets", 0, g["offsets"], e["offsets"])
+
+    elif kind == "flat_variant_windows":
+        gw, ew = got_flat["windows"], exp_flat["windows"]
+        assert set(gw) == set(ew), f"{name}: windows {set(gw)} != {set(ew)}"
+        for wname in ew:
+            g, e = gw[wname], ew[wname]
+            _eq(f"{name}.{wname}.data", 0, g["data"], e["data"])
+            _eq(f"{name}.{wname}.seq_offsets", 0, g["seq_offsets"], e["seq_offsets"])
+            _eq(f"{name}.{wname}.var_offsets", 0, g["var_offsets"], e["var_offsets"])
+
+    else:
+        raise ValueError(f"Unknown kind {kind!r}")
+
+
+def assert_output_matches_golden(out, golden) -> None:
+    """Assert a fresh Dataset output equals a frozen golden (byte-identical)."""
+    got_flat = flatten_output(out)
+    _assert_flat_eq(got_flat, golden, "output")
+
+
+def save_flat_golden(name: str, out) -> None:
+    """Flatten ``out`` and save as a single-item golden for dataset-level replay."""
+    save_golden(name, [flatten_output(out)])
+
+
+def load_flat_golden(name: str):
+    """Load a single flattened dataset golden saved via ``save_flat_golden``."""
+    return load_golden(name)[0]
+
+
+def make_kernel_spy(kernel_name: str):
+    """Install a counting spy on the direct rust callable at its production call site.
+
+    Returns ``(spy_fn, calls_dict, restore_fn)``. Call ``restore_fn()`` to undo.
+    """
+    import importlib
+
+    # Each entry is (primary_module, attr_name, [extra_modules_to_also_patch]).
+    # Extra modules have the same attr bound via a direct import; we must patch
+    # each alias so the spy intercepts all call sites.
+    _KERNEL_SITES: dict[str, tuple[str, str, list[str]]] = {
+        "get_reference": (
+            "genvarloader._dataset._reference",
+            "_get_reference_rust",
+            [],
+        ),
+        "assemble_variant_buffers": (
+            "genvarloader._dataset._flat_variants",
+            "_assemble_variant_buffers_rust",
+            [],
+        ),
+        "gather_rows_i32": (
+            "genvarloader._dataset._flat_variants",
+            "_gather_rows_i32_rust",
+            [],
+        ),
+        "compact_keep_i32": (
+            "genvarloader._dataset._flat_variants",
+            "_compact_keep_i32_rust",
+            [],
+        ),
+        "rc_alleles": (
+            "genvarloader._dataset._flat_variants",
+            "_rc_alleles_rust",
+            ["genvarloader._dataset._rag_variants"],
+        ),
+    }
+
+    if kernel_name not in _KERNEL_SITES:
+        raise KeyError(
+            f"make_kernel_spy: no site registered for {kernel_name!r}; known: {sorted(_KERNEL_SITES)}"
+        )
+
+    mod_name, attr_name, extra_mod_names = _KERNEL_SITES[kernel_name]
+    mod = importlib.import_module(mod_name)
+    orig = getattr(mod, attr_name)
+    calls: dict = {"n": 0}
+
+    def spy(*a, **k):
+        calls["n"] += 1
+        return orig(*a, **k)
+
+    setattr(mod, attr_name, spy)
+    extra_mods = [importlib.import_module(m) for m in extra_mod_names]
+    for em in extra_mods:
+        setattr(em, attr_name, spy)
+
+    def restore():
+        setattr(mod, attr_name, orig)
+        for em in extra_mods:
+            setattr(em, attr_name, orig)
+
+    return spy, calls, restore
diff --git a/tests/parity/_harness.py b/tests/parity/_harness.py
deleted file mode 100644
index 3fc77557..00000000
--- a/tests/parity/_harness.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Run both registered backends and assert byte-identical output."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from genvarloader import _dispatch
-
-
-def assert_kernel_parity(name: str, *inputs) -> None:
-    numba_fn, rust_fn = _dispatch.backends(name)
-    got_numba = numba_fn(*inputs)
-    got_rust = rust_fn(*inputs)
-    assert got_numba.dtype == got_rust.dtype, (
-        f"{name}: dtype {got_numba.dtype} != {got_rust.dtype}"
-    )
-    assert got_numba.shape == got_rust.shape, (
-        f"{name}: shape {got_numba.shape} != {got_rust.shape}"
-    )
-    np.testing.assert_array_equal(got_numba, got_rust)
-
-
-def assert_inplace_kernel_parity(name, inputs, out_factory, out_index) -> None:
-    """Parity for kernels that WRITE an output buffer in place (return None).
-
-    ``inputs`` is the read-only argument tuple WITHOUT the out buffer. A fresh
-    out buffer is built per backend via ``out_factory()`` and inserted at
-    positional ``out_index``. Asserts the two written buffers are byte-identical.
-    """
-    numba_fn, rust_fn = _dispatch.backends(name)
-
-    out_numba = out_factory()
-    args = list(inputs)
-    args.insert(out_index, out_numba)
-    numba_fn(*args)
-
-    out_rust = out_factory()
-    args = list(inputs)
-    args.insert(out_index, out_rust)
-    rust_fn(*args)
-
-    assert out_numba.dtype == out_rust.dtype, (
-        f"{name}: dtype {out_numba.dtype} != {out_rust.dtype}"
-    )
-    assert out_numba.shape == out_rust.shape, (
-        f"{name}: shape {out_numba.shape} != {out_rust.shape}"
-    )
-    np.testing.assert_array_equal(out_numba, out_rust)
diff --git a/tests/parity/generate_goldens.py b/tests/parity/generate_goldens.py
new file mode 100644
index 00000000..7b711419
--- /dev/null
+++ b/tests/parity/generate_goldens.py
@@ -0,0 +1,669 @@
+# tests/parity/generate_goldens.py
+"""Regenerate frozen golden fixtures for the parity suite.
+
+RUN MANUALLY while numba is still installed (Stage A):
+    pixi run -e dev python -m tests.parity.generate_goldens
+
+For each kernel: draw N deterministic examples, compute the golden from RUST,
+and assert the numba oracle agrees BEFORE saving.
+
+*** DANGER (post-W5): numba was DELETED in W5. Re-running this script now freezes
+rust == rust with NO oracle cross-check — a silent rust==rust freeze that defeats
+the parity contract. Only regenerate on a numba-PRESENT checkout (a commit at or
+before the Stage-A snapshot, with numba installed), or the goldens are meaningless. ***
+
+Verified signatures / out_index values (ground-truthed against existing parity tests):
+
+intervals_to_tracks (test_intervals_to_tracks_parity.py):
+  Strategy yields 7-tuple: (offset_idxs, starts, itv_starts, itv_ends, itv_values,
+    itv_offsets, out_offsets). out_index=6; out dtype float32; size=int(inp[6][-1]).
+  Confirmed: assert_inplace_kernel_parity("intervals_to_tracks", inputs, ..., out_index=6).
+  Brief placeholder (out_index=7) was wrong.
+
+shift_and_realign_tracks_sparse (test_shift_and_realign_tracks_parity.py):
+  Strategy yields (total_out, inputs_tuple); out=np.zeros(total_out, f32) at index 0.
+  Registered rust= is _shift_and_realign_tracks_sparse_rust_wrapper (Python wrapper).
+
+reconstruct_haplotypes_from_sparse (test_reconstruct_haplotypes_parity.py):
+  Strategy yields (total_out, inputs_tuple); out=np.zeros(total_out, u8) at index 0.
+  Registered rust= is _ext.reconstruct_haplotypes_from_sparse (bare FFI).
+
+get_diffs_sparse, choose_exonic_variants, gather_rows_i32/f32:
+  Require _as_starts_stops(offsets) normalisation; confirmed via test_flat_variants_parity.py
+  and test_get_diffs_sparse_parity.py / test_choose_exonic_variants_parity.py.
+
+gather_alleles: requires ascontiguousarray on all inputs.
+
+fill_empty_scalar_i32/f32: fill arg must be Python int/float (not np.scalar).
+fill_empty_fixed_i32/f32: inner and fill args must be Python int/float.
+  Confirmed via _fill_empty_scalar / _fill_empty_fixed public wrapper source.
+
+get_reference: registered rust= is _get_reference_rust wrapper (normalises dtypes,
+  converts pad_char to int). RUST_KERNELS entry updated in _golden.py to match.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+try:
+    from genvarloader import _dispatch
+except ImportError:
+    _dispatch = None
+
+from genvarloader._dataset._genotypes import _as_starts_stops
+from tests.parity import _golden, strategies
+
+RETURN, TUPLE, INPLACE = "return", "tuple", "inplace"
+
+
+# ---------------------------------------------------------------------------
+# Input normalizers — mirror what the existing parity tests pass to kernels.
+# Each function takes the raw strategy output and returns a normalised tuple.
+# ---------------------------------------------------------------------------
+
+
+def _pre_get_diffs_sparse(inp):
+    """Normalise offsets to (2,n) int64 and ensure all arrays are contiguous."""
+    goi, gvi, offsets, ilens, keep, keep_off, qs, qe, vs = inp
+    return (
+        np.ascontiguousarray(goi, np.int64),
+        np.ascontiguousarray(gvi, np.int32),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(ilens, np.int32),
+        None if keep is None else np.ascontiguousarray(keep, np.bool_),
+        None if keep_off is None else np.ascontiguousarray(keep_off, np.int64),
+        None if qs is None else np.ascontiguousarray(qs, np.int32),
+        None if qe is None else np.ascontiguousarray(qe, np.int32),
+        None if vs is None else np.ascontiguousarray(vs, np.int32),
+    )
+
+
+def _pre_choose_exonic(inp):
+    qs, qe, goi, gvi, offsets, vs, ilens = inp
+    return (
+        np.ascontiguousarray(qs, np.int32),
+        np.ascontiguousarray(qe, np.int32),
+        np.ascontiguousarray(goi, np.int64),
+        np.ascontiguousarray(gvi, np.int32),
+        _as_starts_stops(offsets),
+        np.ascontiguousarray(vs, np.int32),
+        np.ascontiguousarray(ilens, np.int32),
+    )
+
+
+def _pre_gather_rows(inp):
+    goi, off, data = inp
+    return (
+        np.ascontiguousarray(goi, np.int64),
+        _as_starts_stops(off),
+        np.ascontiguousarray(data),
+    )
+
+
+def _pre_gather_alleles(inp):
+    v_idxs, allele_bytes, allele_offsets = inp
+    return (
+        np.ascontiguousarray(v_idxs, np.int32),
+        np.ascontiguousarray(allele_bytes, np.uint8),
+        np.ascontiguousarray(allele_offsets, np.int64),
+    )
+
+
+def _pre_fill_empty_scalar_i32(inp):
+    data, offsets, fill = inp
+    return (data, offsets, int(fill))
+
+
+def _pre_fill_empty_scalar_f32(inp):
+    data, offsets, fill = inp
+    return (data, offsets, float(fill))
+
+
+def _pre_fill_empty_fixed_i32(inp):
+    data, offsets, inner, fill = inp
+    return (data, offsets, int(inner), int(fill))
+
+
+def _pre_fill_empty_fixed_f32(inp):
+    data, offsets, inner, fill = inp
+    return (data, offsets, int(inner), float(fill))
+
+
+# ---------------------------------------------------------------------------
+# Kernel registry
+# ---------------------------------------------------------------------------
+
+# SPEC: (name, strategy, shape, n, preprocess_fn)
+#   shape   = RETURN | TUPLE — how the rust callable returns its result
+#   preprocess_fn: callable(raw_inp) → normalised_inp, or None for no-op
+SPEC: list[tuple] = [
+    (
+        "get_diffs_sparse",
+        strategies.get_diffs_sparse_inputs(),
+        TUPLE,
+        200,
+        _pre_get_diffs_sparse,
+    ),
+    (
+        "choose_exonic_variants",
+        strategies.choose_exonic_variants_inputs(),
+        TUPLE,
+        200,
+        _pre_choose_exonic,
+    ),
+    (
+        "gather_rows_i32",
+        strategies.gather_rows_inputs(np.int32),
+        TUPLE,
+        100,
+        _pre_gather_rows,
+    ),
+    (
+        "gather_rows_f32",
+        strategies.gather_rows_inputs(np.float32),
+        TUPLE,
+        100,
+        _pre_gather_rows,
+    ),
+    (
+        "gather_alleles",
+        strategies.gather_alleles_inputs(),
+        TUPLE,
+        100,
+        _pre_gather_alleles,
+    ),
+    ("compact_keep_i32", strategies.compact_keep_inputs(np.int32), TUPLE, 100, None),
+    ("compact_keep_f32", strategies.compact_keep_inputs(np.float32), TUPLE, 100, None),
+    (
+        "fill_empty_scalar_i32",
+        strategies.fill_empty_scalar_inputs(np.int32),
+        TUPLE,
+        100,
+        _pre_fill_empty_scalar_i32,
+    ),
+    (
+        "fill_empty_scalar_f32",
+        strategies.fill_empty_scalar_inputs(np.float32),
+        TUPLE,
+        100,
+        _pre_fill_empty_scalar_f32,
+    ),
+    (
+        "fill_empty_fixed_i32",
+        strategies.fill_empty_fixed_inputs(np.int32),
+        TUPLE,
+        100,
+        _pre_fill_empty_fixed_i32,
+    ),
+    (
+        "fill_empty_fixed_f32",
+        strategies.fill_empty_fixed_inputs(np.float32),
+        TUPLE,
+        100,
+        _pre_fill_empty_fixed_f32,
+    ),
+    ("fill_empty_seq_u8", strategies.fill_empty_seq_inputs(np.uint8), TUPLE, 100, None),
+    (
+        "fill_empty_seq_i32",
+        strategies.fill_empty_seq_inputs(np.int32),
+        TUPLE,
+        100,
+        None,
+    ),
+    ("tracks_to_intervals", strategies.tracks_to_intervals_inputs(), TUPLE, 200, None),
+    ("get_reference", strategies.get_reference_inputs(), RETURN, 200, None),
+]
+
+# INPLACE_SPEC: (name, strategy, n, out_factory, out_index)
+#   For shift_and_realign and reconstruct: strategy yields (total_out, inputs_tuple),
+#     out_factory receives total_out (scalar), out inserted at index 0.
+#   For intervals_to_tracks: strategy yields 7-tuple directly, out_factory receives
+#     the inputs tuple, out inserted at index 6 (verified: assert_inplace_kernel_parity
+#     in test_intervals_to_tracks_parity.py uses out_index=6, NOT 7).
+INPLACE_SPEC: list[tuple] = [
+    (
+        "intervals_to_tracks",
+        strategies.intervals_to_tracks_inputs(),
+        200,
+        # inp[6] = out_offsets; inp[6][-1] = total output length.
+        # NaN sentinel: unwritten positions stay NaN and are caught by oracle.
+        lambda inp: np.full(int(inp[6][-1]), np.nan, np.float32),
+        6,  # out is inserted before out_offsets (the 7th element)
+    ),
+    (
+        "shift_and_realign_tracks_sparse",
+        strategies.shift_and_realign_tracks_inputs(),
+        200,
+        lambda total_out: np.zeros(total_out, np.float32),
+        0,
+    ),
+    (
+        "reconstruct_haplotypes_from_sparse",
+        strategies.reconstruct_haplotypes_inputs(),
+        200,
+        lambda total_out: np.zeros(total_out, np.uint8),
+        0,
+    ),
+]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _normalize(out):
+    """Normalise kernel output to ndarray or tuple of ndarrays for comparison."""
+    if isinstance(out, tuple):
+        return tuple(np.asarray(x) for x in out)
+    if isinstance(out, dict):
+        return {k: (np.asarray(v[0]), np.asarray(v[1])) for k, v in out.items()}
+    return np.asarray(out)
+
+
+def _assert_oracle(name: str, a, b) -> None:
+    """Assert numba (a) == rust (b); both already normalised.
+
+    If this fires it is a REAL numba/rust divergence — do NOT suppress it.
+    See the numba-oracle-bug policy: determine whether numba is the buggy side,
+    file a separate issue, and block this golden until the divergence is resolved.
+    """
+    if isinstance(a, tuple):
+        assert len(a) == len(b), f"{name}: tuple len {len(a)} != {len(b)}"
+        for i, (x, y) in enumerate(zip(a, b)):
+            np.testing.assert_array_equal(x, y, err_msg=f"{name}[{i}] oracle mismatch")
+    elif isinstance(a, dict):
+        assert set(a) == set(b), f"{name}: dict keys mismatch {set(a)} vs {set(b)}"
+        for k in a:
+            np.testing.assert_array_equal(a[k][0], b[k][0])
+            np.testing.assert_array_equal(
+                np.asarray(a[k][1], np.int64), np.asarray(b[k][1], np.int64)
+            )
+    else:
+        np.testing.assert_array_equal(a, b, err_msg=f"{name} oracle mismatch")
+
+
+def _have_numba(name: str) -> bool:
+    if _dispatch is None:
+        return False
+    try:
+        _dispatch.backends(name)
+        return True
+    except Exception:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# Generators
+# ---------------------------------------------------------------------------
+
+
+def gen_value_kernels() -> None:
+    for name, strat, shape, n, preprocess in SPEC:
+        examples = _golden.collect_examples(strat, n)
+        rust = _golden.RUST_KERNELS[name]
+        nb_fn = _dispatch.backends(name)[0] if _have_numba(name) else None
+        cases = []
+        for raw_inp in examples:
+            inp = preprocess(raw_inp) if preprocess is not None else raw_inp
+            r = _normalize(rust(*inp))
+            if nb_fn is not None:
+                _assert_oracle(name, _normalize(nb_fn(*inp)), r)
+            cases.append((inp, r))
+        _golden.save_golden(name, cases)
+        print(f"  {name}: {len(cases)} cases")
+
+
+def gen_inplace_kernels() -> None:
+    for name, strat, n, out_factory, out_index in INPLACE_SPEC:
+        examples = _golden.collect_examples(strat, n)
+        rust = _golden.RUST_KERNELS[name]
+        nb_fn = _dispatch.backends(name)[0] if _have_numba(name) else None
+        cases = []
+        for ex in examples:
+            # shift/reconstruct strategies yield (total_out, inputs_tuple);
+            # intervals_to_tracks yields the 7-element inputs tuple directly.
+            if isinstance(ex, tuple) and len(ex) == 2 and np.isscalar(ex[0]):
+                total_out, inputs = ex
+
+                def of(_inp, t=total_out):
+                    return out_factory(t)
+            else:
+                inputs = ex
+                of = out_factory
+            # Run Rust kernel on a fresh out buffer
+            out_r = of(inputs)
+            args = list(inputs)
+            args.insert(out_index, out_r)
+            rust(*args)
+            # Cross-check against numba oracle — STOP if mismatch (not suppressed)
+            if nb_fn is not None:
+                out_n = of(inputs)
+                args_n = list(inputs)
+                args_n.insert(out_index, out_n)
+                nb_fn(*args_n)
+                np.testing.assert_array_equal(
+                    out_n, out_r, err_msg=f"{name} oracle mismatch"
+                )
+            cases.append((inputs, np.asarray(out_r)))
+        _golden.save_golden(name, cases)
+        print(f"  {name}: {len(cases)} cases")
+
+
+# ---------------------------------------------------------------------------
+# PRNG primitives (xorshift64 / hash4): deterministic scalar table
+# ---------------------------------------------------------------------------
+
+UINT64_MAX = 2**64 - 1
+
+
+def gen_prng() -> None:
+    """Freeze xorshift64 and hash4 golden tables.
+
+    Deterministic inputs; no hypothesis required here — we pick a fixed list of
+    representative uint64 values and cross-check rust vs numba at generation time.
+    """
+    from genvarloader._dataset._tracks import _hash4 as _hash4_numba
+    from genvarloader._dataset._tracks import _xorshift64 as _xorshift64_numba
+    from genvarloader.genvarloader import _debug_hash4 as _hash4_rust
+    from genvarloader.genvarloader import _debug_xorshift64 as _xorshift64_rust
+
+    # Representative uint64 inputs: 0, 1, small values, mid-range, near-max.
+    xs_inputs: list[int] = [
+        0,
+        1,
+        2,
+        42,
+        255,
+        256,
+        65535,
+        65536,
+        0xDEAD,
+        0xBEEF,
+        0xDEADBEEF,
+        0xCAFEBABEDEAD,
+        2**32 - 1,
+        2**32,
+        2**48,
+        2**63 - 1,
+        2**63,
+        UINT64_MAX - 1,
+        UINT64_MAX,
+    ] + list(range(1000, 1100))  # 100 sequential values for sequential patterns
+
+    xs_cases = []
+    for x in xs_inputs:
+        rust_out = int(_xorshift64_rust(x))
+        numba_out = int(_xorshift64_numba(np.uint64(x)))
+        if rust_out != numba_out:
+            raise AssertionError(
+                f"xorshift64({x:#x}): rust={rust_out:#x} numba={numba_out:#x}"
+            )
+        xs_cases.append(((x,), np.uint64(rust_out)))
+    _golden.save_golden("prng_xorshift64", xs_cases)
+    print(f"  prng_xorshift64: {len(xs_cases)} cases")
+
+    # hash4: representative (a, b, c, d) quadruples.
+    h4_quads: list[tuple[int, int, int, int]] = [
+        (0, 0, 0, 0),
+        (1, 2, 3, 4),
+        (0xDEADBEEF, 0xCAFE, 0xBABE, 1),
+        (UINT64_MAX, UINT64_MAX, UINT64_MAX, UINT64_MAX),
+        (2**63, 0, 0, 0),
+        (1, 0, 0, 0),
+        (0, 1, 0, 0),
+        (0, 0, 1, 0),
+        (0, 0, 0, 1),
+        (42, 43, 44, 45),
+        (2**32, 2**32 + 1, 2**32 + 2, 2**32 + 3),
+    ] + [(i, i + 1, i + 2, i + 3) for i in range(100, 150)]
+
+    h4_cases = []
+    for a, b, c, d in h4_quads:
+        rust_out = int(_hash4_rust(a, b, c, d))
+        numba_out = int(
+            _hash4_numba(np.uint64(a), np.uint64(b), np.uint64(c), np.uint64(d))
+        )
+        if rust_out != numba_out:
+            raise AssertionError(
+                f"hash4({a:#x},{b:#x},{c:#x},{d:#x}): rust={rust_out:#x} numba={numba_out:#x}"
+            )
+        h4_cases.append(((a, b, c, d), np.uint64(rust_out)))
+    _golden.save_golden("prng_hash4", h4_cases)
+    print(f"  prng_hash4: {len(h4_cases)} cases")
+
+
+# ---------------------------------------------------------------------------
+# rc_alleles: freeze in-place RC golden
+# ---------------------------------------------------------------------------
+
+
+def _rc_alleles_batch_strategy():
+    """Composite strategy mirroring the test_rc_alleles_parity._allele_batch."""
+    from hypothesis import strategies as st
+
+    _ACGTN = np.frombuffer(b"ACGTN", np.uint8)
+
+    @st.composite
+    def _allele_batch(draw):
+        n_rows = draw(st.integers(1, 4))
+        alleles_per_row = [draw(st.integers(0, 3)) for _ in range(n_rows)]
+        var_offsets = np.concatenate([[0], np.cumsum(alleles_per_row)]).astype(np.int64)
+        n_alleles = int(var_offsets[-1])
+        lens = [draw(st.integers(0, 5)) for _ in range(n_alleles)]
+        seq_offsets = np.concatenate([[0], np.cumsum(lens)]).astype(np.int64)
+        total = int(seq_offsets[-1])
+        data = (
+            _ACGTN[draw(st.lists(st.integers(0, 4), min_size=total, max_size=total))]
+            if total
+            else np.zeros(0, np.uint8)
+        )
+        data = np.ascontiguousarray(data, np.uint8)
+        mask = np.array([draw(st.booleans()) for _ in range(n_rows)], np.bool_)
+        return data, seq_offsets, var_offsets, mask
+
+    return _allele_batch()
+
+
+def gen_rc_alleles() -> None:
+    """Freeze rc_alleles golden: store (initial_byte_data, seq_off, var_off, mask) → result."""
+    nb_fn = _dispatch.backends("rc_alleles")[0] if _have_numba("rc_alleles") else None
+    rust_fn = _golden.RUST_KERNELS["rc_alleles"]
+    strat = _rc_alleles_batch_strategy()
+    examples = _golden.collect_examples(strat, 200)
+    cases = []
+    for raw in examples:
+        data, seq_offsets, var_offsets, mask = raw
+        # Normalise inputs (mirrors _rc_alleles_rust wrapper requirements)
+        data = np.ascontiguousarray(data, np.uint8)
+        seq_offsets = np.ascontiguousarray(seq_offsets, np.int64)
+        var_offsets = np.ascontiguousarray(var_offsets, np.int64)
+        mask = np.ascontiguousarray(mask, np.bool_)
+
+        # Run Rust on a copy (in-place mutation)
+        buf_r = data.copy()
+        rust_fn(buf_r, seq_offsets, var_offsets, mask)
+
+        # Cross-check against numba oracle
+        if nb_fn is not None:
+            buf_n = data.copy()
+            nb_fn(buf_n, seq_offsets, var_offsets, mask)
+            np.testing.assert_array_equal(
+                buf_n, buf_r, err_msg="rc_alleles oracle mismatch"
+            )
+
+        # Store: inputs include initial data so replay can copy it
+        cases.append(((data, seq_offsets, var_offsets, mask), buf_r))
+
+    _golden.save_golden("rc_alleles", cases)
+    print(f"  rc_alleles: {len(cases)} cases")
+
+
+# ---------------------------------------------------------------------------
+# assemble_variant_buffers: freeze fixed parametrised cases
+# ---------------------------------------------------------------------------
+
+
+def gen_assemble_variant_buffers() -> None:
+    """Freeze all parametrised assemble_variant_buffers cases.
+
+    Mirrors the exact inputs from test_assemble_variant_buffers_parity.py so the
+    golden covers the same mode matrix without re-running numba at test time.
+    """
+    nb_fn = (
+        _dispatch.backends("assemble_variant_buffers")[0]
+        if _have_numba("assemble_variant_buffers")
+        else None
+    )
+    rust_fn = _golden.RUST_KERNELS["assemble_variant_buffers"]
+
+    def _reference():
+        bases = np.frombuffer(b"ACGT", np.uint8)
+        ref = np.tile(bases, 10).astype(np.uint8)
+        ref_offsets = np.array([0, ref.size], np.int64)
+        return ref, ref_offsets
+
+    def _lut(dtype):
+        lut = np.full(256, 4, dtype)
+        for i, b in enumerate(b"ACGT"):
+            lut[b] = i
+        return lut
+
+    def _globals():
+        alt_data = np.frombuffer(b"ACGT", np.uint8)
+        alt_off = np.array([0, 1, 3, 4], np.int64)
+        ref_data = np.frombuffer(b"CGAA", np.uint8)
+        ref_off = np.array([0, 1, 2, 4], np.int64)
+        v_starts = np.array([5, 12, 20], np.int32)
+        ilens = np.array([0, -1, 1], np.int32)
+        return alt_data, alt_off, ref_data, ref_off, v_starts, ilens
+
+    cases = []
+
+    ref, ref_offsets = _reference()
+    alt_data, alt_off, ref_data, ref_off, v_starts, ilens = _globals()
+
+    # test_windows_mode_matrix: tok_dtype × (ref_mode, alt_mode)
+    for tok_dtype in [np.uint8, np.int32]:
+        for ref_mode, alt_mode in [(1, 1), (1, 2), (2, 1), (2, 2)]:
+            lut = _lut(tok_dtype)
+            v_idxs = np.array([0, 1, 2], np.int32)
+            row_offsets = np.array([0, 3], np.int64)
+            v_contigs = np.zeros(3, np.int32)
+            inp = (
+                1,
+                v_idxs,
+                row_offsets,
+                alt_data,
+                alt_off,
+                ref_data,
+                ref_off,
+                False,
+                False,
+                ref_mode,
+                alt_mode,
+                2,
+                lut,
+                v_contigs,
+                v_starts,
+                ilens,
+                ref,
+                ref_offsets,
+                ord("N"),
+            )
+            r = _normalize(rust_fn(*inp))
+            if nb_fn is not None:
+                _assert_oracle(
+                    "assemble_variant_buffers/windows", _normalize(nb_fn(*inp)), r
+                )
+            cases.append((inp, r))
+
+    # test_variants_mode_matrix: tok_dtype × (want_ref, want_flank)
+    for tok_dtype in [np.uint8, np.int32]:
+        for want_ref, want_flank in [
+            (False, False),
+            (True, False),
+            (False, True),
+            (True, True),
+        ]:
+            lut = _lut(tok_dtype) if want_flank else None
+            v_idxs = np.array([2, 0, 1], np.int32)
+            row_offsets = np.array([0, 1, 3], np.int64)
+            v_contigs = np.zeros(3, np.int32)
+            inp = (
+                0,
+                v_idxs,
+                row_offsets,
+                alt_data,
+                alt_off,
+                ref_data,
+                ref_off,
+                want_ref,
+                want_flank,
+                0,
+                0,
+                2,
+                lut,
+                v_contigs,
+                v_starts,
+                ilens,
+                ref,
+                ref_offsets,
+                ord("N"),
+            )
+            r = _normalize(rust_fn(*inp))
+            if nb_fn is not None:
+                _assert_oracle(
+                    "assemble_variant_buffers/variants", _normalize(nb_fn(*inp)), r
+                )
+            cases.append((inp, r))
+
+    # test_empty_selection: (mode, ref_mode, alt_mode)
+    for mode, ref_mode, alt_mode in [(0, 0, 0), (1, 1, 1)]:
+        lut = _lut(np.uint8)
+        v_idxs = np.array([], np.int32)
+        row_offsets = np.array([0, 0], np.int64)
+        v_contigs = np.array([], np.int32)
+        inp = (
+            mode,
+            v_idxs,
+            row_offsets,
+            alt_data,
+            alt_off,
+            ref_data,
+            ref_off,
+            False,
+            (mode == 0),
+            ref_mode,
+            alt_mode,
+            2,
+            lut,
+            v_contigs,
+            v_starts,
+            ilens,
+            ref,
+            ref_offsets,
+            ord("N"),
+        )
+        r = _normalize(rust_fn(*inp))
+        if nb_fn is not None:
+            _assert_oracle("assemble_variant_buffers/empty", _normalize(nb_fn(*inp)), r)
+        cases.append((inp, r))
+
+    _golden.save_golden("assemble_variant_buffers", cases)
+    print(f"  assemble_variant_buffers: {len(cases)} cases")
+
+
+if __name__ == "__main__":
+    print("Generating value-kernel goldens...")
+    gen_value_kernels()
+    print("Generating in-place-kernel goldens...")
+    gen_inplace_kernels()
+    print("Generating PRNG goldens...")
+    gen_prng()
+    print("Generating rc_alleles golden...")
+    gen_rc_alleles()
+    print("Generating assemble_variant_buffers golden...")
+    gen_assemble_variant_buffers()
+    print("Done.")
diff --git a/tests/parity/golden/.gitkeep b/tests/parity/golden/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/parity/golden/assemble_variant_buffers.npz b/tests/parity/golden/assemble_variant_buffers.npz
new file mode 100644
index 00000000..66a74e9c
Binary files /dev/null and b/tests/parity/golden/assemble_variant_buffers.npz differ
diff --git a/tests/parity/golden/choose_exonic_variants.npz b/tests/parity/golden/choose_exonic_variants.npz
new file mode 100644
index 00000000..0a446b27
Binary files /dev/null and b/tests/parity/golden/choose_exonic_variants.npz differ
diff --git a/tests/parity/golden/compact_keep_f32.npz b/tests/parity/golden/compact_keep_f32.npz
new file mode 100644
index 00000000..9fe00c48
Binary files /dev/null and b/tests/parity/golden/compact_keep_f32.npz differ
diff --git a/tests/parity/golden/compact_keep_i32.npz b/tests/parity/golden/compact_keep_i32.npz
new file mode 100644
index 00000000..fd58048b
Binary files /dev/null and b/tests/parity/golden/compact_keep_i32.npz differ
diff --git a/tests/parity/golden/ds_annotated_mode.npz b/tests/parity/golden/ds_annotated_mode.npz
new file mode 100644
index 00000000..b51322d0
Binary files /dev/null and b/tests/parity/golden/ds_annotated_mode.npz differ
diff --git a/tests/parity/golden/ds_annotated_spliced.npz b/tests/parity/golden/ds_annotated_spliced.npz
new file mode 100644
index 00000000..36725f7c
Binary files /dev/null and b/tests/parity/golden/ds_annotated_spliced.npz differ
diff --git a/tests/parity/golden/ds_haplotypes_mode.npz b/tests/parity/golden/ds_haplotypes_mode.npz
new file mode 100644
index 00000000..b4baa2d7
Binary files /dev/null and b/tests/parity/golden/ds_haplotypes_mode.npz differ
diff --git a/tests/parity/golden/ds_haps_fixed_len.npz b/tests/parity/golden/ds_haps_fixed_len.npz
new file mode 100644
index 00000000..11199527
Binary files /dev/null and b/tests/parity/golden/ds_haps_fixed_len.npz differ
diff --git a/tests/parity/golden/ds_haps_tracks_Constant.npz b/tests/parity/golden/ds_haps_tracks_Constant.npz
new file mode 100644
index 00000000..36a3bfb3
Binary files /dev/null and b/tests/parity/golden/ds_haps_tracks_Constant.npz differ
diff --git a/tests/parity/golden/ds_haps_tracks_FlankSample.npz b/tests/parity/golden/ds_haps_tracks_FlankSample.npz
new file mode 100644
index 00000000..d60d1057
Binary files /dev/null and b/tests/parity/golden/ds_haps_tracks_FlankSample.npz differ
diff --git a/tests/parity/golden/ds_haps_tracks_Interpolate.npz b/tests/parity/golden/ds_haps_tracks_Interpolate.npz
new file mode 100644
index 00000000..05de83a6
Binary files /dev/null and b/tests/parity/golden/ds_haps_tracks_Interpolate.npz differ
diff --git a/tests/parity/golden/ds_haps_tracks_Repeat5p.npz b/tests/parity/golden/ds_haps_tracks_Repeat5p.npz
new file mode 100644
index 00000000..b71b45c2
Binary files /dev/null and b/tests/parity/golden/ds_haps_tracks_Repeat5p.npz differ
diff --git a/tests/parity/golden/ds_haps_tracks_Repeat5pNormalized.npz b/tests/parity/golden/ds_haps_tracks_Repeat5pNormalized.npz
new file mode 100644
index 00000000..694297ee
Binary files /dev/null and b/tests/parity/golden/ds_haps_tracks_Repeat5pNormalized.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_annotated.npz b/tests/parity/golden/ds_neg_strand_annotated.npz
new file mode 100644
index 00000000..ca782c36
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_annotated.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_haplotypes.npz b/tests/parity/golden/ds_neg_strand_haplotypes.npz
new file mode 100644
index 00000000..025343de
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_haplotypes.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_haps_tracks.npz b/tests/parity/golden/ds_neg_strand_haps_tracks.npz
new file mode 100644
index 00000000..ffd1c248
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_haps_tracks.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_reference.npz b/tests/parity/golden/ds_neg_strand_reference.npz
new file mode 100644
index 00000000..a49d1275
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_reference.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_spliced_annotated.npz b/tests/parity/golden/ds_neg_strand_spliced_annotated.npz
new file mode 100644
index 00000000..a17f3a09
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_spliced_annotated.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_spliced_haplotypes.npz b/tests/parity/golden/ds_neg_strand_spliced_haplotypes.npz
new file mode 100644
index 00000000..738dbb2d
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_spliced_haplotypes.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_spliced_reference.npz b/tests/parity/golden/ds_neg_strand_spliced_reference.npz
new file mode 100644
index 00000000..49ce46de
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_spliced_reference.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_spliced_tracks.npz b/tests/parity/golden/ds_neg_strand_spliced_tracks.npz
new file mode 100644
index 00000000..7133e4ca
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_spliced_tracks.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_tracks.npz b/tests/parity/golden/ds_neg_strand_tracks.npz
new file mode 100644
index 00000000..63385649
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_tracks.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_tracks_seqs.npz b/tests/parity/golden/ds_neg_strand_tracks_seqs.npz
new file mode 100644
index 00000000..346fd149
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_tracks_seqs.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_variants.npz b/tests/parity/golden/ds_neg_strand_variants.npz
new file mode 100644
index 00000000..a46e76c9
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_variants.npz differ
diff --git a/tests/parity/golden/ds_neg_strand_variants_dummy.npz b/tests/parity/golden/ds_neg_strand_variants_dummy.npz
new file mode 100644
index 00000000..8d90454c
Binary files /dev/null and b/tests/parity/golden/ds_neg_strand_variants_dummy.npz differ
diff --git a/tests/parity/golden/ds_reference_fetch.npz b/tests/parity/golden/ds_reference_fetch.npz
new file mode 100644
index 00000000..7eab097e
Binary files /dev/null and b/tests/parity/golden/ds_reference_fetch.npz differ
diff --git a/tests/parity/golden/ds_reference_mode.npz b/tests/parity/golden/ds_reference_mode.npz
new file mode 100644
index 00000000..2e3b7fc7
Binary files /dev/null and b/tests/parity/golden/ds_reference_mode.npz differ
diff --git a/tests/parity/golden/ds_spliced_haps.npz b/tests/parity/golden/ds_spliced_haps.npz
new file mode 100644
index 00000000..622b2954
Binary files /dev/null and b/tests/parity/golden/ds_spliced_haps.npz differ
diff --git a/tests/parity/golden/ds_tracks.npz b/tests/parity/golden/ds_tracks.npz
new file mode 100644
index 00000000..85b26e27
Binary files /dev/null and b/tests/parity/golden/ds_tracks.npz differ
diff --git a/tests/parity/golden/ds_tracks_jitter.npz b/tests/parity/golden/ds_tracks_jitter.npz
new file mode 100644
index 00000000..1e369317
Binary files /dev/null and b/tests/parity/golden/ds_tracks_jitter.npz differ
diff --git a/tests/parity/golden/ds_variant_windows.npz b/tests/parity/golden/ds_variant_windows.npz
new file mode 100644
index 00000000..c7b6ee22
Binary files /dev/null and b/tests/parity/golden/ds_variant_windows.npz differ
diff --git a/tests/parity/golden/ds_variants.npz b/tests/parity/golden/ds_variants.npz
new file mode 100644
index 00000000..4d15e5ca
Binary files /dev/null and b/tests/parity/golden/ds_variants.npz differ
diff --git a/tests/parity/golden/fill_empty_fixed_f32.npz b/tests/parity/golden/fill_empty_fixed_f32.npz
new file mode 100644
index 00000000..1e2ae874
Binary files /dev/null and b/tests/parity/golden/fill_empty_fixed_f32.npz differ
diff --git a/tests/parity/golden/fill_empty_fixed_i32.npz b/tests/parity/golden/fill_empty_fixed_i32.npz
new file mode 100644
index 00000000..489986f1
Binary files /dev/null and b/tests/parity/golden/fill_empty_fixed_i32.npz differ
diff --git a/tests/parity/golden/fill_empty_scalar_f32.npz b/tests/parity/golden/fill_empty_scalar_f32.npz
new file mode 100644
index 00000000..6b48a444
Binary files /dev/null and b/tests/parity/golden/fill_empty_scalar_f32.npz differ
diff --git a/tests/parity/golden/fill_empty_scalar_i32.npz b/tests/parity/golden/fill_empty_scalar_i32.npz
new file mode 100644
index 00000000..764a8691
Binary files /dev/null and b/tests/parity/golden/fill_empty_scalar_i32.npz differ
diff --git a/tests/parity/golden/fill_empty_seq_i32.npz b/tests/parity/golden/fill_empty_seq_i32.npz
new file mode 100644
index 00000000..9ffd9675
Binary files /dev/null and b/tests/parity/golden/fill_empty_seq_i32.npz differ
diff --git a/tests/parity/golden/fill_empty_seq_u8.npz b/tests/parity/golden/fill_empty_seq_u8.npz
new file mode 100644
index 00000000..655545ed
Binary files /dev/null and b/tests/parity/golden/fill_empty_seq_u8.npz differ
diff --git a/tests/parity/golden/gather_alleles.npz b/tests/parity/golden/gather_alleles.npz
new file mode 100644
index 00000000..0e135438
Binary files /dev/null and b/tests/parity/golden/gather_alleles.npz differ
diff --git a/tests/parity/golden/gather_rows_f32.npz b/tests/parity/golden/gather_rows_f32.npz
new file mode 100644
index 00000000..5c88fe3c
Binary files /dev/null and b/tests/parity/golden/gather_rows_f32.npz differ
diff --git a/tests/parity/golden/gather_rows_i32.npz b/tests/parity/golden/gather_rows_i32.npz
new file mode 100644
index 00000000..680fedfa
Binary files /dev/null and b/tests/parity/golden/gather_rows_i32.npz differ
diff --git a/tests/parity/golden/get_diffs_sparse.npz b/tests/parity/golden/get_diffs_sparse.npz
new file mode 100644
index 00000000..a23e392c
Binary files /dev/null and b/tests/parity/golden/get_diffs_sparse.npz differ
diff --git a/tests/parity/golden/get_reference.npz b/tests/parity/golden/get_reference.npz
new file mode 100644
index 00000000..38997760
Binary files /dev/null and b/tests/parity/golden/get_reference.npz differ
diff --git a/tests/parity/golden/intervals_to_tracks.npz b/tests/parity/golden/intervals_to_tracks.npz
new file mode 100644
index 00000000..d2252b00
Binary files /dev/null and b/tests/parity/golden/intervals_to_tracks.npz differ
diff --git a/tests/parity/golden/prng_hash4.npz b/tests/parity/golden/prng_hash4.npz
new file mode 100644
index 00000000..e6bce0e4
Binary files /dev/null and b/tests/parity/golden/prng_hash4.npz differ
diff --git a/tests/parity/golden/prng_xorshift64.npz b/tests/parity/golden/prng_xorshift64.npz
new file mode 100644
index 00000000..aa3f142b
Binary files /dev/null and b/tests/parity/golden/prng_xorshift64.npz differ
diff --git a/tests/parity/golden/rc_alleles.npz b/tests/parity/golden/rc_alleles.npz
new file mode 100644
index 00000000..cc395530
Binary files /dev/null and b/tests/parity/golden/rc_alleles.npz differ
diff --git a/tests/parity/golden/reconstruct_haplotypes_from_sparse.npz b/tests/parity/golden/reconstruct_haplotypes_from_sparse.npz
new file mode 100644
index 00000000..760a72d9
Binary files /dev/null and b/tests/parity/golden/reconstruct_haplotypes_from_sparse.npz differ
diff --git a/tests/parity/golden/shift_and_realign_tracks_sparse.npz b/tests/parity/golden/shift_and_realign_tracks_sparse.npz
new file mode 100644
index 00000000..a2fee111
Binary files /dev/null and b/tests/parity/golden/shift_and_realign_tracks_sparse.npz differ
diff --git a/tests/parity/golden/tracks_to_intervals.npz b/tests/parity/golden/tracks_to_intervals.npz
new file mode 100644
index 00000000..30b9050c
Binary files /dev/null and b/tests/parity/golden/tracks_to_intervals.npz differ
diff --git a/tests/parity/strategies.py b/tests/parity/strategies.py
index 0c75eafa..583f9bd6 100644
--- a/tests/parity/strategies.py
+++ b/tests/parity/strategies.py
@@ -63,3 +63,623 @@ def intervals_to_tracks_inputs(draw):
         itv_offsets,
         out_offsets,
     )
+
+
+@st.composite
+def _sparse_geno(
+    draw, max_queries=4, max_ploidy=2, max_vars_per_group=5, max_total_unique=12
+):
+    """Shared sparse-genotype layout: returns
+    (geno_offset_idx (q,p) int64, geno_v_idxs int32, geno_offsets (n+1,) int64,
+     v_starts int32, ilens int32, q_starts int32, q_ends int32).
+    geno_offset_idx is arange so each (q,p) row maps to its own offset slice."""
+    n_unique = draw(st.integers(min_value=1, max_value=max_total_unique))
+    v_starts = np.sort(
+        draw(
+            st.lists(st.integers(0, 1000), min_size=n_unique, max_size=n_unique).map(
+                np.array
+            )
+        )
+    ).astype(np.int32)
+    ilens = np.array(
+        draw(st.lists(st.integers(-5, 5), min_size=n_unique, max_size=n_unique)),
+        dtype=np.int32,
+    )
+    n_q = draw(st.integers(1, max_queries))
+    p = draw(st.integers(1, max_ploidy))
+    n_groups = n_q * p
+    counts = [draw(st.integers(0, max_vars_per_group)) for _ in range(n_groups)]
+    v_idx_list = []
+    for c in counts:
+        # sorted variant indices within a group (reconstruction assumes sorted pos)
+        idxs = sorted(
+            draw(st.lists(st.integers(0, n_unique - 1), min_size=c, max_size=c))
+        )
+        v_idx_list.extend(idxs)
+    geno_v_idxs = np.array(v_idx_list, dtype=np.int32)
+    geno_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    geno_offset_idx = np.arange(n_groups, dtype=np.int64).reshape(n_q, p)
+    q_starts = np.array(
+        draw(st.lists(st.integers(0, 800), min_size=n_q, max_size=n_q)), np.int32
+    )
+    q_ends = (q_starts + draw(st.integers(1, 200))).astype(np.int32)
+    return (
+        geno_offset_idx,
+        geno_v_idxs,
+        geno_offsets,
+        v_starts,
+        ilens,
+        q_starts,
+        q_ends,
+    )
+
+
+@st.composite
+def get_diffs_sparse_inputs(draw):
+    (goi, gvi, goff, vstarts, ilens, qstarts, qends) = draw(_sparse_geno())
+    mode = draw(st.sampled_from(["plain", "keep", "query"]))
+    twod = draw(st.booleans())
+    offsets = goff if not twod else np.stack([goff[:-1], goff[1:]]).astype(np.int64)
+    total = int(goff[-1])
+    if mode == "plain":
+        return (goi, gvi, offsets, ilens, None, None, None, None, None)
+    if mode == "keep":
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_
+        )
+        return (goi, gvi, offsets, ilens, keep, goff.copy(), None, None, None)
+    # query mode (optionally also keep)
+    keep = None
+    keep_off = None
+    if draw(st.booleans()):
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_
+        )
+        keep_off = goff.copy()
+    return (goi, gvi, offsets, ilens, keep, keep_off, qstarts, qends, vstarts)
+
+
+@st.composite
+def choose_exonic_variants_inputs(draw):
+    (goi, gvi, goff, vstarts, ilens, qstarts, qends) = draw(_sparse_geno())
+    twod = draw(st.booleans())
+    offsets = goff if not twod else np.stack([goff[:-1], goff[1:]]).astype(np.int64)
+    return (qstarts, qends, goi, gvi, offsets, vstarts, ilens)
+
+
+@st.composite
+def gather_rows_inputs(draw, dtype=np.int32):
+    n_groups = draw(st.integers(1, 6))
+    counts = [draw(st.integers(0, 5)) for _ in range(n_groups)]
+    offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total = int(offsets[-1])
+    dt = np.dtype(dtype)
+    if np.issubdtype(dt, np.floating):
+        elements = st.floats(width=32, allow_nan=False, allow_infinity=False)
+    else:
+        elements = st.integers(0, 1000)
+    data = np.array(draw(st.lists(elements, min_size=total, max_size=total)), dt)
+    n_rows = draw(st.integers(1, 8))
+    goi = np.array(
+        draw(st.lists(st.integers(0, n_groups - 1), min_size=n_rows, max_size=n_rows)),
+        np.int64,
+    )
+    twod = draw(st.booleans())
+    off = (
+        offsets if not twod else np.stack([offsets[:-1], offsets[1:]]).astype(np.int64)
+    )
+    return (goi, off, data)
+
+
+@st.composite
+def gather_alleles_inputs(draw):
+    n_unique = draw(st.integers(1, 8))
+    lens = [draw(st.integers(0, 5)) for _ in range(n_unique)]
+    allele_offsets = np.concatenate([[0], np.cumsum(lens)]).astype(np.int64)
+    total = int(allele_offsets[-1])
+    allele_bytes = np.array(
+        draw(st.lists(st.integers(0, 255), min_size=total, max_size=total)), np.uint8
+    )
+    m = draw(st.integers(0, 10))
+    v_idxs = np.array(
+        draw(st.lists(st.integers(0, n_unique - 1), min_size=m, max_size=m)), np.int32
+    )
+    return (v_idxs, allele_bytes, allele_offsets)
+
+
+@st.composite
+def compact_keep_inputs(draw, dtype):
+    """Generate (values[dtype], row_offsets int64, keep bool) for compact_keep tests."""
+    n_rows = draw(st.integers(1, 6))
+    counts = [draw(st.integers(0, 5)) for _ in range(n_rows)]
+    row_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total = int(row_offsets[-1])
+    dt = np.dtype(dtype)
+    if np.issubdtype(dt, np.floating):
+        elements = st.floats(width=32, allow_nan=False, allow_infinity=False)
+    else:
+        elements = st.integers(0, 1000)
+    values = np.array(draw(st.lists(elements, min_size=total, max_size=total)), dt)
+    keep = np.array(
+        draw(st.lists(st.booleans(), min_size=total, max_size=total)), np.bool_
+    )
+    return (values, row_offsets, keep)
+
+
+@st.composite
+def fill_empty_scalar_inputs(draw, dtype=np.int32):
+    """Generate (data[dtype], offsets int64, fill) with at least one empty row.
+
+    Guarantees at least one row has zero count so empty-row insertion is
+    exercised on every draw.
+    """
+    n_rows = draw(st.integers(2, 6))
+    counts = [draw(st.integers(0, 5)) for _ in range(n_rows)]
+    # Force one row to be empty so the empty-fill path is always exercised.
+    empty_idx = draw(st.integers(0, n_rows - 1))
+    counts[empty_idx] = 0
+    row_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total = int(row_offsets[-1])
+    dt = np.dtype(dtype)
+    if np.issubdtype(dt, np.floating):
+        elements = st.floats(width=32, allow_nan=False, allow_infinity=False)
+        fill = draw(st.floats(width=32, allow_nan=False, allow_infinity=False))
+    else:
+        elements = st.integers(-1000, 1000)
+        fill = draw(st.integers(-1000, 1000))
+    data = np.array(draw(st.lists(elements, min_size=total, max_size=total)), dt)
+    fill_val = dt.type(fill)
+    return (data, row_offsets, fill_val)
+
+
+@st.composite
+def fill_empty_fixed_inputs(draw, dtype=np.int32):
+    """Generate (data[dtype], offsets int64, inner int, fill) with at least one
+    empty row for fill_empty_fixed tests.
+    """
+    n_rows = draw(st.integers(2, 6))
+    inner = draw(st.integers(1, 4))
+    counts = [draw(st.integers(0, 4)) for _ in range(n_rows)]
+    # Force one row to be empty.
+    empty_idx = draw(st.integers(0, n_rows - 1))
+    counts[empty_idx] = 0
+    row_offsets = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    total_vars = int(row_offsets[-1])
+    dt = np.dtype(dtype)
+    if np.issubdtype(dt, np.floating):
+        elements = st.floats(width=32, allow_nan=False, allow_infinity=False)
+        fill = draw(st.floats(width=32, allow_nan=False, allow_infinity=False))
+    else:
+        elements = st.integers(-1000, 1000)
+        fill = draw(st.integers(-1000, 1000))
+    data = np.array(
+        draw(
+            st.lists(elements, min_size=total_vars * inner, max_size=total_vars * inner)
+        ),
+        dt,
+    )
+    fill_val = dt.type(fill)
+    return (data, row_offsets, inner, fill_val)
+
+
+@st.composite
+def fill_empty_seq_inputs(draw, dtype=np.uint8):
+    """Generate (data[dtype], var_offsets int64, seq_offsets int64, dummy[dtype])
+    with at least one guaranteed empty row for fill_empty_seq tests.
+
+    Layout:
+    - var_offsets: b*p+1 boundaries over variant groups (one guaranteed empty).
+    - seq_offsets: per-variant byte/token boundaries (len = total_vars + 1).
+    - data: flat element array (len = seq_offsets[-1]).
+    - dummy: random sequence of length >= 1 in the given dtype.
+    """
+    dt = np.dtype(dtype)
+    if np.issubdtype(dt, np.unsignedinteger):
+        elements = st.integers(0, 255)
+    else:
+        elements = st.integers(-1000, 1000)
+
+    n_rows = draw(st.integers(2, 6))
+    # Number of variants per row (zero = empty row).
+    var_counts = [draw(st.integers(0, 4)) for _ in range(n_rows)]
+    # Force at least one empty row.
+    empty_idx = draw(st.integers(0, n_rows - 1))
+    var_counts[empty_idx] = 0
+    var_offsets = np.concatenate([[0], np.cumsum(var_counts)]).astype(np.int64)
+    total_vars = int(var_offsets[-1])
+
+    # Per-variant byte/token lengths.
+    var_lens = [draw(st.integers(0, 5)) for _ in range(total_vars)]
+    seq_offsets = np.concatenate([[0], np.cumsum(var_lens)]).astype(np.int64)
+    total_elems = int(seq_offsets[-1])
+    data = np.array(
+        draw(st.lists(elements, min_size=total_elems, max_size=total_elems)), dt
+    )
+
+    # Dummy sequence: length >= 1.
+    dummy_len = draw(st.integers(1, 4))
+    dummy = np.array(
+        draw(st.lists(elements, min_size=dummy_len, max_size=dummy_len)), dt
+    )
+
+    return (data, var_offsets, seq_offsets, dummy)
+
+
+@st.composite
+def tracks_to_intervals_inputs(draw):
+    """Contract-valid inputs for ``tracks_to_intervals``.
+
+    Generates (regions, tracks, track_offsets) where:
+    - regions: (n_queries, 3) int32 with (contig_idx, start, end)
+    - tracks: flat f32 ragged array, one piecewise-constant run per query
+    - track_offsets: (n_queries + 1,) int64
+
+    Exercises: multi-run queries, all-constant (1 interval), and empty queries.
+    Includes a guaranteed empty query (track_offsets[q]==track_offsets[q+1]) and
+    a guaranteed all-constant query (single run, 1 interval).
+    """
+    n_queries = draw(st.integers(min_value=3, max_value=8))
+    regions_list: list[tuple[int, int, int]] = []
+    track_lengths: list[int] = []
+    tracks_parts: list[np.ndarray] = []
+
+    for qi in range(n_queries):
+        start = draw(st.integers(min_value=0, max_value=500))
+        # Force first query to be empty, second to be all-constant
+        if qi == 0:
+            length = 0
+        elif qi == 1:
+            length = draw(st.integers(min_value=1, max_value=20))
+        else:
+            length = draw(st.integers(min_value=0, max_value=40))
+
+        regions_list.append((0, start, start + length))
+        track_lengths.append(length)
+
+        if length == 0:
+            tracks_parts.append(np.empty(0, dtype=np.float32))
+        elif qi == 1:
+            # All-constant: single run
+            val = draw(st.floats(width=32, allow_nan=False, allow_infinity=False))
+            tracks_parts.append(np.full(length, val, dtype=np.float32))
+        else:
+            # Piecewise constant with interesting RLE structure
+            # Draw run boundaries: build runs by drawing lengths
+            buf = np.empty(length, dtype=np.float32)
+            pos = 0
+            while pos < length:
+                run_len = draw(st.integers(min_value=1, max_value=max(1, length - pos)))
+                run_len = min(run_len, length - pos)
+                val = draw(
+                    st.floats(
+                        min_value=-1e3,
+                        max_value=1e3,
+                        allow_nan=False,
+                        allow_infinity=False,
+                    )
+                )
+                buf[pos : pos + run_len] = val
+                pos += run_len
+            tracks_parts.append(buf)
+
+    regions = np.array(regions_list, dtype=np.int32)
+    track_offsets = np.concatenate([[0], np.cumsum(track_lengths)]).astype(np.int64)
+    tracks = (
+        np.concatenate(tracks_parts) if tracks_parts else np.empty(0, dtype=np.float32)
+    )
+
+    return regions, tracks, track_offsets
+
+
+@st.composite
+def get_reference_inputs(draw):
+    """Generate (regions, out_offsets, reference, ref_offsets, pad_char, parallel)
+    with regions whose [start,end) windows may run off either contig edge.
+
+    Note: start is restricted to [-5, clen) so that the region overlaps the
+    contig (start < clen). The numba kernel has a pre-existing size-mismatch
+    crash when start >= clen (region entirely past contig end); that degenerate
+    case never occurs in production (BED regions are clipped to contig bounds).
+    """
+    from hypothesis.extra.numpy import arrays
+
+    n_contigs = draw(st.integers(1, 3))
+    contig_lens = [draw(st.integers(1, 40)) for _ in range(n_contigs)]
+    ref_offsets = np.concatenate([[0], np.cumsum(contig_lens)]).astype(np.int64)
+    reference = draw(
+        arrays(np.uint8, int(ref_offsets[-1]), elements=st.integers(0, 255))
+    )
+    n_regions = draw(st.integers(1, 6))
+    regions = np.empty((n_regions, 3), np.int32)
+    lengths = []
+    for i in range(n_regions):
+        c = draw(st.integers(0, n_contigs - 1))
+        clen = contig_lens[c]
+        # Restrict start < clen so the region overlaps the contig.  numba's
+        # padded_slice raises ValueError when start >= clen (region entirely
+        # past the contig end): pad_right = end - clen > out_len triggers a
+        # size-mismatch in the ndarray assignment.  Both backends fail loudly
+        # on that degenerate input, so it is outside the byte-identity domain
+        # and is intentionally not generated here.  In production, BED regions
+        # are always clipped to contig bounds, so start >= clen never occurs.
+        # Regions extending past the right edge (end > clen) are still generated.
+        start = draw(st.integers(-5, clen - 1))
+        length = draw(st.integers(0, clen + 5))
+        regions[i] = (c, start, start + length)
+        lengths.append(length)
+    out_offsets = np.concatenate([[0], np.cumsum(lengths)]).astype(np.int64)
+    pad_char = draw(st.integers(0, 255))
+    parallel = draw(st.booleans())
+    return regions, out_offsets, reference, ref_offsets, np.uint8(pad_char), parallel
+
+
+@st.composite
+def shift_and_realign_tracks_inputs(draw):  # noqa: C901
+    """Contract-valid inputs for shift_and_realign_tracks_sparse.
+
+    Returns ``(total_out_size, inputs_tuple)`` where inputs_tuple is everything
+    EXCEPT the out buffer (inserted at index 0 by the parity harness).
+
+    Exercises all five strategy IDs:
+      0 = REPEAT_5P
+      1 = REPEAT_5P_NORM
+      2 = CONSTANT
+      3 = FLANK_SAMPLE
+      4 = INTERPOLATE
+
+    Layout mirrors the numba batch driver signature:
+      out_offsets (b*p+1,), regions (b,3), shifts (b,p),
+      geno_offset_idx (b,p), geno_v_idxs, geno_offsets (2,n),
+      v_starts, ilens, tracks (ragged b*l), track_offsets (b+1),
+      params (f64), keep (optional), keep_offsets (optional),
+      strategy_id, base_seed.
+    """
+    # ── strategy ──────────────────────────────────────────────────────────────
+    strategy_id = draw(st.integers(min_value=0, max_value=4))
+    if strategy_id == 2:  # CONSTANT
+        param_val = draw(st.floats(width=64, allow_nan=False, allow_infinity=False))
+    elif strategy_id == 3:  # FLANK_SAMPLE
+        param_val = float(draw(st.integers(min_value=0, max_value=5)))
+    elif strategy_id == 4:  # INTERPOLATE — order in {1,2,3}
+        param_val = float(draw(st.integers(min_value=1, max_value=3)))
+    else:  # REPEAT_5P (0) or REPEAT_5P_NORM (1): param unused
+        param_val = 0.0
+    params = np.array([param_val], dtype=np.float64)
+
+    base_seed = np.uint64(
+        draw(st.integers(min_value=0, max_value=int(np.iinfo(np.uint64).max)))
+    )
+
+    # ── variants (SNP/ins/del mix) ─────────────────────────────────────────────
+    n_unique = draw(st.integers(min_value=1, max_value=8))
+    # v_starts sorted, in [0, 120] so they fit within track windows
+    v_starts_raw = sorted(
+        draw(st.lists(st.integers(0, 120), min_size=n_unique, max_size=n_unique))
+    )
+    v_starts = np.array(v_starts_raw, dtype=np.int32)
+    # ilens: -3..3 for del/snp/ins mix; ensure at least one each
+    ilens = np.array(
+        draw(st.lists(st.integers(-3, 3), min_size=n_unique, max_size=n_unique)),
+        dtype=np.int32,
+    )
+
+    # ── regions & tracks ─────────────────────────────────────────────────────
+    n_q = draw(st.integers(1, 4))
+    ploidy = draw(st.integers(1, 2))
+    n_groups = n_q * ploidy
+
+    # Per-query: q_start in [0, 80], region length in [4, 40]
+    q_starts = [draw(st.integers(0, 80)) for _ in range(n_q)]
+    region_lengths = [draw(st.integers(4, 40)) for _ in range(n_q)]
+
+    regions = np.empty((n_q, 3), np.int32)
+    for i in range(n_q):
+        regions[i] = (0, q_starts[i], q_starts[i] + region_lengths[i])
+
+    # Track for each query: length = region_length + extra deletion headroom
+    # We give a bit of extra ref track beyond the region so deletions can read
+    # past the region end (production contract: track is always >= region length).
+    track_lengths = [max(rl + 10, 1) for rl in region_lengths]
+    track_offsets = np.concatenate([[0], np.cumsum(track_lengths)]).astype(np.int64)
+    total_track = int(track_offsets[-1])
+    tracks = draw(
+        st.lists(
+            st.floats(
+                min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False
+            ),
+            min_size=total_track,
+            max_size=total_track,
+        ).map(lambda xs: np.array(xs, dtype=np.float32))
+    )
+
+    # ── sparse genotypes ──────────────────────────────────────────────────────
+    counts = [draw(st.integers(0, 4)) for _ in range(n_groups)]
+    geno_offsets_1d = np.concatenate([[0], np.cumsum(counts)]).astype(np.int64)
+    geno_offset_idx = np.arange(n_groups, dtype=np.int64).reshape(n_q, ploidy)
+    v_idx_list: list[int] = []
+    for c in counts:
+        idxs = sorted(
+            draw(st.lists(st.integers(0, n_unique - 1), min_size=c, max_size=c))
+        )
+        v_idx_list.extend(idxs)
+    geno_v_idxs = np.array(v_idx_list, dtype=np.int32)
+
+    # normalize geno_offsets to (2, n) form
+    geno_offsets_2d = np.stack([geno_offsets_1d[:-1], geno_offsets_1d[1:]]).astype(
+        np.int64
+    )
+
+    # ── out_offsets: (n_q * ploidy + 1,) ─────────────────────────────────────
+    # Each (query, hap) output has the same length as the region (no jitter here)
+    out_lengths = np.array(
+        [rl for rl in region_lengths for _ in range(ploidy)], dtype=np.int64
+    )
+    out_offsets = np.concatenate([[0], np.cumsum(out_lengths)]).astype(np.int64)
+    total_out = int(out_offsets[-1])
+
+    # ── shifts ────────────────────────────────────────────────────────────────
+    shifts = np.zeros((n_q, ploidy), dtype=np.int32)
+    for qi in range(n_q):
+        for h in range(ploidy):
+            shifts[qi, h] = draw(st.integers(0, max(0, region_lengths[qi] // 4)))
+
+    # ── optional keep mask ────────────────────────────────────────────────────
+    use_keep = draw(st.booleans())
+    total_v = int(geno_offsets_1d[-1])
+    if use_keep and total_v > 0:
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total_v, max_size=total_v)), np.bool_
+        )
+        keep_offsets = geno_offsets_1d.copy()
+    else:
+        keep = None
+        keep_offsets = None
+
+    inputs = (
+        out_offsets,  # (b*p+1,)
+        regions,  # (b, 3)
+        shifts,  # (b, p)
+        geno_offset_idx,  # (b, p)
+        geno_v_idxs,  # ragged variant idxs
+        geno_offsets_2d,  # (2, n)
+        v_starts,  # (n_unique,)
+        ilens,  # (n_unique,)
+        tracks,  # (total_track,) ragged
+        track_offsets,  # (b+1,)
+        params,  # (1,) f64
+        keep,  # optional bool
+        keep_offsets,  # optional i64
+        int(strategy_id),  # int
+        base_seed,  # np.uint64
+    )
+    return total_out, inputs
+
+
+@st.composite
+def reconstruct_haplotypes_inputs(draw, annotate=False):  # noqa: ARG001
+    """Contract-valid inputs for reconstruct_haplotypes_from_sparse.
+
+    Returns ``(total_out_size, inputs_tuple)`` where inputs_tuple is everything
+    EXCEPT the out buffer (inserted at index 0 by the harness). The
+    ``annotate`` parameter is accepted but unused — the test file decides whether
+    to build annotation buffers.
+    """
+    from hypothesis.extra.numpy import arrays as hp_arrays
+
+    # ── reference (1–2 contigs) ─────────────────────────────────────────────
+    # Draw reference FIRST so we can constrain variant positions to be within
+    # the contig bounds (mirrors the production contract where variants always
+    # come from VCF records within the contig).
+    n_contigs = draw(st.integers(1, 2))
+    contig_lens = [draw(st.integers(10, 80)) for _ in range(n_contigs)]
+
+    # ── variants ──────────────────────────────────────────────────────────────
+    n_unique = draw(st.integers(min_value=1, max_value=6))
+    # Constrain v_starts to [0, min_contig_len - 1] so that ref[ref_idx:v_pos]
+    # never exceeds any contig's bounds. Variants are shared across all queries
+    # (which may reference different contigs), so we must be conservative and use
+    # the shortest contig's length as the upper bound. In production, variants are
+    # always within-contig; this constraint enforces that invariant.
+    min_contig_len = min(contig_lens)
+    v_starts_raw = draw(
+        st.lists(
+            st.integers(0, min_contig_len - 1), min_size=n_unique, max_size=n_unique
+        )
+    )
+    v_starts = np.sort(np.array(v_starts_raw, dtype=np.int32))
+    ilens = np.array(
+        draw(st.lists(st.integers(-3, 3), min_size=n_unique, max_size=n_unique)),
+        dtype=np.int32,
+    )
+    # atomized: alt_len = max(1, 1 + ilen)
+    alt_lens = np.maximum(1, 1 + ilens).astype(np.int64)
+    alt_offsets = np.concatenate([[np.int64(0)], np.cumsum(alt_lens)]).astype(np.int64)
+    total_alt = int(alt_offsets[-1])
+    alt_alleles = draw(hp_arrays(np.uint8, total_alt, elements=st.integers(65, 90)))
+    ref_offsets = np.concatenate([[np.int64(0)], np.cumsum(contig_lens)]).astype(
+        np.int64
+    )
+    reference = draw(
+        hp_arrays(np.uint8, int(ref_offsets[-1]), elements=st.integers(65, 90))
+    )
+
+    # ── sparse genotypes ──────────────────────────────────────────────────────
+    n_q = draw(st.integers(1, 3))
+    ploidy = draw(st.integers(1, 2))
+    n_groups = n_q * ploidy
+    counts = [draw(st.integers(0, 4)) for _ in range(n_groups)]
+    geno_offsets_1d = np.concatenate([[np.int64(0)], np.cumsum(counts)]).astype(
+        np.int64
+    )
+    geno_offset_idx = np.arange(n_groups, dtype=np.int64).reshape(n_q, ploidy)
+    v_idx_list: list[int] = []
+    for c in counts:
+        idxs = sorted(
+            draw(st.lists(st.integers(0, n_unique - 1), min_size=c, max_size=c))
+        )
+        v_idx_list.extend(idxs)
+    geno_v_idxs = np.array(v_idx_list, dtype=np.int32)
+
+    # ── regions: (contig_idx, start, end) ────────────────────────────────────
+    regions = np.empty((n_q, 3), np.int32)
+    region_lengths: list[int] = []
+    for i in range(n_q):
+        c = draw(st.integers(0, n_contigs - 1))
+        clen = contig_lens[c]
+        start = draw(st.integers(0, max(0, clen - 1)))
+        length = draw(st.integers(1, min(40, clen - start + 5)))
+        regions[i] = (c, start, start + length)
+        region_lengths.append(length)
+
+    # ── out_offsets: (n_q * ploidy + 1,) ─────────────────────────────────────
+    out_lengths_mat = np.array(region_lengths, dtype=np.int64)[:, None] * np.ones(
+        ploidy, dtype=np.int64
+    )  # (n_q, ploidy)
+    out_offsets = np.concatenate(
+        [np.array([np.int64(0)]), np.cumsum(out_lengths_mat.ravel())]
+    ).astype(np.int64)
+    total_out = int(out_offsets[-1])
+
+    # ── shifts ────────────────────────────────────────────────────────────────
+    shifts = np.zeros((n_q, ploidy), dtype=np.int32)
+    for qi in range(n_q):
+        for h in range(ploidy):
+            shifts[qi, h] = draw(st.integers(0, max(0, region_lengths[qi] // 4)))
+
+    # ── optional keep mask ────────────────────────────────────────────────────
+    use_keep = draw(st.booleans())
+    total_v = int(geno_offsets_1d[-1])
+    if use_keep and total_v > 0:
+        keep = np.array(
+            draw(st.lists(st.booleans(), min_size=total_v, max_size=total_v)), np.bool_
+        )
+        keep_offsets = geno_offsets_1d.copy()
+    else:
+        keep = None
+        keep_offsets = None
+
+    # normalize geno_offsets to (2, n) form (the registered backends accept this)
+    geno_offsets_2d = np.stack([geno_offsets_1d[:-1], geno_offsets_1d[1:]]).astype(
+        np.int64
+    )
+
+    inputs = (
+        out_offsets,
+        regions,
+        shifts,
+        geno_offset_idx,
+        geno_offsets_2d,
+        geno_v_idxs,
+        v_starts,
+        ilens,
+        alt_alleles,
+        alt_offsets,
+        reference,
+        ref_offsets,
+        np.uint8(78),  # pad_char = ord('N')
+        keep,
+        keep_offsets,
+        None,  # annot_v_idxs — caller fills for annotated path
+        None,  # annot_ref_pos — caller fills for annotated path
+    )
+    return total_out, inputs
diff --git a/tests/parity/test_annotated_spliced_haplotypes_parity.py b/tests/parity/test_annotated_spliced_haplotypes_parity.py
new file mode 100644
index 00000000..6a0616a3
--- /dev/null
+++ b/tests/parity/test_annotated_spliced_haplotypes_parity.py
@@ -0,0 +1,95 @@
+"""Annotated+spliced haplotypes dataset parity backstop (fused rust entry, Phase 5 W3).
+
+Proves the fused Rust entry ``reconstruct_annotated_haplotypes_spliced_fused`` produces
+byte-identical (haps, var_idxs, ref_coords) output to the frozen golden (generated from
+the rust implementation, oracle-verified against the composed numba pipeline at gen time),
+including a negative-strand transcript that exercises the in-kernel RC triple.
+
+Asserts:
+  1. The fused entry actually fires on the rust path (spy).
+  2. All three arrays are byte-identical to the frozen golden.
+  3. RC actually changes the output (rc_neg=True vs rc_neg=False differ) — proves the
+     negative-strand transcript exercises the in-kernel RC path (non-vacuous RC coverage).
+  4. Output is non-trivial (contains non-N bases).
+"""
+
+from __future__ import annotations
+
+from dataclasses import replace
+
+import numpy as np
+import polars as pl
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._haps as _haps_mod
+from genvarloader._ragged import RaggedAnnotatedHaps
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_annotated_spliced_haplotypes_parity(phased_svar_gvl, reference, monkeypatch):
+    # --- open in annotated mode, build a spliced dataset with mixed strands inline ---
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("annotated").with_tracks(False)
+
+    n = 4
+    # Group regions 0+1 -> T1 (+ strand), 2+3 -> T2 (- strand). The '-' transcript
+    # exercises the in-kernel RC triple (rc bytes + reverse var_idxs/ref_coords).
+    sub_bed = ds._full_bed[:n].with_columns(
+        pl.Series("transcript_id", ["T1", "T1", "T2", "T2"]),
+        pl.Series("strand", ["+", "+", "-", "-"]),
+    )
+    assert (sub_bed["strand"] == "-").any(), "need a '-' transcript to cover RC"
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced, "Dataset should be in spliced mode"
+
+    # --- spy on the fused annotated-spliced entry ---
+    orig = getattr(_haps_mod, "reconstruct_annotated_haplotypes_spliced_fused", None)
+    assert orig is not None, (
+        "reconstruct_annotated_haplotypes_spliced_fused not found on _haps_mod — "
+        "ensure it is imported at module level in _haps.py"
+    )
+    calls = {"n": 0}
+
+    def _spy(*a, **k):
+        calls["n"] += 1
+        return orig(*a, **k)
+
+    monkeypatch.setattr(
+        _haps_mod, "reconstruct_annotated_haplotypes_spliced_fused", _spy
+    )
+
+    # --- read (default rust backend, spy active) ---
+    out = ds[:, :]
+    rust_calls = calls["n"]
+
+    assert rust_calls > 0, (
+        "reconstruct_annotated_haplotypes_spliced_fused was NEVER invoked on the "
+        "read — the backstop is vacuous. Ensure _haps._reconstruct_annotated_haplotypes "
+        "calls it on the splice path."
+    )
+
+    assert isinstance(out, RaggedAnnotatedHaps), type(out)
+
+    # --- non-trivial output ---
+    data_u8 = np.asarray(out.haps.data).view(np.uint8)
+    assert data_u8.size > 0 and np.any(data_u8 != np.uint8(ord("N"))), (
+        "annotated-spliced output is empty or all-N padding — comparison is vacuous."
+    )
+
+    # --- RC non-vacuity: rc_neg flips the '-' transcript output (rust backend) ---
+    out_norc = ds.with_settings(rc_neg=False)[:, :]
+    assert not np.array_equal(
+        np.asarray(out.haps.data), np.asarray(out_norc.haps.data)
+    ), (
+        "RC made no difference — the negative-strand transcript is not exercising the "
+        "in-kernel RC path (check strand propagation / rc_neg default)."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_annotated_spliced")
+    )
diff --git a/tests/parity/test_assemble_variant_buffers_parity.py b/tests/parity/test_assemble_variant_buffers_parity.py
new file mode 100644
index 00000000..5bf2bb10
--- /dev/null
+++ b/tests/parity/test_assemble_variant_buffers_parity.py
@@ -0,0 +1,21 @@
+"""assemble_variant_buffers: rust vs frozen golden (oracle frozen Phase 5 W5).
+
+All parametrised cases (windows mode matrix, variants mode matrix, empty selection)
+are now replayed from the frozen golden generated by generate_goldens.py and
+cross-checked against numba at generation time.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_assemble_variant_buffers_golden():
+    """Rust assemble_variant_buffers must equal the frozen golden for all mode combinations."""
+    cases = _golden.load_golden("assemble_variant_buffers")
+    assert cases, "empty golden"
+    _golden.replay_dict("assemble_variant_buffers", cases)
diff --git a/tests/parity/test_choose_exonic_variants_parity.py b/tests/parity/test_choose_exonic_variants_parity.py
new file mode 100644
index 00000000..3e49a9d7
--- /dev/null
+++ b/tests/parity/test_choose_exonic_variants_parity.py
@@ -0,0 +1,15 @@
+"""choose_exonic_variants: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_choose_exonic_variants_golden():
+    cases = _golden.load_golden("choose_exonic_variants")
+    assert cases, "empty golden"
+    _golden.replay_tuple("choose_exonic_variants", cases)
diff --git a/tests/parity/test_dataset_parity.py b/tests/parity/test_dataset_parity.py
index 4a07d848..6feb1fb5 100644
--- a/tests/parity/test_dataset_parity.py
+++ b/tests/parity/test_dataset_parity.py
@@ -1,7 +1,21 @@
-"""Dataset read-path parity backstop for intervals_to_tracks.
+"""Dataset read-path parity backstops for track kernels.
 
-Proves that flipping GVL_BACKEND (numba vs rust) produces byte-identical
-track output through the real Dataset.__getitem__ path.
+Covers three cases:
+
+1. ``intervals_to_tracks`` only (track-only dataset, no variants):
+   Proves that the rust backend produces output matching the frozen golden
+   through the real Dataset.__getitem__ path.
+
+2. ``shift_and_realign_tracks_sparse`` (haplotypes+tracks dataset with indels):
+   Proves that the dispatch wiring for the realignment kernel is correct
+   end-to-end, across every insertion-fill strategy.
+
+3. Strand=−1 parity backstops (Task 7 — pre-wiring safety net):
+   Proves that the rust backend produces byte-identical output matching the
+   frozen golden for datasets with mixed + and − strand regions, across all
+   five output kinds (reference, haplotypes, annotated, tracks, tracks-seqs)
+   in the UNSPLICED path, and across the four splice-capable kinds in the
+   SPLICED path.  Analytical non-vacuity tests (RC guard) are also included.
 """
 
 from __future__ import annotations
@@ -9,40 +23,26 @@
 import numpy as np
 import pytest
 
-from tests.parity._fixtures import build_track_dataset
+from tests.parity import _golden
+from tests.parity._fixtures import (
+    _JITTER_SIGNAL_PER_SAMPLE,
+    build_haps_tracks_dataset,
+    build_strand_mixed_dataset,
+    build_track_dataset,
+    build_track_dataset_jittered,
+)
 
 pytestmark = pytest.mark.parity
 
 
-def _read_track_array(
-    ds, r_idx: np.ndarray, s_idx: np.ndarray
-) -> tuple[np.ndarray, np.ndarray]:
-    """Return (data, offsets) from the RaggedTracks produced by ds[r_idx, s_idx].
-
-    Dataset.open with no reference and no variants + with_tracks("signal") returns
-    a RaggedTracks directly from __getitem__.  RaggedTracks is a Ragged[np.float32]
-    so it carries .data (flat float32 buffer) and .offsets (int64).
-    """
-    result = ds[r_idx, s_idx]
-    # result is RaggedTracks (a seqpro Ragged[np.float32]) when no seqs are configured
-    data = np.asarray(result.data, dtype=np.float32)
-    offsets = np.asarray(result.offsets, dtype=np.int64)
-    return data, offsets
-
-
 def test_track_getitem_identical_across_backends(tmp_path, monkeypatch):
-    ds_dir = build_track_dataset(tmp_path)
-
     import genvarloader as gvl
-    import genvarloader._dataset._reconstruct as _recon_mod
     import genvarloader._dataset._tracks as _tracks_mod
 
+    ds_dir = build_track_dataset(tmp_path)
     ds = gvl.Dataset.open(ds_dir)
-    # tracks-only dataset: with_tracks enables the signal track explicitly
     ds = ds.with_tracks("signal")
 
-    # Use slice(None) for both dims so Dataset uses "basic" indexing (cross-product)
-    # which returns shape (n_regions, n_samples, n_tracks, ~length).
     r_idx = slice(None)
     s_idx = slice(None)
 
@@ -56,42 +56,494 @@ def spy(*a, **k):
 
         return spy
 
-    # Patch BOTH call-site modules; the track-only path uses _tracks_mod
+    # The track-only path calls intervals_to_tracks via _tracks_mod (the
+    # haps+tracks path uses the fused intervals_and_realign_track_fused in
+    # _reconstruct, which is covered by test_fused_tracks_parity).
     monkeypatch.setattr(
         _tracks_mod, "intervals_to_tracks", _make_spy(_tracks_mod.intervals_to_tracks)
     )
-    monkeypatch.setattr(
-        _recon_mod, "intervals_to_tracks", _make_spy(_recon_mod.intervals_to_tracks)
-    )
 
-    # --- numba read ---
-    monkeypatch.setenv("GVL_BACKEND", "numba")
-    data_n, off_n = _read_track_array(ds, r_idx, s_idx)
+    # --- read (default rust backend) ---
+    result = ds[r_idx, s_idx]
 
     # Backstop guard: kernel must have been called at least once
     assert calls["n"] > 0, (
-        f"intervals_to_tracks was NEVER called during the numba read "
+        f"intervals_to_tracks was NEVER called during the read "
         f"(calls={calls['n']}) — the backstop is vacuous. "
         "Inspect the read path and confirm the track reconstructor is active."
     )
 
-    # --- rust read ---
-    monkeypatch.setenv("GVL_BACKEND", "rust")
-    data_r, off_r = _read_track_array(ds, r_idx, s_idx)
+    # Sanity: the read painted real non-zero signal
+    data = np.asarray(result.data, dtype=np.float32)
+    assert np.any(data != 0.0), (
+        "Track data is all-zero — regions may not overlap synthetic intervals. "
+        "Non-zero signal is required to prove the comparison is meaningful."
+    )
 
-    # --- byte-identical comparison ---
-    np.testing.assert_array_equal(
-        off_n, off_r, err_msg="offsets differ across backends"
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(result, _golden.load_flat_golden("ds_tracks"))
+
+
+# ---------------------------------------------------------------------------
+# max_jitter > 0 end-to-end parity + oracle (#242 regression)
+# ---------------------------------------------------------------------------
+
+
+def test_tracks_max_jitter_intervals_parity_and_oracle(tmp_path):
+    """End-to-end regression for #242: max_jitter>0 track reads match the golden
+    and the hand-computed positional oracle.
+
+    Bug #242 root cause
+    -------------------
+    ``gvl.write`` clips BigWig intervals to the jitter-expanded write window
+    ``[chromStart - max_jitter, chromEnd + max_jitter]``, so stored interval
+    starts equal ``chromStart - max_jitter``.  ``Dataset.open`` derives query
+    starts from the ORIGINAL ``chromStart`` (``input_regions.arrow``), so
+    ``itv_start - query_start = -max_jitter`` — a negative offset.
+    Fix (PR #244): both kernels now clip ``s = max(itv_start - query_start, 0)``.
+
+    Guards
+    ------
+    - **Non-vacuity**: at least one ``regions.npy[:,1]`` (stored start) is
+      strictly ``<`` the corresponding ``input_regions.arrow`` chromStart
+      (original start), proving the #242 boundary condition is exercised.
+    - **Golden replay**: output matches the frozen golden.
+    - **Positional oracle**: each individual (region, sample) track SLICE
+      exactly equals ``np.full(REGION_LEN, sample_constant)`` — catches sample
+      misordering / spatial misplacement that a count-based check would miss.
+    - **Non-triviality**: at least one output value is non-zero.
+    """
+    import polars as pl
+
+    import genvarloader as gvl
+
+    MAX_JITTER = 4
+    REGION_LEN = 20  # chromEnd - chromStart for every fixture region
+    N_REGIONS = 3
+    N_SAMPLES = 3  # s0, s1, s2
+
+    ds_dir = build_track_dataset_jittered(tmp_path, max_jitter=MAX_JITTER)
+
+    # --- Non-vacuity guard: stored start < original chromStart (#242 condition) ---
+    regions = np.load(ds_dir / "regions.npy")  # shape (N_REGIONS, 4), int32
+    input_bed = pl.read_ipc(ds_dir / "input_regions.arrow")
+    r_idx_map = input_bed["r_idx_map"].to_numpy()  # original_row → sorted_pos
+    orig_starts = input_bed["chromStart"].to_numpy()
+    stored_starts_aligned = regions[r_idx_map, 1]  # stored starts per original row
+    assert np.any(stored_starts_aligned < orig_starts), (
+        "Non-vacuity guard FAILED: no stored region start is < the original chromStart. "
+        f"stored (aligned)={stored_starts_aligned.tolist()}, orig={orig_starts.tolist()}. "
+        "The max_jitter expansion is not exercising the #242 boundary condition."
     )
-    assert data_n.dtype == data_r.dtype == np.float32, (
-        f"dtype mismatch: numba={data_n.dtype}, rust={data_r.dtype}"
+
+    # --- Open dataset ---
+    ds = gvl.Dataset.open(ds_dir)
+    ds = ds.with_tracks("signal")
+    assert ds.jitter == 0, (
+        f"Expected ds.jitter == 0 after Dataset.open (deterministic default), "
+        f"got {ds.jitter}."
     )
-    np.testing.assert_array_equal(
-        data_n, data_r, err_msg="track data differs across backends"
+
+    # --- Read (default rust backend) ---
+    result = ds[:, :]
+    tracks_t = result[1] if isinstance(result, tuple) else result
+    data = np.asarray(tracks_t.data, dtype=np.float32)
+    off = np.asarray(tracks_t.offsets, dtype=np.int64)
+
+    # --- Golden replay ---
+    _golden.assert_output_matches_golden(
+        result, _golden.load_flat_golden("ds_tracks_jitter")
     )
 
-    # Sanity: the read painted real non-zero signal (not an all-zero vacuous match)
-    assert np.any(data_n != 0.0), (
-        "Track data is all-zero — regions may not overlap synthetic intervals. "
-        "Non-zero signal is required to prove the comparison is meaningful."
+    # --- Positional, hand-computed oracle ---
+    sample_consts = [np.float32(v) for v in _JITTER_SIGNAL_PER_SAMPLE.values()]
+    assert off.size - 1 == N_REGIONS * N_SAMPLES, (
+        f"Expected {N_REGIONS * N_SAMPLES} track rows, got {off.size - 1}; "
+        "the (region, sample) layout assumption is wrong."
+    )
+    for region in range(N_REGIONS):
+        for sample in range(N_SAMPLES):
+            row = region * N_SAMPLES + sample
+            seg = data[off[row] : off[row + 1]]
+            expected = np.full(REGION_LEN, sample_consts[sample], dtype=np.float32)
+            np.testing.assert_array_equal(
+                seg,
+                expected,
+                err_msg=(
+                    f"Positional oracle mismatch at region {region}, sample "
+                    f"{sample} (row {row}): expected constant "
+                    f"{sample_consts[sample]} over {REGION_LEN} positions."
+                ),
+            )
+
+    total_expected = N_REGIONS * N_SAMPLES * REGION_LEN  # 3 × 3 × 20 = 180
+    assert data.size == total_expected, (
+        f"Output data size {data.size} != expected {total_expected} "
+        f"({N_REGIONS} regions × {N_SAMPLES} samples × {REGION_LEN} positions)."
+    )
+
+    # --- Non-triviality ---
+    assert np.any(data != 0.0), (
+        "All track values are 0.0 — constant BigWig signal is not reaching the output."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Haplotypes+tracks realignment backstop
+# ---------------------------------------------------------------------------
+
+
+def test_tracks_realign_getitem_identical_across_backends(
+    synthetic_case, tmp_path, monkeypatch
+):
+    """Spy-guarded backstop for tracks realignment dispatch wiring (Task 11/14).
+
+    Proves that materialising a haplotypes+tracks dataset (with indel-bearing
+    genotypes) via ``ds[:, :]`` produces output matching the frozen golden,
+    for every insertion-fill strategy.
+
+    After Task 14, the Rust path calls the fused entry
+    ``intervals_and_realign_track_fused`` (one FFI crossing per track).
+    The spy targets this entry.
+    """
+    import genvarloader as gvl
+    import genvarloader._dataset._reconstruct as _recon_mod
+    from genvarloader._dataset._insertion_fill import (
+        Constant,
+        FlankSample,
+        Interpolate,
+        Repeat5p,
+        Repeat5pNormalized,
+    )
+
+    ds_dir = build_haps_tracks_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds_base = gvl.Dataset.open(ds_dir, reference=ref)
+    ds_base = ds_base.with_seqs("haplotypes").with_tracks("signal")
+
+    orig_fused = getattr(_recon_mod, "intervals_and_realign_track_fused", None)
+    assert orig_fused is not None, (
+        "intervals_and_realign_track_fused not found on _recon_mod — "
+        "ensure it is imported at module level in _reconstruct.py"
+    )
+
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    fill_strategies = [
+        Repeat5p(),
+        Repeat5pNormalized(),
+        Constant(0.0),
+        FlankSample(flank_width=5),
+        Interpolate(order=1),
+    ]
+
+    for strategy in fill_strategies:
+        strategy_name = type(strategy).__name__
+        ds = ds_base.with_insertion_fill(strategy)
+
+        monkeypatch.setattr(_recon_mod, "intervals_and_realign_track_fused", _spy_fused)
+        calls["n"] = 0  # reset per-strategy counter
+
+        # --- read (default rust backend, spy active) ---
+        out = ds[:, :]
+
+        # Anti-vacuous guard
+        assert calls["n"] > 0, (
+            f"[{strategy_name}] intervals_and_realign_track_fused was NEVER "
+            f"invoked during the read (calls={calls['n']}) — "
+            "the backstop is vacuous. Inspect HapsTracks.__call__ to "
+            "confirm intervals_and_realign_track_fused is called on the Rust path."
+        )
+
+        # --- extract tracks for non-triviality check ---
+        _, tracks_out = out
+        data_r = np.asarray(tracks_out.data, dtype=np.float32)
+        assert data_r.size > 0, (
+            f"[{strategy_name}] Track output is empty — "
+            "regions may not overlap stored intervals."
+        )
+        assert np.any(data_r != 0.0), (
+            f"[{strategy_name}] All realigned track values are 0 — "
+            "the BigWig intervals may not overlap the stored regions, "
+            "making this comparison vacuous."
+        )
+
+        # --- replay against frozen golden ---
+        golden_name = f"ds_haps_tracks_{strategy_name}"
+        _golden.assert_output_matches_golden(out, _golden.load_flat_golden(golden_name))
+
+        # Restore original between strategies.
+        monkeypatch.setattr(_recon_mod, "intervals_and_realign_track_fused", orig_fused)
+
+
+# ---------------------------------------------------------------------------
+# variant-windows live-path spy
+# ---------------------------------------------------------------------------
+
+
+def test_assemble_variant_buffers_runs_on_live_windows_path(phased_svar_gvl, reference):
+    """The rust mega-call must actually fire on the windows __getitem__ path.
+
+    Installs a counting spy on the registered ``rust`` entry of
+    ``assemble_variant_buffers``, opens a variant-windows dataset, indexes a
+    batch, and asserts the spy was invoked at least once.
+    """
+    import genvarloader as gvl
+    import genvarloader._dataset._flat_variants  # noqa: F401 — triggers register()
+    from genvarloader import VarWindowOpt
+
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = (
+        ds.with_tracks(False)
+        .with_output_format("flat")
+        .with_seqs(
+            "variant-windows",
+            VarWindowOpt(flank_length=4, token_alphabet=b"ACGT", unknown_token=4),
+        )
+    )
+
+    spy, calls, restore = _golden.make_kernel_spy("assemble_variant_buffers")
+    try:
+        _ = ds[[0, 1], [0, 1]]
+    finally:
+        restore()
+
+    assert calls["n"] > 0, (
+        "assemble_variant_buffers was NEVER invoked on the live variant-windows "
+        f"__getitem__ path (calls={calls['n']}) — the backstop is vacuous. "
+        "Inspect get_variants_flat to confirm the kernel is called on the windows branch."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strand=−1 parity backstops (Task 7 — pre-wiring safety net)
+# ---------------------------------------------------------------------------
+
+_SPLICE_TRANSCRIPT_IDS = ["T1", "T2", "T3", "T3", "T4"]
+_NEG_TRANSCRIPT_IDX = 1
+
+
+def _open_strand_spliced(ds_dir, ref, kind: str):
+    """Open the strand-mixed dataset in spliced mode for ``kind``."""
+    from dataclasses import replace
+
+    import polars as pl
+
+    import genvarloader as gvl
+
+    if kind == "tracks":
+        ds = gvl.Dataset.open(ds_dir)
+        ds = ds.with_seqs(None).with_tracks("signal")
+    else:
+        ds = gvl.Dataset.open(ds_dir, reference=ref)
+        ds = ds.with_seqs(kind).with_tracks(False)  # type: ignore[arg-type]
+
+    sub_bed = ds._full_bed.with_columns(
+        pl.Series("transcript_id", _SPLICE_TRANSCRIPT_IDS)
+    )
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced, f"[{kind}] dataset should be in spliced mode"
+    return ds
+
+
+@pytest.mark.parametrize(
+    "kind",
+    ["reference", "haplotypes", "annotated", "tracks", "tracks-seqs", "haps-tracks"],
+)
+def test_neg_strand_parity(kind, tmp_path, synthetic_case):
+    """Mixed +/− strand regions produce output matching the frozen golden.
+
+    Covers six output kinds over a fresh variants+tracks+strand dataset with
+    ``max_jitter=0``.
+    """
+    import genvarloader as gvl
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+
+    if kind == "tracks":
+        ds = gvl.Dataset.open(ds_dir)
+        ds = ds.with_seqs(None).with_tracks("signal")
+    elif kind == "tracks-seqs":
+        ds = gvl.Dataset.open(ds_dir, reference=ref)
+        ds = ds.with_seqs("reference").with_tracks("signal")
+    elif kind == "haps-tracks":
+        ds = gvl.Dataset.open(ds_dir, reference=ref)
+        ds = ds.with_seqs("haplotypes").with_tracks("signal")
+    else:
+        ds = gvl.Dataset.open(ds_dir, reference=ref)
+        ds = ds.with_seqs(kind).with_tracks(False)  # type: ignore[arg-type]
+
+    # Non-vacuity guard: fixture must have -strand regions.
+    neg_mask = ds._full_regions[:, 3] == -1
+    assert np.any(neg_mask), (
+        f"[{kind}] Fixture has no -strand regions; parity test is vacuous."
+    )
+
+    # --- read (default rust backend) ---
+    out = ds[:, :]
+
+    # --- replay against frozen golden ---
+    safe_kind = kind.replace("-", "_")
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden(f"ds_neg_strand_{safe_kind}")
+    )
+
+
+def test_negative_strand_actually_reverse_complements(tmp_path, synthetic_case):
+    """Non-vacuity: a −strand region's bytes differ from the forward-oriented
+    bytes AND equal the exact reverse-complement.
+    """
+    import genvarloader as gvl
+    from seqpro.rag import reverse_complement
+
+    from genvarloader._ragged import _COMP
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+
+    ds = gvl.Dataset.open(ds_dir, reference=ref)
+    ds = ds.with_seqs("reference").with_tracks(False)
+
+    neg_mask = ds._full_regions[:, 3] == -1
+    assert np.any(neg_mask), (
+        "No -strand regions in fixture; non-vacuity test is vacuous."
+    )
+    neg_idx = int(np.where(neg_mask)[0][0])  # first -strand region (index 1)
+
+    # Forward-oriented reference at the -strand region (RC disabled).
+    ds_fwd = ds.with_settings(rc_neg=False)
+    fwd = ds_fwd[neg_idx, 0]  # Ragged[S1], shape (None,)
+
+    # RC-applied output (rc_neg=True by default).
+    out = ds[neg_idx, 0]  # Ragged[S1], shape (None,)
+
+    fwd_bytes = np.asarray(fwd.data).tobytes()
+    out_bytes = np.asarray(out.data).tobytes()
+
+    mask = np.array([True], dtype=bool)
+    rc_fwd = reverse_complement(fwd, _COMP, mask=mask, copy=True)
+    rc_fwd_bytes = np.asarray(rc_fwd.data).tobytes()
+
+    # Self-check: the anchor region must be non-palindromic.
+    assert fwd_bytes != rc_fwd_bytes, (
+        f"Anchor -strand region {neg_idx} is palindromic (fwd == rc(fwd)) — "
+        "non-vacuity Guard 1 is unreliable; pick a different anchor region."
+    )
+
+    # Guard 1: RC must have changed bytes.
+    assert out_bytes != fwd_bytes, (
+        f"RC had NO effect on -strand region {neg_idx}: output is byte-identical "
+        "to the forward-oriented sequence.  The region may be a palindrome, or "
+        "rc_neg=True is not being applied on the read path."
+    )
+
+    # Guard 2: output must equal the exact reverse-complement of the forward seq.
+    assert out_bytes == rc_fwd_bytes, (
+        f"Output for -strand region {neg_idx} is NOT the exact reverse-complement "
+        "of the forward-oriented sequence.\n"
+        "  forward : "
+        f"{bytes(np.asarray(fwd.data).view(np.uint8)).decode('ascii')!r}\n"
+        "  rc(fwd) : "
+        f"{bytes(np.asarray(rc_fwd.data).view(np.uint8)).decode('ascii')!r}\n"
+        "  output  : "
+        f"{bytes(np.asarray(out.data).view(np.uint8)).decode('ascii')!r}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strand=−1 SPLICED parity backstops (Task 7 — pre-wiring safety net)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "kind",
+    ["reference", "haplotypes", "annotated", "tracks"],
+)
+def test_neg_strand_spliced_parity(kind, tmp_path, synthetic_case):
+    """Spliced mixed +/− strand transcripts: output matches the frozen golden.
+
+    Covers the four splice-capable output kinds (reference, haplotypes,
+    annotated, tracks).
+    """
+    import genvarloader as gvl
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = _open_strand_spliced(ds_dir, ref, kind)
+
+    # The negative-strand anchor transcript (T2) must really be -strand.
+    neg_transcript = ds.spliced_regions[_NEG_TRANSCRIPT_IDX]
+    assert "-" in neg_transcript["strand"].item(0), (
+        f"[{kind}] anchor transcript is not negative-strand; test is vacuous."
+    )
+
+    # --- read (default rust backend) ---
+    out = ds[:, :]
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden(f"ds_neg_strand_spliced_{kind}")
+    )
+
+
+def test_negative_strand_spliced_reverse_complements(tmp_path, synthetic_case):
+    """Non-vacuity for the spliced path: a −strand transcript's bytes differ
+    from the forward-oriented bytes AND equal the exact reverse-complement.
+    """
+    import genvarloader as gvl
+    from seqpro.rag import reverse_complement
+
+    from genvarloader._ragged import _COMP
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = _open_strand_spliced(ds_dir, ref, "reference")
+
+    t_idx = _NEG_TRANSCRIPT_IDX
+    assert "-" in ds.spliced_regions[t_idx]["strand"].item(0), (
+        "Anchor spliced transcript is not negative-strand; test is vacuous."
+    )
+
+    # Forward-oriented spliced transcript (RC disabled).
+    ds_fwd = ds.with_settings(rc_neg=False)
+    fwd = ds_fwd[t_idx, 0]  # Ragged[S1], shape (None,)
+
+    # RC-applied spliced transcript (rc_neg=True by default).
+    out = ds[t_idx, 0]  # Ragged[S1], shape (None,)
+
+    fwd_bytes = np.asarray(fwd.data).tobytes()
+    out_bytes = np.asarray(out.data).tobytes()
+
+    mask = np.array([True], dtype=bool)
+    rc_fwd = reverse_complement(fwd, _COMP, mask=mask, copy=True)
+    rc_fwd_bytes = np.asarray(rc_fwd.data).tobytes()
+
+    # Self-check: anchor transcript must be non-palindromic.
+    assert fwd_bytes != rc_fwd_bytes, (
+        f"Anchor spliced transcript {t_idx} is palindromic (fwd == rc(fwd)) — "
+        "non-vacuity Guard 1 is unreliable; pick a different anchor transcript."
+    )
+
+    # Guard 1: RC must have changed bytes.
+    assert out_bytes != fwd_bytes, (
+        f"RC had NO effect on spliced -strand transcript {t_idx}: output is "
+        "byte-identical to the forward-oriented sequence.  rc_neg=True may not "
+        "be applied on the spliced read path."
+    )
+
+    # Guard 2: output must equal the exact reverse-complement of the forward seq.
+    assert out_bytes == rc_fwd_bytes, (
+        f"Output for spliced -strand transcript {t_idx} is NOT the exact "
+        "reverse-complement of the forward-oriented sequence.\n"
+        "  forward : "
+        f"{bytes(np.asarray(fwd.data).view(np.uint8)).decode('ascii')!r}\n"
+        "  rc(fwd) : "
+        f"{bytes(np.asarray(rc_fwd.data).view(np.uint8)).decode('ascii')!r}\n"
+        "  output  : "
+        f"{bytes(np.asarray(out.data).view(np.uint8)).decode('ascii')!r}"
     )
diff --git a/tests/parity/test_flat_variants_parity.py b/tests/parity/test_flat_variants_parity.py
new file mode 100644
index 00000000..47862bcb
--- /dev/null
+++ b/tests/parity/test_flat_variants_parity.py
@@ -0,0 +1,199 @@
+"""flat_variants kernels: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from genvarloader._dataset._flat_variants import (
+    _compact_keep,
+    _fill_empty_fixed,
+    _fill_empty_scalar,
+    _fill_empty_seq,
+    _gather_rows,
+)
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+# ---------------------------------------------------------------------------
+# Golden replay tests (one per golden name)
+# ---------------------------------------------------------------------------
+
+
+def test_gather_rows_i32_golden():
+    cases = _golden.load_golden("gather_rows_i32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("gather_rows_i32", cases)
+
+
+def test_gather_rows_f32_golden():
+    cases = _golden.load_golden("gather_rows_f32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("gather_rows_f32", cases)
+
+
+def test_gather_alleles_golden():
+    cases = _golden.load_golden("gather_alleles")
+    assert cases, "empty golden"
+    _golden.replay_tuple("gather_alleles", cases)
+
+
+def test_compact_keep_i32_golden():
+    cases = _golden.load_golden("compact_keep_i32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("compact_keep_i32", cases)
+
+
+def test_compact_keep_f32_golden():
+    cases = _golden.load_golden("compact_keep_f32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("compact_keep_f32", cases)
+
+
+def test_fill_empty_scalar_i32_golden():
+    cases = _golden.load_golden("fill_empty_scalar_i32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_scalar_i32", cases)
+
+
+def test_fill_empty_scalar_f32_golden():
+    cases = _golden.load_golden("fill_empty_scalar_f32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_scalar_f32", cases)
+
+
+def test_fill_empty_fixed_i32_golden():
+    cases = _golden.load_golden("fill_empty_fixed_i32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_fixed_i32", cases)
+
+
+def test_fill_empty_fixed_f32_golden():
+    cases = _golden.load_golden("fill_empty_fixed_f32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_fixed_f32", cases)
+
+
+def test_fill_empty_seq_u8_golden():
+    cases = _golden.load_golden("fill_empty_seq_u8")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_seq_u8", cases)
+
+
+def test_fill_empty_seq_i32_golden():
+    cases = _golden.load_golden("fill_empty_seq_i32")
+    assert cases, "empty golden"
+    _golden.replay_tuple("fill_empty_seq_i32", cases)
+
+
+# ---------------------------------------------------------------------------
+# Dtype regression tests (no hypothesis, no dispatch)
+# ---------------------------------------------------------------------------
+
+
+def test_gather_rows_dtype_regression():
+    """_gather_rows must preserve dtype and values — no silent down-cast."""
+    # float32 case: the original corruption (0.25 -> 0 as int32)
+    goi = np.array([0], np.intp)
+    offsets = np.array([0, 2], np.int64)
+    data_f32 = np.array([0.25, 0.75], np.float32)
+    out_f32, off_f32 = _gather_rows(goi, offsets, data_f32)
+    assert out_f32.dtype == np.float32, f"Expected float32, got {out_f32.dtype}"
+    np.testing.assert_array_equal(out_f32, np.array([0.25, 0.75], np.float32))
+    assert off_f32.tolist() == [0, 2]
+
+    # int64 case: arbitrary "other" dtype must not be coerced to int32
+    data_i64 = np.array([100_000_000, 200_000_000], np.int64)
+    out_i64, off_i64 = _gather_rows(goi, offsets, data_i64)
+    assert out_i64.dtype == np.int64, f"Expected int64, got {out_i64.dtype}"
+    np.testing.assert_array_equal(out_i64, data_i64)
+    assert off_i64.tolist() == [0, 2]
+
+
+def test_compact_keep_dtype_regression():
+    """_compact_keep must preserve dtype without down-casting.
+
+    The i32/f32 Rust cores handle those two dtypes. All other dtypes (e.g.
+    int16, int64 for custom FORMAT fields, issue #231) must round-trip via the
+    numba fallback with the exact same dtype and values.
+    """
+    row_offsets = np.array([0, 2, 3], np.int64)
+    keep = np.array([True, False, True], np.bool_)
+
+    # int16: should NOT be widened to int32
+    vals_i16 = np.array([10, 20, 30], np.int16)
+    out_i16, off_i16 = _compact_keep(vals_i16, row_offsets, keep)
+    assert out_i16.dtype == np.int16, f"Expected int16, got {out_i16.dtype}"
+    np.testing.assert_array_equal(out_i16, np.array([10, 30], np.int16))
+    assert off_i16.tolist() == [0, 1, 2]
+
+    # int64: should NOT be narrowed to int32
+    vals_i64 = np.array([100_000_000_000, 200_000_000_000, 300_000_000_000], np.int64)
+    out_i64, off_i64 = _compact_keep(vals_i64, row_offsets, keep)
+    assert out_i64.dtype == np.int64, f"Expected int64, got {out_i64.dtype}"
+    np.testing.assert_array_equal(
+        out_i64, np.array([100_000_000_000, 300_000_000_000], np.int64)
+    )
+    assert off_i64.tolist() == [0, 1, 2]
+
+
+def test_fill_empty_scalar_dtype_regression():
+    """_fill_empty_scalar must preserve dtype — no down-cast for non-i32/f32.
+
+    int16 is a representative custom FORMAT field dtype (issue #231).
+    The empty row's fill slot must carry the int16 fill value exactly.
+    """
+    # offsets: 3 rows with middle row empty → [0, 2, 2, 3]
+    data = np.array([10, 20, 30], np.int16)
+    offsets = np.array([0, 2, 2, 3], np.int64)
+    fill = np.int16(99)
+    out, new_off = _fill_empty_scalar(data, offsets, fill)
+    assert out.dtype == np.int16, f"Expected int16, got {out.dtype}"
+    np.testing.assert_array_equal(out, np.array([10, 20, 99, 30], np.int16))
+    assert new_off.tolist() == [0, 2, 3, 4]
+
+
+def test_fill_empty_fixed_dtype_regression():
+    """_fill_empty_fixed must preserve dtype — no down-cast for non-i32/f32.
+
+    int16 is representative of custom FORMAT flank tokens (issue #231).
+    The empty row's `inner` fill slots must carry the int16 fill value exactly.
+    """
+    # 2 rows: offsets [0,1,1], inner=2 — second row empty.
+    data = np.array([7, 8], np.int16)  # 1 var * 2 inner
+    offsets = np.array([0, 1, 1], np.int64)
+    fill = np.int16(42)
+    out, new_off = _fill_empty_fixed(data, offsets, 2, fill)
+    assert out.dtype == np.int16, f"Expected int16, got {out.dtype}"
+    np.testing.assert_array_equal(out, np.array([7, 8, 42, 42], np.int16))
+    assert new_off.tolist() == [0, 1, 2]
+
+
+def test_fill_empty_seq_dtype_regression():
+    """_fill_empty_seq must preserve dtype for int32 token windows.
+
+    A single uint8-only Rust core would silently corrupt int32 token values
+    (e.g. token 999 → 0xE7 = 231 when truncated to uint8).
+    This test verifies that int32 token windows round-trip exactly through
+    the dispatch wrapper, including the dummy token in the empty slot.
+    """
+    # 2 rows: var_offsets [0,0,2] — row 0 is empty.
+    # Row 1: 2 variants with tokens [100, 200] and [300].
+    # seq_offsets: [0,2,3].
+    # dummy int32 token = 999 (> 255 — would be corrupted if truncated to uint8).
+    data = np.array([100, 200, 300], np.int32)
+    var_offsets = np.array([0, 0, 2], np.int64)
+    seq_offsets = np.array([0, 2, 3], np.int64)
+    dummy = np.array([999], np.int32)
+
+    nd, nvar, nseq = _fill_empty_seq(data, var_offsets, seq_offsets, dummy)
+
+    assert nd.dtype == np.int32, f"Expected int32, got {nd.dtype}"
+    # new_var: row 0 empty→1 dummy, row 1 has 2 vars → [0, 1, 3]
+    assert nvar.tolist() == [0, 1, 3], f"new_var wrong: {nvar.tolist()}"
+    # new_seq: dummy len=1, var0 len=2, var1 len=1 → [0, 1, 3, 4]
+    assert nseq.tolist() == [0, 1, 3, 4], f"new_seq wrong: {nseq.tolist()}"
+    # new_data: [999] (dummy), [100,200] (var0 tokens), [300] (var1 tokens)
+    np.testing.assert_array_equal(nd, np.array([999, 100, 200, 300], np.int32))
diff --git a/tests/parity/test_fused_haps_parity.py b/tests/parity/test_fused_haps_parity.py
new file mode 100644
index 00000000..e3f11cad
--- /dev/null
+++ b/tests/parity/test_fused_haps_parity.py
@@ -0,0 +1,157 @@
+"""Dataset-level parity backstop for the fused haplotypes __getitem__ kernel.
+
+Proves that the fused Rust entry ``reconstruct_haplotypes_fused`` (Task 13)
+produces byte-identical haplotype output to the frozen golden (generated from
+the rust implementation, oracle-verified against numba at generation time).
+
+The test asserts:
+  1. The fused entry is actually invoked on the Rust path (non-vacuity spy guard).
+  2. The Rust output is byte-identical to the frozen golden.
+  3. The output is non-trivial (contains non-N bases).
+
+Scope:
+  - Only the NON-SPLICE plain haplotypes path is fused (per task spec and
+    audit section 5d).  The splice path continues to use the existing
+    per-kernel dispatched entries.
+  - The annotated path is NOT fused in Task 13.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._haps as _haps_mod
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+# ---------------------------------------------------------------------------
+# Main parity gate — fused Rust path vs. frozen golden
+# ---------------------------------------------------------------------------
+
+
+def test_fused_haps_dataset_parity(phased_svar_gvl, reference, monkeypatch):
+    """Fused reconstruct_haplotypes_fused output matches the frozen golden.
+
+    Spy guard: we monkeypatch ``_haps_mod.reconstruct_haplotypes_fused`` to
+    count calls.  The spy must fire at least once (anti-vacuous guard).
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("haplotypes")
+
+    orig_fused = getattr(_haps_mod, "reconstruct_haplotypes_fused", None)
+    assert orig_fused is not None, (
+        "reconstruct_haplotypes_fused not found on _haps_mod — "
+        "ensure it is imported at module level in _haps.py"
+    )
+
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_fused", _spy_fused)
+
+    # --- read (default rust backend, spy active) ---
+    out = ds[:, :]
+
+    # Anti-vacuous guard: fused entry must have been invoked
+    assert calls["n"] > 0, (
+        f"reconstruct_haplotypes_fused was NEVER invoked during the read "
+        f"(calls={calls['n']}) — the backstop is vacuous. "
+        "Ensure _haps._reconstruct_haplotypes calls reconstruct_haplotypes_fused "
+        "on the non-splice path."
+    )
+
+    # --- sanity: non-trivial output ---
+    out_data = np.asarray(out.data)
+    assert out_data.size > 0, (
+        "Haplotypes output contains zero bytes — regions don't overlap any "
+        "reference sequence.  The parity comparison is vacuous."
+    )
+    n_pad = np.uint8(ord("N"))
+    data_u8 = out_data.view(np.uint8)
+    assert np.any(data_u8 != n_pad), (
+        "Haplotypes output is entirely 'N' padding — non-padding bases are "
+        "required to prove the comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_haplotypes_mode")
+    )
+
+
+# ---------------------------------------------------------------------------
+# Fixed-length parity gate — exercises the output_length >= 0 fused branch
+# ---------------------------------------------------------------------------
+
+
+def test_fused_haps_dataset_parity_fixed_length(
+    phased_svar_gvl, reference, monkeypatch
+):
+    """Fused reconstruct_haplotypes_fused (fixed-length arm) matches the frozen golden.
+
+    Requests a fixed output_length via ``Dataset.with_len(N)``.  The fused entry
+    then receives ``output_length=N`` (>= 0) rather than -1 (ragged mode).
+
+    Spy guard and non-vacuity check mirror the ragged test above.
+    The golden stores the fixed-length ndarray output.
+    """
+    FIXED_LEN = 15
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("haplotypes").with_len(FIXED_LEN)
+
+    orig_fused = getattr(_haps_mod, "reconstruct_haplotypes_fused", None)
+    assert orig_fused is not None, (
+        "reconstruct_haplotypes_fused not found on _haps_mod — "
+        "ensure it is imported at module level in _haps.py"
+    )
+
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_fused", _spy_fused)
+
+    # --- read (default rust backend, fixed-length fused path) ---
+    out = ds[:, :]
+
+    # Anti-vacuous guard
+    assert calls["n"] > 0, (
+        f"reconstruct_haplotypes_fused was NEVER invoked during the read "
+        f"(calls={calls['n']}) — the backstop is vacuous. "
+        "Ensure _haps._reconstruct_haplotypes calls reconstruct_haplotypes_fused "
+        "on the non-splice path."
+    )
+
+    # --- type + shape sanity ---
+    assert isinstance(out, np.ndarray), (
+        f"Expected ndarray from fixed-length haplotypes mode, got {type(out)}"
+    )
+    assert out.shape[-1] == FIXED_LEN, (
+        f"Expected last axis == {FIXED_LEN}, got shape {out.shape}"
+    )
+
+    # --- sanity: non-trivial output ---
+    data_u8 = out.view(np.uint8)
+    assert data_u8.size > 0, (
+        "Fixed-length haplotypes output has zero bytes — the comparison is vacuous."
+    )
+    n_pad = np.uint8(ord("N"))
+    assert np.any(data_u8 != n_pad), (
+        "Fixed-length haplotypes output is entirely 'N' padding — non-padding "
+        "bases are required to prove the comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_haps_fixed_len")
+    )
diff --git a/tests/parity/test_fused_tracks_parity.py b/tests/parity/test_fused_tracks_parity.py
new file mode 100644
index 00000000..cb53fbd5
--- /dev/null
+++ b/tests/parity/test_fused_tracks_parity.py
@@ -0,0 +1,122 @@
+"""Dataset-level parity backstop for the fused tracks __getitem__ kernel (Task 14).
+
+Proves that the fused Rust entry ``intervals_and_realign_track_fused``
+produces byte-identical track output to the frozen golden (generated from
+the rust implementation, oracle-verified against the composed numba pipeline).
+
+The test asserts:
+  1. The fused entry is actually invoked on the Rust path (non-vacuity spy guard).
+  2. The Rust output is byte-identical to the frozen golden,
+     across all 5 insertion-fill strategies.
+  3. The output is non-trivial (contains non-zero values).
+
+Scope:
+  - Only the HapsTracks path is tested (track realignment requires variants).
+  - Uses the ``max_jitter=0`` ``build_haps_tracks_dataset`` fixture (Task 11).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_fused_tracks_dataset_parity(synthetic_case, tmp_path, monkeypatch):
+    """Fused intervals_and_realign_track_fused output matches the frozen golden.
+
+    Covers all 5 insertion-fill strategies. The fused per-track entry (called
+    directly from HapsTracks.__call__ on the rust path) must produce the same
+    float32 bytes as the frozen golden.
+
+    Spy guard: we monkeypatch ``_reconstruct_mod.intervals_and_realign_track_fused``
+    to count calls. The spy must fire at least once during the read.
+    """
+    import genvarloader as gvl
+    import genvarloader._dataset._reconstruct as _reconstruct_mod
+    from genvarloader._dataset._insertion_fill import (
+        Constant,
+        FlankSample,
+        Interpolate,
+        Repeat5p,
+        Repeat5pNormalized,
+    )
+    from tests.parity._fixtures import build_haps_tracks_dataset
+
+    ds_dir = build_haps_tracks_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds_base = gvl.Dataset.open(ds_dir, reference=ref)
+    ds_base = ds_base.with_seqs("haplotypes").with_tracks("signal")
+
+    orig_fused = getattr(_reconstruct_mod, "intervals_and_realign_track_fused", None)
+    assert orig_fused is not None, (
+        "intervals_and_realign_track_fused not found on _reconstruct_mod — "
+        "ensure it is imported at module level in _reconstruct.py"
+    )
+
+    fill_strategies = [
+        Repeat5p(),
+        Repeat5pNormalized(),
+        Constant(0.0),
+        FlankSample(flank_width=5),
+        Interpolate(order=1),
+    ]
+
+    for strategy in fill_strategies:
+        strategy_name = type(strategy).__name__
+        ds = ds_base.with_insertion_fill(strategy)
+
+        # --- install spy on intervals_and_realign_track_fused ---
+        calls: dict[str, int] = {"n": 0}
+
+        def _make_spy(orig, c=calls):
+            def spy(*a, **k):
+                c["n"] += 1
+                return orig(*a, **k)
+
+            return spy
+
+        spy_fn = _make_spy(orig_fused)
+        monkeypatch.setattr(
+            _reconstruct_mod, "intervals_and_realign_track_fused", spy_fn
+        )
+
+        calls["n"] = 0  # reset per-strategy
+
+        # --- read (default rust backend, spy active) ---
+        out = ds[:, :]
+
+        # Anti-vacuous guard
+        assert calls["n"] > 0, (
+            f"[{strategy_name}] intervals_and_realign_track_fused was NEVER invoked "
+            f"during the read (calls={calls['n']}) — the backstop is "
+            "vacuous. Ensure HapsTracks.__call__ calls intervals_and_realign_track_fused "
+            "on the Rust path."
+        )
+
+        # --- extract track arrays for non-triviality check ---
+        _, tracks_out = out
+        data_r = np.asarray(tracks_out.data, dtype=np.float32)
+
+        # Non-triviality
+        assert data_r.size > 0, (
+            f"[{strategy_name}] Track output is empty — "
+            "regions may not overlap stored intervals."
+        )
+        assert np.any(data_r != 0.0), (
+            f"[{strategy_name}] All realigned track values are 0 — "
+            "the BigWig intervals may not overlap the stored regions, "
+            "making this comparison vacuous."
+        )
+
+        # --- replay against frozen golden ---
+        golden_name = f"ds_haps_tracks_{strategy_name}"
+        _golden.assert_output_matches_golden(out, _golden.load_flat_golden(golden_name))
+
+        # Restore original between strategies.
+        monkeypatch.setattr(
+            _reconstruct_mod, "intervals_and_realign_track_fused", orig_fused
+        )
diff --git a/tests/parity/test_gen_dataset_goldens.py b/tests/parity/test_gen_dataset_goldens.py
new file mode 100644
index 00000000..4e6de5f8
--- /dev/null
+++ b/tests/parity/test_gen_dataset_goldens.py
@@ -0,0 +1,391 @@
+"""Dataset-level golden generator for the parity suite.
+
+Run with GVL_GEN_GOLDENS=1 to regenerate all dataset goldens:
+
+    GVL_GEN_GOLDENS=1 pixi run -e dev pytest tests/parity/test_gen_dataset_goldens.py -q --basetemp=$(pwd)/.pytest_tmp
+
+Each test:
+  1. Builds the SAME dataset the corresponding parity test uses (identical fixtures).
+  2. Reads ds[idx] under numba then rust (GVL_BACKEND env flip — gen time only).
+  3. HARD-FAILS on any numba != rust mismatch (oracle cross-check).
+  4. Saves the rust output as a frozen golden.
+
+Normal test runs skip all tests in this file.
+
+*** DANGER (post-W5): numba was DELETED in W5, so the GVL_BACKEND flip + oracle
+cross-check (steps 2-3) no longer fire. Regenerating now would freeze rust == rust
+with no oracle — meaningless goldens. Only regenerate on a numba-PRESENT checkout
+(at or before the Stage-A snapshot). ***
+"""
+
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import polars as pl
+import pytest
+from dataclasses import replace
+
+import genvarloader as gvl
+import genvarloader._dataset._genotypes  # noqa: F401 — trigger register()
+import genvarloader._dataset._flat_variants  # noqa: F401
+import genvarloader._dataset._reference  # noqa: F401
+import genvarloader._dataset._tracks  # noqa: F401
+from genvarloader import VarWindowOpt
+
+from tests.parity import _golden
+from tests.parity._fixtures import (
+    build_haps_tracks_dataset,
+    build_strand_mixed_dataset,
+    build_track_dataset,
+    build_track_dataset_jittered,
+)
+
+pytestmark = pytest.mark.parity
+
+GEN = os.environ.get("GVL_GEN_GOLDENS") == "1"
+skip_unless_gen = pytest.mark.skipif(
+    not GEN, reason="set GVL_GEN_GOLDENS=1 to generate"
+)
+
+
+def _oracle_check(out_numba, out_rust, name: str) -> None:
+    """HARD-FAIL if numba output differs from rust output. No suppression."""
+    flat_n = _golden.flatten_output(out_numba)
+    flat_r = _golden.flatten_output(out_rust)
+    _golden._assert_flat_eq(flat_n, flat_r, f"oracle/{name}")
+
+
+def _gen(name: str, monkeypatch, build_fn):
+    """Build dataset, read under numba then rust, oracle-check, save golden."""
+    monkeypatch.setenv("GVL_BACKEND", "numba")
+    out_numba = build_fn()
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = build_fn()
+    _oracle_check(out_numba, out_rust, name)
+    _golden.save_flat_golden(name, out_rust)
+
+
+# ---------------------------------------------------------------------------
+# Haplotypes-mode (non-splice) and fused-haps — share ds_haplotypes_mode
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_haplotypes_mode(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_haplotypes_mode: phased_svar_gvl + reference, haplotypes mode."""
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference).with_seqs("haplotypes")
+    _gen("ds_haplotypes_mode", monkeypatch, lambda: ds[:, :])
+
+
+@skip_unless_gen
+def test_gen_annotated_mode(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_annotated_mode: annotated mode."""
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference).with_seqs("annotated")
+    _gen("ds_annotated_mode", monkeypatch, lambda: ds[:, :])
+
+
+@skip_unless_gen
+def test_gen_haps_fixed_len(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_haps_fixed_len: haplotypes mode with with_len(15)."""
+    FIXED_LEN = 15
+    ds = (
+        gvl.Dataset.open(phased_svar_gvl, reference=reference)
+        .with_seqs("haplotypes")
+        .with_len(FIXED_LEN)
+    )
+    _gen("ds_haps_fixed_len", monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Spliced haplotypes
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_spliced_haps(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_spliced_haps: haplotypes + splice (T1=[0,1], T2=[2,3])."""
+    ds = (
+        gvl.Dataset.open(phased_svar_gvl, reference=reference)
+        .with_seqs("haplotypes")
+        .with_tracks(False)
+    )
+    n = 4
+    sub_bed = ds._full_bed[:n].with_columns(
+        pl.Series("transcript_id", ["T1", "T1", "T2", "T2"])
+    )
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced
+    _gen("ds_spliced_haps", monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Annotated spliced haplotypes
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_annotated_spliced(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_annotated_spliced: annotated + spliced with mixed strands."""
+    ds = (
+        gvl.Dataset.open(phased_svar_gvl, reference=reference)
+        .with_seqs("annotated")
+        .with_tracks(False)
+    )
+    n = 4
+    sub_bed = ds._full_bed[:n].with_columns(
+        pl.Series("transcript_id", ["T1", "T1", "T2", "T2"]),
+        pl.Series("strand", ["+", "+", "-", "-"]),
+    )
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced
+    _gen("ds_annotated_spliced", monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Track-only datasets
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_tracks(tmp_path, monkeypatch):
+    """Generates ds_tracks: track-only dataset, signal track."""
+    ds_dir = build_track_dataset(tmp_path)
+    ds = gvl.Dataset.open(ds_dir).with_tracks("signal")
+    _gen("ds_tracks", monkeypatch, lambda: ds[slice(None), slice(None)])
+
+
+@skip_unless_gen
+def test_gen_tracks_jitter(tmp_path, monkeypatch):
+    """Generates ds_tracks_jitter: jittered track dataset (max_jitter=4)."""
+    MAX_JITTER = 4
+    ds_dir = build_track_dataset_jittered(tmp_path, max_jitter=MAX_JITTER)
+    ds = gvl.Dataset.open(ds_dir).with_tracks("signal")
+    _gen("ds_tracks_jitter", monkeypatch, lambda: ds[slice(None), slice(None)])
+
+
+# ---------------------------------------------------------------------------
+# Haps+tracks (5 fill strategies) — shared by test_dataset_parity and test_fused_tracks_parity
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+@pytest.mark.parametrize(
+    "strategy_name",
+    [
+        "Repeat5p",
+        "Repeat5pNormalized",
+        "Constant",
+        "FlankSample",
+        "Interpolate",
+    ],
+)
+def test_gen_haps_tracks(strategy_name, tmp_path, synthetic_case, monkeypatch):
+    """Generates ds_haps_tracks_{strategy}: haps+tracks with each fill strategy."""
+    from genvarloader._dataset._insertion_fill import (
+        Constant,
+        FlankSample,
+        Interpolate,
+        Repeat5p,
+        Repeat5pNormalized,
+    )
+
+    strat_map = {
+        "Repeat5p": Repeat5p(),
+        "Repeat5pNormalized": Repeat5pNormalized(),
+        "Constant": Constant(0.0),
+        "FlankSample": FlankSample(flank_width=5),
+        "Interpolate": Interpolate(order=1),
+    }
+    fill = strat_map[strategy_name]
+    ds_dir = build_haps_tracks_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref)
+        .with_seqs("haplotypes")
+        .with_tracks("signal")
+        .with_insertion_fill(fill)
+    )
+    golden_name = f"ds_haps_tracks_{strategy_name}"
+    _gen(golden_name, monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Reference mode
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_reference_mode(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_reference_mode: reference mode on phased_svar_gvl."""
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference).with_seqs("reference")
+    _gen("ds_reference_mode", monkeypatch, lambda: ds[:, :])
+
+
+@skip_unless_gen
+def test_gen_reference_fetch(reference, monkeypatch):
+    """Generates ds_reference_fetch: Reference.fetch(contigs[:1], [0], [50])."""
+    contigs = reference.contigs[:1]
+    starts = np.array([0], dtype=np.int64)
+    ends = np.array([50], dtype=np.int64)
+    _gen(
+        "ds_reference_fetch",
+        monkeypatch,
+        lambda: reference.fetch(contigs, starts, ends),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Variants mode
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_variants(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_variants: variants mode (RaggedVariants)."""
+    ds = (
+        gvl.Dataset.open(phased_svar_gvl, reference=reference)
+        .with_tracks(False)
+        .with_seqs("variants")
+    )
+    _gen("ds_variants", monkeypatch, lambda: ds[:, :])
+
+
+@skip_unless_gen
+def test_gen_variants_af(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_variants_af: variants with AF filter (skips if AF unavailable)."""
+    ds_base = gvl.Dataset.open(phased_svar_gvl, reference=reference).with_tracks(False)
+    try:
+        ds = ds_base.with_seqs("variants").with_settings(min_af=0.1, max_af=0.9)
+    except Exception as e:
+        pytest.skip(f"AF filtering unavailable: {e}")
+    try:
+        monkeypatch.setenv("GVL_BACKEND", "numba")
+        out_numba = ds[:, :]
+    except KeyError as e:
+        pytest.skip(f"AF key missing: {e}")
+    monkeypatch.setenv("GVL_BACKEND", "rust")
+    out_rust = ds[:, :]
+    _oracle_check(out_numba, out_rust, "ds_variants_af")
+    _golden.save_flat_golden("ds_variants_af", out_rust)
+
+
+@skip_unless_gen
+def test_gen_variant_windows(phased_svar_gvl, reference, monkeypatch):
+    """Generates ds_variant_windows: variant-windows mode (_FlatVariantWindows)."""
+    ds = (
+        gvl.Dataset.open(phased_svar_gvl, reference=reference)
+        .with_tracks(False)
+        .with_output_format("flat")
+        .with_seqs(
+            "variant-windows",
+            VarWindowOpt(flank_length=4, token_alphabet=b"ACGT", unknown_token=4),
+        )
+    )
+    _gen("ds_variant_windows", monkeypatch, lambda: ds[[0, 1], [0, 1]])
+
+
+# ---------------------------------------------------------------------------
+# Neg-strand parity (6 kinds, unspliced)
+# ---------------------------------------------------------------------------
+
+_NEG_STRAND_KINDS = [
+    "reference",
+    "haplotypes",
+    "annotated",
+    "tracks",
+    "tracks-seqs",
+    "haps-tracks",
+]
+
+
+@skip_unless_gen
+@pytest.mark.parametrize("kind", _NEG_STRAND_KINDS)
+def test_gen_neg_strand(kind, tmp_path, synthetic_case, monkeypatch):
+    """Generates ds_neg_strand_{kind}: mixed +/- strand regions."""
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+
+    if kind == "tracks":
+        ds = gvl.Dataset.open(ds_dir).with_seqs(None).with_tracks("signal")
+    elif kind == "tracks-seqs":
+        ds = (
+            gvl.Dataset.open(ds_dir, reference=ref)
+            .with_seqs("reference")
+            .with_tracks("signal")
+        )
+    elif kind == "haps-tracks":
+        ds = (
+            gvl.Dataset.open(ds_dir, reference=ref)
+            .with_seqs("haplotypes")
+            .with_tracks("signal")
+        )
+    else:
+        ds = gvl.Dataset.open(ds_dir, reference=ref).with_seqs(kind).with_tracks(False)
+
+    safe_kind = kind.replace("-", "_")
+    _gen(f"ds_neg_strand_{safe_kind}", monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Neg-strand SPLICED parity (4 kinds)
+# ---------------------------------------------------------------------------
+
+_SPLICE_TRANSCRIPT_IDS = ["T1", "T2", "T3", "T3", "T4"]
+_NEG_SPLICED_KINDS = ["reference", "haplotypes", "annotated", "tracks"]
+
+
+def _open_strand_spliced(ds_dir, ref, kind: str):
+    if kind == "tracks":
+        ds = gvl.Dataset.open(ds_dir).with_seqs(None).with_tracks("signal")
+    else:
+        ds = gvl.Dataset.open(ds_dir, reference=ref).with_seqs(kind).with_tracks(False)
+    sub_bed = ds._full_bed.with_columns(
+        pl.Series("transcript_id", _SPLICE_TRANSCRIPT_IDS)
+    )
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+    assert ds.is_spliced
+    return ds
+
+
+@skip_unless_gen
+@pytest.mark.parametrize("kind", _NEG_SPLICED_KINDS)
+def test_gen_neg_strand_spliced(kind, tmp_path, synthetic_case, monkeypatch):
+    """Generates ds_neg_strand_spliced_{kind}: spliced mixed +/- strand."""
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = _open_strand_spliced(ds_dir, ref, kind)
+    _gen(f"ds_neg_strand_spliced_{kind}", monkeypatch, lambda: ds[:, :])
+
+
+# ---------------------------------------------------------------------------
+# Neg-strand variants
+# ---------------------------------------------------------------------------
+
+
+@skip_unless_gen
+def test_gen_neg_strand_variants(tmp_path, synthetic_case, monkeypatch):
+    """Generates ds_neg_strand_variants: variants on mixed-strand dataset."""
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref).with_tracks(False).with_seqs("variants")
+    )
+    _gen("ds_neg_strand_variants", monkeypatch, lambda: ds[:, :])
+
+
+@skip_unless_gen
+def test_gen_neg_strand_variants_dummy(tmp_path, synthetic_case, monkeypatch):
+    """Generates ds_neg_strand_variants_dummy: variants with custom DummyVariant."""
+    from genvarloader._dataset._flat_variants import DummyVariant
+
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref)
+        .with_tracks(False)
+        .with_seqs("variants")
+        .with_settings(dummy_variant=DummyVariant(alt=b"AC", ref=b"AC"))
+    )
+    _gen("ds_neg_strand_variants_dummy", monkeypatch, lambda: ds[:, :])
diff --git a/tests/parity/test_get_diffs_sparse_parity.py b/tests/parity/test_get_diffs_sparse_parity.py
new file mode 100644
index 00000000..279ea24c
--- /dev/null
+++ b/tests/parity/test_get_diffs_sparse_parity.py
@@ -0,0 +1,15 @@
+"""get_diffs_sparse: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_get_diffs_sparse_golden():
+    cases = _golden.load_golden("get_diffs_sparse")
+    assert cases, "empty golden"
+    _golden.replay_tuple("get_diffs_sparse", cases)
diff --git a/tests/parity/test_get_reference_parity.py b/tests/parity/test_get_reference_parity.py
new file mode 100644
index 00000000..c2e0ff93
--- /dev/null
+++ b/tests/parity/test_get_reference_parity.py
@@ -0,0 +1,15 @@
+"""get_reference: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_get_reference_golden():
+    cases = _golden.load_golden("get_reference")
+    assert cases, "empty golden"
+    _golden.replay_return("get_reference", cases)
diff --git a/tests/parity/test_golden_infra.py b/tests/parity/test_golden_infra.py
new file mode 100644
index 00000000..d162ecd3
--- /dev/null
+++ b/tests/parity/test_golden_infra.py
@@ -0,0 +1,38 @@
+# tests/parity/test_golden_infra.py
+"""Self-tests for the golden snapshot/replay infrastructure."""
+
+from __future__ import annotations
+
+import numpy as np
+from hypothesis import strategies as st
+
+from tests.parity import _golden
+
+
+def test_collect_examples_deterministic():
+    s = st.integers(0, 1_000_000)
+    a = _golden.collect_examples(s, 20)
+    b = _golden.collect_examples(s, 20)
+    assert a == b
+    assert len(a) == 20
+
+
+def test_save_load_roundtrip_mixed(tmp_path, monkeypatch):
+    monkeypatch.setattr(_golden, "GOLDEN_DIR", tmp_path)
+    cases = [
+        ((np.arange(3, dtype=np.int32), None, 5), np.arange(3, dtype=np.int32) * 2),
+        ((np.zeros(0, np.uint8),), np.zeros(0, np.uint8)),
+    ]
+    _golden.save_golden("demo", cases)
+    back = _golden.load_golden("demo")
+    assert len(back) == 2
+    np.testing.assert_array_equal(back[0][0][0], cases[0][0][0])
+    assert back[0][0][1] is None
+    assert back[0][0][2] == 5
+
+
+def test_rust_kernels_table_callable():
+    # Every registered name resolves to a real callable imported directly.
+    assert _golden.RUST_KERNELS, "RUST_KERNELS is empty"
+    for name, fn in _golden.RUST_KERNELS.items():
+        assert callable(fn), f"{name} -> {fn!r} not callable"
diff --git a/tests/parity/test_haplotypes_dataset_parity.py b/tests/parity/test_haplotypes_dataset_parity.py
new file mode 100644
index 00000000..aef48e90
--- /dev/null
+++ b/tests/parity/test_haplotypes_dataset_parity.py
@@ -0,0 +1,148 @@
+"""Haplotypes-mode dataset-level parity backstop.
+
+Proves that the Rust reconstruct_haplotypes_fused / reconstruct_annotated_haplotypes_fused
+kernels produce byte-identical output to the frozen goldens generated from the numba-verified
+rust output.
+
+Kernels exercised end-to-end:
+  - reconstruct_haplotypes_fused         (haplotypes mode, non-splice, Task 13)
+  - reconstruct_annotated_haplotypes_fused (annotated mode, non-splice, Task 4)
+
+Two output modes are covered:
+  - "haplotypes"  → Ragged[np.bytes_]
+  - "annotated"   → RaggedAnnotatedHaps (.haps, .var_idxs, .ref_coords)
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._genotypes  # noqa: F401 — triggers register("reconstruct_haplotypes_from_sparse")
+import genvarloader._dataset._haps as _haps_mod
+from genvarloader._ragged import RaggedAnnotatedHaps
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+# ---------------------------------------------------------------------------
+# Main backstop — "haplotypes" mode
+# ---------------------------------------------------------------------------
+
+
+def test_haplotypes_mode_dataset_parity(phased_svar_gvl, reference, monkeypatch):
+    """Rust reconstruct_haplotypes_fused output matches the frozen golden.
+
+    Spy guard proves the fused entry is actually invoked (non-vacuous).
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("haplotypes")
+
+    # --- install spy on the fused Rust reconstruct_haplotypes_fused entry ---
+    orig_fused = _haps_mod.reconstruct_haplotypes_fused
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_fused", _spy_fused)
+
+    # --- read (default rust backend, spy active) ---
+    out = ds[:, :]
+
+    # --- anti-vacuous guard ---
+    assert calls["n"] > 0, (
+        f"Rust reconstruct_haplotypes_fused was NEVER invoked during the "
+        f"read (calls={calls['n']}) — the backstop is vacuous. "
+        "Inspect the haplotypes read path to confirm "
+        "reconstruct_haplotypes_fused is called on the non-splice rust path "
+        "in _haps._reconstruct_haplotypes."
+    )
+
+    # --- sanity: output must be non-trivial ---
+    out_data = np.asarray(out.data)
+    n_bases = out_data.size
+    assert n_bases > 0, (
+        "Haplotypes output contains zero bytes — regions don't overlap any "
+        "reference sequence.  The parity comparison is vacuous."
+    )
+    n_pad = np.uint8(ord("N"))
+    data_u8 = out_data.view(np.uint8)
+    assert np.any(data_u8 != n_pad), (
+        "Haplotypes output is entirely 'N' padding — regions may fall outside "
+        "the reference contigs.  Non-padding bases are required to prove the "
+        "comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_haplotypes_mode")
+    )
+
+
+# ---------------------------------------------------------------------------
+# Annotated backstop — "annotated" mode
+# ---------------------------------------------------------------------------
+
+
+def test_annotated_haplotypes_mode_dataset_parity(
+    phased_svar_gvl, reference, monkeypatch
+):
+    """Rust reconstruct_annotated_haplotypes_fused output matches the frozen golden.
+
+    Covers the annotated path (with_seqs("annotated")).  All three arrays —
+    haps, var_idxs, and ref_coords — are compared byte-identically against the golden.
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("annotated")
+
+    # --- install spy on the fused Rust reconstruct_annotated_haplotypes_fused entry ---
+    orig_fused = _haps_mod.reconstruct_annotated_haplotypes_fused
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_annotated_haplotypes_fused", _spy_fused)
+
+    # --- read (default rust backend, spy active) ---
+    out = ds[:, :]
+
+    # --- anti-vacuous guard ---
+    assert calls["n"] > 0, (
+        f"Rust reconstruct_annotated_haplotypes_fused was NEVER invoked during the "
+        f"read (calls={calls['n']}) — the annotated backstop is vacuous. "
+        "Inspect the annotated read path to confirm "
+        "reconstruct_annotated_haplotypes_fused is called on the non-splice rust path "
+        "in _haps._reconstruct_annotated_haplotypes."
+    )
+
+    # --- type sanity ---
+    assert isinstance(out, RaggedAnnotatedHaps), (
+        f"Expected RaggedAnnotatedHaps from annotated mode, got {type(out)}"
+    )
+
+    # --- sanity: output must be non-trivial ---
+    haps_data = np.asarray(out.haps.data)
+    n_bases = haps_data.size
+    assert n_bases > 0, (
+        "Annotated haplotypes output contains zero bytes — regions don't overlap "
+        "any reference sequence.  The parity comparison is vacuous."
+    )
+    data_u8 = haps_data.view(np.uint8)
+    n_pad = np.uint8(ord("N"))
+    assert np.any(data_u8 != n_pad), (
+        "Annotated haplotypes output is entirely 'N' padding — regions may fall "
+        "outside the reference contigs.  Non-padding bases are required to prove "
+        "the comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_annotated_mode")
+    )
diff --git a/tests/parity/test_import_no_numba.py b/tests/parity/test_import_no_numba.py
new file mode 100644
index 00000000..bdaef2f4
--- /dev/null
+++ b/tests/parity/test_import_no_numba.py
@@ -0,0 +1,24 @@
+"""genvarloader's OWN modules must not import numba (Phase 5 W5).
+
+NOTE: `import genvarloader` may still pull numba transitively via seqpro
+(seqpro 0.20.0 eagerly imports numba). That is outside genvarloader's control;
+this guard asserts genvarloader's own source is numba-free. See the seqpro
+follow-up issue for the transitive import and the W6 RSS impact.
+"""
+
+from __future__ import annotations
+
+import pathlib
+
+import genvarloader
+
+
+def test_genvarloader_own_code_imports_no_numba():
+    pkg_dir = pathlib.Path(genvarloader.__file__).parent
+    offenders: list[str] = []
+    for py in pkg_dir.rglob("*.py"):
+        for ln, line in enumerate(py.read_text().splitlines(), 1):
+            s = line.strip()
+            if s.startswith("import numba") or s.startswith("from numba"):
+                offenders.append(f"{py.relative_to(pkg_dir)}:{ln}: {s}")
+    assert not offenders, "genvarloader modules import numba:\n" + "\n".join(offenders)
diff --git a/tests/parity/test_intervals_to_tracks_parity.py b/tests/parity/test_intervals_to_tracks_parity.py
index 5507e8c7..64c97734 100644
--- a/tests/parity/test_intervals_to_tracks_parity.py
+++ b/tests/parity/test_intervals_to_tracks_parity.py
@@ -1,22 +1,23 @@
+"""intervals_to_tracks: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
 import numpy as np
 import pytest
-from hypothesis import given
 
-from genvarloader._dataset import _intervals  # noqa: F401  (import triggers register())
-from tests.parity._harness import assert_inplace_kernel_parity
-from tests.parity.strategies import intervals_to_tracks_inputs
+from tests.parity import _golden
 
 pytestmark = pytest.mark.parity
 
 
-@given(intervals_to_tracks_inputs())
-def test_intervals_to_tracks_parity(inputs):
-    out_offsets = inputs[6]
-    total = int(out_offsets[-1])
-    # NaN sentinel: any position the kernel fails to zero/paint stays NaN and is caught.
-    assert_inplace_kernel_parity(
+def test_intervals_to_tracks_golden():
+    cases = _golden.load_golden("intervals_to_tracks")
+    assert cases, "empty golden"
+    _golden.replay_inplace(
         "intervals_to_tracks",
-        inputs,
-        out_factory=lambda: np.full(total, np.nan, np.float32),
+        cases,
+        out_factory=lambda inputs: np.zeros(
+            int(np.asarray(inputs[-1])[-1]), np.float32
+        ),
         out_index=6,
     )
diff --git a/tests/parity/test_prng_parity.py b/tests/parity/test_prng_parity.py
new file mode 100644
index 00000000..7320083e
--- /dev/null
+++ b/tests/parity/test_prng_parity.py
@@ -0,0 +1,71 @@
+"""Direct rust parity test for xorshift64 and hash4 PRNG primitives.
+
+Known-vector tests run directly against the Rust debug exports.  The
+hypothesis-driven numba-comparison tests have been replaced with frozen-golden
+replay (goldens generated in generate_goldens.py, cross-checked against numba at
+generation time).
+
+The Rust functions are exposed as DEBUG exports (`_debug_xorshift64`,
+`_debug_hash4`) in the genvarloader extension module.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from genvarloader.genvarloader import _debug_hash4 as _hash4_rust
+from genvarloader.genvarloader import _debug_xorshift64 as _xorshift64_rust
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+UINT64_MAX = 2**64 - 1
+
+
+# ── frozen-golden replay ───────────────────────────────────────────────────────
+
+
+def test_xorshift64_golden():
+    """Rust xorshift64 must equal the frozen golden (cross-checked vs numba at freeze time)."""
+    cases = _golden.load_golden("prng_xorshift64")
+    assert cases, "empty golden"
+    for ci, (inputs, golden) in enumerate(cases):
+        (x,) = inputs
+        got = np.uint64(_xorshift64_rust(int(x)))
+        exp = np.uint64(golden)
+        assert got == exp, (
+            f"xorshift64 case {ci}: input={x:#x} got={got:#x} exp={exp:#x}"
+        )
+
+
+def test_hash4_golden():
+    """Rust hash4 must equal the frozen golden (cross-checked vs numba at freeze time)."""
+    cases = _golden.load_golden("prng_hash4")
+    assert cases, "empty golden"
+    for ci, (inputs, golden) in enumerate(cases):
+        a, b, c, d = inputs
+        got = np.uint64(_hash4_rust(int(a), int(b), int(c), int(d)))
+        exp = np.uint64(golden)
+        assert got == exp, (
+            f"hash4 case {ci}: ({a:#x},{b:#x},{c:#x},{d:#x}) got={got:#x} exp={exp:#x}"
+        )
+
+
+# ── smoke: fixed known vectors ─────────────────────────────────────────────────
+
+
+def test_xorshift64_known_vectors() -> None:
+    """Smoke-test a few hand-verified xorshift64 outputs."""
+    assert _xorshift64_rust(1) == 1_082_269_761
+    assert _xorshift64_rust(2) == 2_164_539_522
+    assert _xorshift64_rust(42) == 45_454_805_674
+    assert _xorshift64_rust(0xDEADBEEF) == 4_018_790_486_776_397_394
+    assert _xorshift64_rust(UINT64_MAX) == 1_065_361_344
+
+
+def test_hash4_known_vectors() -> None:
+    """Smoke-test a few hand-verified hash4 outputs."""
+    assert _hash4_rust(1, 2, 3, 4) == 11_323_120_931_611_735_037
+    assert _hash4_rust(0, 0, 0, 0) == 0
+    assert _hash4_rust(0xDEADBEEF, 0xCAFE, 0xBABE, 1) == 5_244_362_157_944_750_963
diff --git a/tests/parity/test_rayon_equivalence.py b/tests/parity/test_rayon_equivalence.py
new file mode 100644
index 00000000..a8109801
--- /dev/null
+++ b/tests/parity/test_rayon_equivalence.py
@@ -0,0 +1,186 @@
+"""Serial vs parallel rust output must be byte-identical (and == golden).
+
+Tests that reconstruct_haplotypes_from_sparse, shift_and_realign_tracks_sparse,
+tracks_to_intervals, get_diffs_sparse, and intervals_to_tracks each produce
+identical output regardless of whether parallel=False (serial rayon-free path)
+or parallel=True (rayon par_iter path).
+Both must also match the frozen golden captured from the Rust implementation.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+# RUST_KERNELS stores shims that wrap bare FFI functions with a `parallel=False`
+# default (so existing golden replays stay serial); they forward *args and
+# `parallel` straight through to the FFI. The FFI accepts `parallel` as a
+# keyword argument (PyO3 registers all pyfunction args as keyword-capable), so
+# passing parallel=True/False here exercises both branches.
+_fn = _golden.RUST_KERNELS["reconstruct_haplotypes_from_sparse"]
+_fn_sart = _golden.RUST_KERNELS["shift_and_realign_tracks_sparse"]
+_fn_tti = _golden.RUST_KERNELS["tracks_to_intervals"]
+_fn_gds = _golden.RUST_KERNELS["get_diffs_sparse"]
+_fn_itt = _golden.RUST_KERNELS["intervals_to_tracks"]
+
+
+def test_reconstruct_haplotypes_serial_eq_parallel():
+    """For every frozen golden case: serial == parallel == golden (byte-identical)."""
+    cases = _golden.load_golden("reconstruct_haplotypes_from_sparse")
+    assert cases, "empty golden — run generate_goldens.py first"
+
+    for ci, (inputs, golden) in enumerate(cases):
+        golden_arr = np.asarray(golden)
+        outs: dict[bool, np.ndarray] = {}
+        for parallel in (False, True):
+            out = np.zeros(golden_arr.shape, golden_arr.dtype)
+            # inputs tuple: (out_offsets, regions, shifts, geno_offset_idx,
+            #                geno_offsets_2d, geno_v_idxs, v_starts, ilens,
+            #                alt_alleles, alt_offsets, reference, ref_offsets,
+            #                pad_char, keep, keep_offsets, None, None)
+            # The FFI takes `out` as the first positional arg; inputs do NOT include out.
+            args = list(inputs)
+            args.insert(0, out)
+            _fn(*args, parallel=parallel)
+            outs[parallel] = out
+
+        np.testing.assert_array_equal(
+            outs[False],
+            outs[True],
+            err_msg=f"case {ci}: serial != parallel",
+        )
+        np.testing.assert_array_equal(
+            outs[True],
+            golden_arr,
+            err_msg=f"case {ci}: parallel != golden",
+        )
+
+
+def test_shift_and_realign_tracks_sparse_serial_eq_parallel():
+    """For every frozen golden case: serial == parallel == golden (byte-identical).
+
+    shift_and_realign_tracks_sparse is an INPLACE kernel: the golden stores
+    (inputs_tuple_without_out, golden_output_array). The out buffer is
+    inserted at index 0 before calling the wrapper.
+    """
+    cases = _golden.load_golden("shift_and_realign_tracks_sparse")
+    assert cases, "empty golden — run generate_goldens.py first"
+
+    for ci, (inputs, golden) in enumerate(cases):
+        golden_arr = np.asarray(golden)
+        outs: dict[bool, np.ndarray] = {}
+        for parallel in (False, True):
+            out = np.zeros(golden_arr.shape, golden_arr.dtype)
+            args = list(inputs)
+            args.insert(0, out)
+            _fn_sart(*args, parallel=parallel)
+            outs[parallel] = out
+
+        np.testing.assert_array_equal(
+            outs[False],
+            outs[True],
+            err_msg=f"case {ci}: serial != parallel",
+        )
+        np.testing.assert_array_equal(
+            outs[True],
+            golden_arr,
+            err_msg=f"case {ci}: parallel != golden",
+        )
+
+
+def test_tracks_to_intervals_serial_eq_parallel():
+    """For every frozen golden case: serial == parallel == golden (byte-identical).
+
+    tracks_to_intervals is a TUPLE-return kernel: the golden stores
+    (inputs_tuple, (starts, ends, values, offsets)).
+    """
+    cases = _golden.load_golden("tracks_to_intervals")
+    assert cases, "empty golden — run generate_goldens.py first"
+
+    for ci, (inputs, golden) in enumerate(cases):
+        results: dict[bool, tuple] = {}
+        for parallel in (False, True):
+            got = _fn_tti(*inputs, parallel=parallel)
+            results[parallel] = got if isinstance(got, tuple) else (got,)
+
+        gold = golden if isinstance(golden, tuple) else (golden,)
+        for j, (serial_arr, parallel_arr) in enumerate(
+            zip(results[False], results[True])
+        ):
+            np.testing.assert_array_equal(
+                np.asarray(serial_arr),
+                np.asarray(parallel_arr),
+                err_msg=f"case {ci} element {j}: serial != parallel",
+            )
+        for j, (parallel_arr, golden_arr) in enumerate(zip(results[True], gold)):
+            np.testing.assert_array_equal(
+                np.asarray(parallel_arr),
+                np.asarray(golden_arr),
+                err_msg=f"case {ci} element {j}: parallel != golden",
+            )
+
+
+def test_get_diffs_sparse_serial_eq_parallel():
+    """For every frozen golden case: serial == parallel == golden (byte-identical).
+
+    get_diffs_sparse is a RETURN kernel: the golden stores (inputs_tuple,
+    result_array). The shim adds `parallel=False` default so replay_tuple
+    callers that don't pass parallel continue to work.
+    """
+    cases = _golden.load_golden("get_diffs_sparse")
+    assert cases, "empty golden — run generate_goldens.py first"
+
+    for ci, (inputs, golden) in enumerate(cases):
+        golden_arr = np.asarray(golden)
+        results: dict[bool, np.ndarray] = {}
+        for parallel in (False, True):
+            got = _fn_gds(*inputs, parallel=parallel)
+            results[parallel] = np.asarray(got)
+
+        np.testing.assert_array_equal(
+            results[False],
+            results[True],
+            err_msg=f"case {ci}: serial != parallel",
+        )
+        np.testing.assert_array_equal(
+            results[True],
+            golden_arr,
+            err_msg=f"case {ci}: parallel != golden",
+        )
+
+
+def test_intervals_to_tracks_serial_eq_parallel():
+    """For every frozen golden case: serial == parallel == golden (byte-identical).
+
+    intervals_to_tracks is an INPLACE kernel: the golden stores
+    (inputs_tuple_without_out, golden_output_array). The out buffer is
+    inserted at index 6 (before out_offsets, the 7th element) before calling.
+    """
+    cases = _golden.load_golden("intervals_to_tracks")
+    assert cases, "empty golden — run generate_goldens.py first"
+
+    for ci, (inputs, golden) in enumerate(cases):
+        golden_arr = np.asarray(golden)
+        outs: dict[bool, np.ndarray] = {}
+        for parallel in (False, True):
+            # inputs[6] = out_offsets; total length = int(inputs[6][-1])
+            out = np.full(int(inputs[6][-1]), np.nan, np.float32)
+            args = list(inputs)
+            args.insert(6, out)
+            _fn_itt(*args, parallel=parallel)
+            outs[parallel] = out
+
+        np.testing.assert_array_equal(
+            outs[False],
+            outs[True],
+            err_msg=f"case {ci}: serial != parallel",
+        )
+        np.testing.assert_array_equal(
+            outs[True],
+            golden_arr,
+            err_msg=f"case {ci}: parallel != golden",
+        )
diff --git a/tests/parity/test_rc_alleles_parity.py b/tests/parity/test_rc_alleles_parity.py
new file mode 100644
index 00000000..726040b7
--- /dev/null
+++ b/tests/parity/test_rc_alleles_parity.py
@@ -0,0 +1,48 @@
+"""rc_alleles: rust vs frozen golden (oracle frozen Phase 5 W5).
+
+The hypothesis-driven numba-comparison test has been replaced with frozen-golden
+replay.  The dispatch-call-count smoke test is preserved using make_kernel_spy
+(which keeps _dispatch usage inside _golden.py, not here).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_flat_alleles_reverse_masked_uses_rc_alleles():
+    """_FlatAlleles.reverse_masked must call the dispatched rc_alleles kernel."""
+    from genvarloader._dataset._flat_variants import _FlatAlleles
+
+    spy, calls, restore = _golden.make_kernel_spy("rc_alleles")
+    try:
+        # one row (b=1, ploidy=1), two alleles "AC","G".
+        byte_data = np.frombuffer(b"ACG", np.uint8).copy()
+        seq_offsets = np.array([0, 2, 3], np.int64)
+        var_offsets = np.array([0, 2], np.int64)
+        fa = _FlatAlleles(byte_data, seq_offsets, var_offsets, (1, 1, None))
+        fa.reverse_masked(np.array([True], np.bool_))
+        assert calls["n"] == 1
+        # "AC"->"GT", "G"->"C"
+        assert fa.byte_data.tobytes() == b"GTC"
+    finally:
+        restore()
+
+
+def test_rc_alleles_golden():
+    """Rust rc_alleles must equal the frozen golden (cross-checked vs numba at freeze time)."""
+    cases = _golden.load_golden("rc_alleles")
+    assert cases, "empty golden"
+    rust_fn = _golden.RUST_KERNELS["rc_alleles"]
+    for ci, (inputs, golden) in enumerate(cases):
+        init_data, seq_offsets, var_offsets, mask = inputs
+        buf = np.ascontiguousarray(init_data, np.uint8)
+        rust_fn(buf, seq_offsets, var_offsets, mask)
+        np.testing.assert_array_equal(
+            buf, golden, err_msg=f"rc_alleles case {ci} mismatch"
+        )
diff --git a/tests/parity/test_reconstruct_haplotypes_parity.py b/tests/parity/test_reconstruct_haplotypes_parity.py
new file mode 100644
index 00000000..251e6906
--- /dev/null
+++ b/tests/parity/test_reconstruct_haplotypes_parity.py
@@ -0,0 +1,21 @@
+"""reconstruct_haplotypes_from_sparse: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_reconstruct_haplotypes_from_sparse_golden():
+    cases = _golden.load_golden("reconstruct_haplotypes_from_sparse")
+    assert cases, "empty golden"
+    _golden.replay_inplace(
+        "reconstruct_haplotypes_from_sparse",
+        cases,
+        out_factory=lambda inputs: np.zeros(int(np.asarray(inputs[0])[-1]), np.uint8),
+        out_index=0,
+    )
diff --git a/tests/parity/test_reference_dataset_parity.py b/tests/parity/test_reference_dataset_parity.py
new file mode 100644
index 00000000..fada29a4
--- /dev/null
+++ b/tests/parity/test_reference_dataset_parity.py
@@ -0,0 +1,68 @@
+"""Reference-mode dataset-level parity backstop.
+
+Proves that the Rust get_reference kernel produces byte-identical output
+matching the frozen golden (generated from the rust implementation,
+oracle-verified against the composed numba pipeline at gen time).
+
+Kernel exercised end-to-end:
+  - get_reference  (reference fetch, via make_kernel_spy)
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_reference_mode_dataset_parity(phased_svar_gvl, reference):
+    """Rust get_reference output matches the frozen golden.
+
+    The spy asserts that the Rust get_reference kernel is actually invoked
+    (non-vacuous guard).  The ragged output is compared byte-identically
+    against the golden, and a non-triviality check ensures the comparison is
+    meaningful (output is not all-padding).
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("reference")
+
+    # --- install counting spy via make_kernel_spy ---
+    spy_fn, calls, restore = _golden.make_kernel_spy("get_reference")
+    try:
+        # --- read (default rust backend, spy active) ---
+        out = ds[:, :]
+    finally:
+        restore()
+
+    # --- anti-vacuous guard ---
+    assert calls["n"] > 0, (
+        f"Rust get_reference was NEVER invoked during the read "
+        f"(calls={calls['n']}) — the backstop is vacuous. "
+        "Inspect the reference read path to confirm _get_reference_rust is still "
+        "called on the Dataset.__getitem__ → _getitem_unspliced code path."
+    )
+
+    # --- sanity: output must be non-trivial ---
+    out_arr = np.asarray(out.data)
+    n_bases = out_arr.size
+    assert n_bases > 0, (
+        "Reference output contains zero bytes — regions don't overlap any "
+        "reference sequence.  The parity comparison is vacuous."
+    )
+    n_pad = np.uint8(ord("N"))
+    data_u8 = out_arr.view(np.uint8)
+    assert np.any(data_u8 != n_pad), (
+        "Reference output is entirely 'N' padding — regions may fall outside "
+        "the reference contigs.  Non-padding bases are required to prove the "
+        "comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_reference_mode")
+    )
diff --git a/tests/parity/test_reference_fetch_parity.py b/tests/parity/test_reference_fetch_parity.py
new file mode 100644
index 00000000..255753e9
--- /dev/null
+++ b/tests/parity/test_reference_fetch_parity.py
@@ -0,0 +1,38 @@
+"""Parity backstop for Reference.fetch (rerouted through dispatched get_reference).
+
+fetch builds regions=(contig_idx, start, end) and out_offsets, then calls the
+same get_reference core used by the main reference read path. This test asserts
+that the rust get_reference kernel is actually invoked (spy guard) and that the
+output matches the frozen golden.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader._dataset._reference  # noqa: F401 — triggers register("get_reference")
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_reference_fetch_parity(reference):
+    ref = reference
+    contigs = ref.contigs[:1]
+    starts = np.array([0], dtype=np.int64)
+    ends = np.array([50], dtype=np.int64)
+
+    spy_fn, calls, restore = _golden.make_kernel_spy("get_reference")
+    try:
+        out = ref.fetch(contigs, starts, ends)
+    finally:
+        restore()
+
+    assert calls["n"] > 0, "rust get_reference never invoked via fetch — vacuous"
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_reference_fetch")
+    )
diff --git a/tests/parity/test_shift_and_realign_tracks_parity.py b/tests/parity/test_shift_and_realign_tracks_parity.py
new file mode 100644
index 00000000..1efdf587
--- /dev/null
+++ b/tests/parity/test_shift_and_realign_tracks_parity.py
@@ -0,0 +1,21 @@
+"""shift_and_realign_tracks_sparse: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_shift_and_realign_tracks_sparse_golden():
+    cases = _golden.load_golden("shift_and_realign_tracks_sparse")
+    assert cases, "empty golden"
+    _golden.replay_inplace(
+        "shift_and_realign_tracks_sparse",
+        cases,
+        out_factory=lambda inputs: np.zeros(int(np.asarray(inputs[0])[-1]), np.float32),
+        out_index=0,
+    )
diff --git a/tests/parity/test_spliced_haplotypes_parity.py b/tests/parity/test_spliced_haplotypes_parity.py
new file mode 100644
index 00000000..010fcbb6
--- /dev/null
+++ b/tests/parity/test_spliced_haplotypes_parity.py
@@ -0,0 +1,97 @@
+"""Spliced-haplotypes dataset parity backstop (fused rust splice entry).
+
+Proves that the fused Rust entry ``reconstruct_haplotypes_spliced_fused`` (Task 5)
+produces byte-identical haplotype output to the frozen golden (generated from
+the rust implementation, oracle-verified against the composed numba pipeline).
+
+The test asserts:
+  1. The fused entry is actually invoked on the Rust path (non-vacuity spy guard).
+  2. The Rust output is byte-identical to the frozen golden.
+  3. The output is non-trivial (contains non-N bases).
+
+Dataset construction:
+  - Opens the existing phased_svar_gvl fixture in haplotypes mode.
+  - Adds a synthetic transcript_id column grouping regions 0+1 → T1, 2+3 → T2.
+  - Activates splice mode via with_settings(splice_info="transcript_id").
+"""
+
+from __future__ import annotations
+
+from dataclasses import replace
+
+import numpy as np
+import polars as pl
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._haps as _haps_mod
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+# ---------------------------------------------------------------------------
+# Main parity gate — fused Rust splice path vs. frozen golden
+# ---------------------------------------------------------------------------
+
+
+def test_spliced_haplotypes_parity(phased_svar_gvl, reference, monkeypatch):
+    """Fused reconstruct_haplotypes_spliced_fused output matches the frozen golden.
+
+    Spy guard: we monkeypatch ``_haps_mod.reconstruct_haplotypes_spliced_fused``
+    to count calls.  The spy must fire at least once (anti-vacuous guard).
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_seqs("haplotypes").with_tracks(False)
+
+    n = 4
+    sub_bed = ds._full_bed[:n].with_columns(
+        pl.Series("transcript_id", ["T1", "T1", "T2", "T2"])
+    )
+    ds = replace(ds, _full_bed=sub_bed).with_settings(splice_info="transcript_id")
+
+    assert ds.is_spliced, "Dataset should be in spliced mode"
+
+    orig_fused = getattr(_haps_mod, "reconstruct_haplotypes_spliced_fused", None)
+    assert orig_fused is not None, (
+        "reconstruct_haplotypes_spliced_fused not found on _haps_mod — "
+        "ensure it is imported at module level in _haps.py"
+    )
+
+    calls: dict[str, int] = {"n": 0}
+
+    def _spy_fused(*a, **k):
+        calls["n"] += 1
+        return orig_fused(*a, **k)
+
+    monkeypatch.setattr(_haps_mod, "reconstruct_haplotypes_spliced_fused", _spy_fused)
+
+    # --- read (default rust backend, spy active) ---
+    out = ds[:, :]
+
+    # Anti-vacuous guard
+    assert calls["n"] > 0, (
+        f"reconstruct_haplotypes_spliced_fused was NEVER invoked during the read "
+        f"(calls={calls['n']}) — the backstop is vacuous. "
+        "Ensure _haps._reconstruct_haplotypes calls reconstruct_haplotypes_spliced_fused "
+        "on the splice path."
+    )
+
+    # --- sanity: non-trivial output ---
+    out_data = np.asarray(out.data)
+    assert out_data.size > 0, (
+        "Spliced haplotypes output contains zero bytes — regions don't overlap any "
+        "reference sequence.  The parity comparison is vacuous."
+    )
+    n_pad = np.uint8(ord("N"))
+    data_u8 = out_data.view(np.uint8)
+    assert np.any(data_u8 != n_pad), (
+        "Spliced haplotypes output is entirely 'N' padding — non-padding bases are "
+        "required to prove the comparison is meaningful."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_spliced_haps")
+    )
diff --git a/tests/parity/test_tracks_to_intervals_parity.py b/tests/parity/test_tracks_to_intervals_parity.py
new file mode 100644
index 00000000..010101ab
--- /dev/null
+++ b/tests/parity/test_tracks_to_intervals_parity.py
@@ -0,0 +1,15 @@
+"""tracks_to_intervals: rust vs frozen golden (oracle frozen Phase 5 W5)."""
+
+from __future__ import annotations
+
+import pytest
+
+from tests.parity import _golden
+
+pytestmark = pytest.mark.parity
+
+
+def test_tracks_to_intervals_golden():
+    cases = _golden.load_golden("tracks_to_intervals")
+    assert cases, "empty golden"
+    _golden.replay_tuple("tracks_to_intervals", cases)
diff --git a/tests/parity/test_variants_dataset_parity.py b/tests/parity/test_variants_dataset_parity.py
new file mode 100644
index 00000000..d63b46be
--- /dev/null
+++ b/tests/parity/test_variants_dataset_parity.py
@@ -0,0 +1,214 @@
+"""Variants-mode dataset-level parity backstop.
+
+Proves that the Rust backend produces byte-identical variants output matching
+the frozen golden (generated from the rust implementation, oracle-verified
+against the numba pipeline at gen time).
+
+Kernels exercised end-to-end:
+  - gather_rows_i32   (v_idxs gather — always on the variants path)
+  - gather_alleles    (alt/ref sequence gather)
+  - fill_empty_*      (empty group sentinel fill)
+  - compact_keep_*    (AF filtering, when min_af/max_af are active)
+  - rc_alleles        (reverse-complement of alleles on neg-strand regions)
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import genvarloader as gvl
+import genvarloader._dataset._flat_variants  # noqa: F401 — triggers register()
+from genvarloader._dataset._flat_variants import DummyVariant
+
+from tests.parity import _golden
+from ._fixtures import build_strand_mixed_dataset
+
+pytestmark = pytest.mark.parity
+
+
+# ---------------------------------------------------------------------------
+# Main backstop test
+# ---------------------------------------------------------------------------
+
+
+def test_variants_getitem_parity_and_kernels_invoked(phased_svar_gvl, reference):
+    """Rust variants output matches the frozen golden.
+
+    The spy asserts that the Rust gather_rows_i32 kernel is actually invoked
+    (non-vacuous guard).
+    """
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = ds.with_tracks(False)
+    ds = ds.with_seqs("variants")
+
+    spy_fn, calls, restore = _golden.make_kernel_spy("gather_rows_i32")
+    try:
+        out = ds[:, :]
+    finally:
+        restore()
+
+    # --- anti-vacuous guard ---
+    assert calls["n"] > 0, (
+        f"Rust gather_rows_i32 was NEVER invoked during the read "
+        f"(calls={calls['n']}) — the backstop is vacuous. "
+        "Inspect the variants read path to confirm gather_rows_i32 is still "
+        "called on the get_variants_flat → _gather_rows code path."
+    )
+
+    # --- sanity: output must be non-trivial ---
+    n_total_variants = int(out.start.data.size)
+    assert n_total_variants > 0, (
+        "RaggedVariants output contains zero variants — regions don't overlap any "
+        "variants in the dataset.  The parity comparison is vacuous."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(out, _golden.load_flat_golden("ds_variants"))
+
+
+# ---------------------------------------------------------------------------
+# AF-filtered backstop (compact_keep_i32 exercise)
+# ---------------------------------------------------------------------------
+
+
+def test_variants_af_filter_parity(phased_svar_gvl, reference):
+    """Same parity check with a mild AF filter to exercise compact_keep_i32.
+
+    If the dataset has no AF annotation or the golden was not generated,
+    skips with a clear message.
+    """
+    ds_base = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds_base = ds_base.with_tracks(False)
+
+    # Try to apply an AF filter.  with_settings raises if AF is unavailable.
+    try:
+        ds = ds_base.with_seqs("variants").with_settings(min_af=0.1, max_af=0.9)
+    except Exception as e:
+        pytest.skip(
+            f"AF filtering unavailable on this dataset — skipping compact_keep "
+            f"exercise ({type(e).__name__}: {e})"
+        )
+
+    # Load golden — may not exist if AF was unavailable at generation time.
+    try:
+        golden = _golden.load_flat_golden("ds_variants_af")
+    except FileNotFoundError:
+        pytest.skip("ds_variants_af golden not generated (AF unavailable at gen time)")
+
+    spy_fn, ck_calls, restore = _golden.make_kernel_spy("compact_keep_i32")
+    try:
+        out = ds[:, :]
+    finally:
+        restore()
+
+    # compact_keep may not fire if no variants fall within the AF window;
+    # only assert it if variants are present.
+    n_vars = int(out.start.data.size)
+    if n_vars > 0 and ck_calls["n"] == 0:
+        pytest.xfail(
+            "compact_keep_i32 was not invoked even though variants are present — "
+            "AF filter may not be active on this code path."
+        )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(out, golden)
+
+
+# ---------------------------------------------------------------------------
+# variant-windows cross-backend parity
+# ---------------------------------------------------------------------------
+
+
+def test_variant_windows_getitem_parity_across_backends(phased_svar_gvl, reference):
+    """variant-windows __getitem__ must match the frozen golden.
+
+    Proves the windows output is non-empty AND byte-identical to the golden
+    end-to-end.
+    """
+    from genvarloader import VarWindowOpt
+
+    ds = gvl.Dataset.open(phased_svar_gvl, reference=reference)
+    ds = (
+        ds.with_tracks(False)
+        .with_output_format("flat")
+        .with_seqs(
+            "variant-windows",
+            VarWindowOpt(flank_length=4, token_alphabet=b"ACGT", unknown_token=4),
+        )
+    )
+
+    out = ds[[0, 1], [0, 1]]
+
+    # Anti-vacuous: at least one window field must be present and non-empty.
+    present = [w for w in (out.ref_window, out.alt_window) if w is not None]
+    assert len(present) > 0, (
+        "No window fields present in the output — test is vacuous. "
+        "Check that VarWindowOpt.ref/alt defaults produce at least one window."
+    )
+    assert any(np.asarray(w.data).size > 0 for w in present), (
+        "All window data arrays are empty — no variants in the indexed batch. "
+        "The comparison is vacuous."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_variant_windows")
+    )
+
+
+# ---------------------------------------------------------------------------
+# Neg-strand variants parity + dummy-fill coverage (Task 6)
+# ---------------------------------------------------------------------------
+
+
+def test_neg_strand_variants_rc_parity_and_kernel_invoked(tmp_path, synthetic_case):
+    """variants-mode neg-strand RC output matches the frozen golden, and the
+    rust rc_alleles kernel actually fires on the live read (non-vacuous)."""
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref).with_tracks(False).with_seqs("variants")
+    )
+
+    # Non-vacuity: fixture must carry −strand regions (rc_neg defaults True).
+    assert np.any(ds._full_regions[:, 3] == -1), "fixture has no −strand regions"
+
+    spy_fn, calls, restore = _golden.make_kernel_spy("rc_alleles")
+    try:
+        out = ds[:, :]
+    finally:
+        restore()
+
+    assert calls["n"] > 0, (
+        "rust rc_alleles was never invoked on the neg-strand variants read — "
+        "the backstop is vacuous. Confirm a variant overlaps a −strand region; if "
+        "the synthetic variant set does not, extend build_strand_mixed_dataset with a "
+        "−strand region positioned over a known variant."
+    )
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_neg_strand_variants")
+    )
+
+
+def test_neg_strand_variants_custom_dummy_parity(tmp_path, synthetic_case):
+    """A custom non-palindromic dummy (alt/ref = b'AC') filled into empty groups on
+    a −strand read produces output matching the frozen golden."""
+    ds_dir = build_strand_mixed_dataset(tmp_path, synthetic_case.svar_path)
+    ref = gvl.Reference.from_path(synthetic_case.ref_path, in_memory=False)
+    ds = (
+        gvl.Dataset.open(ds_dir, reference=ref)
+        .with_tracks(False)
+        .with_seqs("variants")
+        .with_settings(dummy_variant=DummyVariant(alt=b"AC", ref=b"AC"))
+    )
+    assert np.any(ds._full_regions[:, 3] == -1), "fixture has no −strand regions"
+
+    out = ds[:, :]
+
+    # --- replay against frozen golden ---
+    _golden.assert_output_matches_golden(
+        out, _golden.load_flat_golden("ds_neg_strand_variants_dummy")
+    )
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/dataset/__init__.py b/tests/unit/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/dataset/genotypes/test_choose_exonic_variants.py b/tests/unit/dataset/genotypes/test_choose_exonic_variants.py
index fcffe8b7..0e58b03f 100644
--- a/tests/unit/dataset/genotypes/test_choose_exonic_variants.py
+++ b/tests/unit/dataset/genotypes/test_choose_exonic_variants.py
@@ -6,8 +6,7 @@
 ``geno_offsets[o_idx]`` (returning a length-2 row, not scalars) and
 then sliced ``geno_v_idxs[o_s:o_e]`` with those rows.
 
-Mirror the fix in the first loop + the sibling ``filter_af`` kernel
-which both branch on ``geno_offsets.ndim == 1``.
+Mirror the fix applied in the first loop, which branches on ``geno_offsets.ndim == 1``.
 """
 
 from __future__ import annotations
diff --git a/tests/unit/dataset/genotypes/test_filter_af.py b/tests/unit/dataset/genotypes/test_filter_af.py
deleted file mode 100644
index 3e778505..00000000
--- a/tests/unit/dataset/genotypes/test_filter_af.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import numpy as np
-from genvarloader._dataset._genotypes import filter_af
-
-
-def _basic_inputs():
-    geno_offset_idx = np.array([[0]], dtype=np.intp)
-    geno_offsets = np.array([0, 4], dtype=np.int64)
-    geno_v_idxs = np.array([0, 1, 2, 3], dtype=np.int32)
-    afs = np.array([0.001, 0.05, 0.2, 0.5], dtype=np.float32)
-    return geno_offset_idx, geno_offsets, geno_v_idxs, afs
-
-
-def test_filter_af_no_op():
-    """min_af=None, max_af=None -> all kept, short-circuits."""
-    geno_offset_idx, geno_offsets, geno_v_idxs, afs = _basic_inputs()
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, None, None)
-    np.testing.assert_equal(keep, np.array([True, True, True, True]))
-
-
-def test_filter_af_min_only():
-    """min_af=0.05 keeps variants with af >= 0.05."""
-    geno_offset_idx, geno_offsets, geno_v_idxs, afs = _basic_inputs()
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, 0.05, None)
-    np.testing.assert_equal(keep, np.array([False, True, True, True]))
-
-
-def test_filter_af_max_only():
-    """max_af=0.2 keeps variants with af <= 0.2.
-
-    Note: afs are stored as float32. np.float32(0.2) > float64(0.2) due to
-    representation loss, so the variant at af=0.2 does NOT pass the <= 0.2
-    filter when max_af is a Python float.  The actual kept set is [0.001, 0.05].
-    """
-    geno_offset_idx, geno_offsets, geno_v_idxs, afs = _basic_inputs()
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, None, 0.2)
-    np.testing.assert_equal(keep, np.array([True, True, False, False]))
-
-
-def test_filter_af_both():
-    """Combined min/max bounds."""
-    geno_offset_idx, geno_offsets, geno_v_idxs, afs = _basic_inputs()
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, 0.01, 0.3)
-    np.testing.assert_equal(keep, np.array([False, True, True, False]))
-
-
-def test_filter_af_2d_offsets_layout():
-    """(2, n_slices) offsets layout — slice [start, end) per row."""
-    geno_offset_idx = np.array([[0]], dtype=np.intp)
-    # Single slice covering all 4 variants.
-    geno_offsets = np.array([[0], [4]], dtype=np.int64)  # (2, n_slices=1)
-    geno_v_idxs = np.array([0, 1, 2, 3], dtype=np.int32)
-    afs = np.array([0.001, 0.05, 0.2, 0.5], dtype=np.float32)
-    keep, keep_offsets = filter_af(
-        geno_offset_idx, geno_offsets, geno_v_idxs, afs, 0.05, None
-    )
-    np.testing.assert_equal(keep, np.array([False, True, True, True]))
-    # keep_offsets is cumulative offsets over n_slices: length n_slices+1 = 2.
-    assert keep_offsets.shape == (2,)
-
-
-def test_1d_and_2d_layouts_agree():
-    """1-D offsets [0, N] and 2-D offsets [[0], [N]] describe the same input
-    and must produce equivalent `keep` arrays."""
-    geno_offset_idx = np.array([[0]], dtype=np.intp)
-    geno_v_idxs = np.array([0, 1, 2, 3], dtype=np.int32)
-    afs = np.array([0.001, 0.05, 0.2, 0.5], dtype=np.float32)
-
-    keep_1d, _ = filter_af(
-        geno_offset_idx,
-        np.array([0, 4], dtype=np.int64),
-        geno_v_idxs,
-        afs,
-        0.05,
-        None,
-    )
-    keep_2d, _ = filter_af(
-        geno_offset_idx,
-        np.array([[0], [4]], dtype=np.int64),
-        geno_v_idxs,
-        afs,
-        0.05,
-        None,
-    )
-    np.testing.assert_equal(keep_1d, keep_2d)
-
-
-def test_filter_af_nan_behavior():
-    """NaN allele frequencies: assert observed behavior, document the contract.
-
-    `nan >= min_af` is False and `nan <= max_af` is False, so a NaN should be
-    REJECTED by either bound. Verify."""
-    geno_offset_idx = np.array([[0]], dtype=np.intp)
-    geno_offsets = np.array([0, 3], dtype=np.int64)
-    geno_v_idxs = np.array([0, 1, 2], dtype=np.int32)
-    afs = np.array([0.1, np.nan, 0.5], dtype=np.float32)
-
-    # min only — NaN must be rejected
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, 0.05, None)
-    np.testing.assert_equal(keep, np.array([True, False, True]))
-
-    # max only — NaN must be rejected
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, None, 0.5)
-    np.testing.assert_equal(keep, np.array([True, False, True]))
-
-    # both — NaN must be rejected
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, 0.05, 0.5)
-    np.testing.assert_equal(keep, np.array([True, False, True]))
-
-    # neither — NaN passes through (no-op short-circuit)
-    keep, _ = filter_af(geno_offset_idx, geno_offsets, geno_v_idxs, afs, None, None)
-    np.testing.assert_equal(keep, np.array([True, True, True]))
diff --git a/tests/unit/dataset/test_dataset_utils.py b/tests/unit/dataset/test_dataset_utils.py
index f12e95de..42afc805 100644
--- a/tests/unit/dataset/test_dataset_utils.py
+++ b/tests/unit/dataset/test_dataset_utils.py
@@ -10,7 +10,6 @@
     padded_slice,
     reduceat_offsets,
     regions_to_bed,
-    splits_sum_le_value,
 )
 
 
@@ -78,11 +77,6 @@ def test_padded_slice_left_and_right_pad():
     np.testing.assert_array_equal(res, np.array([-1, -1, 1, 2, 3, -1, -1]))
 
 
-def test_splits_sum_le_value_docstring_example():
-    out = splits_sum_le_value(np.array([5, 5, 11, 9, 2, 7]), 10)
-    np.testing.assert_array_equal(out, np.array([0, 2, 3, 4, 6]))
-
-
 def test_regions_to_bed_and_back_roundtrip():
     regions = np.array(
         [[0, 100, 200, 1], [1, 50, 150, -1]],
diff --git a/tests/unit/dataset/test_ffi_array.py b/tests/unit/dataset/test_ffi_array.py
new file mode 100644
index 00000000..26c0ef0a
--- /dev/null
+++ b/tests/unit/dataset/test_ffi_array.py
@@ -0,0 +1,28 @@
+"""_ffi_array boundary guard (Task 4)."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from genvarloader._dataset._utils import _ffi_array
+
+
+def test_passes_contiguous_correct_dtype():
+    arr = np.arange(10, dtype=np.int32)
+    out = _ffi_array(arr, np.int32, "geno_v_idxs")
+    assert out is arr  # zero-copy: same object
+
+
+def test_raises_on_non_contiguous():
+    base = np.zeros((10, 3), dtype=np.int32)
+    strided = base[:, 1]  # non-contiguous column view
+    assert not strided.flags["C_CONTIGUOUS"]
+    with pytest.raises(ValueError, match="geno_v_idxs"):
+        _ffi_array(strided, np.int32, "geno_v_idxs")
+
+
+def test_raises_on_wrong_dtype():
+    arr = np.arange(10, dtype=np.int64)
+    with pytest.raises(ValueError, match="itv_starts"):
+        _ffi_array(arr, np.int32, "itv_starts")
diff --git a/tests/unit/dataset/test_flat_variants_type.py b/tests/unit/dataset/test_flat_variants_type.py
index 19bb7c96..816087d3 100644
--- a/tests/unit/dataset/test_flat_variants_type.py
+++ b/tests/unit/dataset/test_flat_variants_type.py
@@ -273,7 +273,7 @@ def test_gather_rows_1d_vs_2d_dispatch():
     """
     from genvarloader._dataset._flat_variants import (
         _gather_rows,
-        _gather_v_idxs_ss,
+        _gather_v_idxs_ss_numba,
     )
 
     geno_v_idxs = np.array([10, 11, 20, 21, 22, 30], np.int32)
@@ -308,8 +308,8 @@ def test_gather_rows_1d_vs_2d_dispatch():
     np.testing.assert_array_equal(v_1d, v_2d, err_msg="1D and 2D v_idxs disagree")
     np.testing.assert_array_equal(off_1d, off_2d, err_msg="1D and 2D offsets disagree")
 
-    # Also test _gather_v_idxs_ss directly against the golden value
-    v_ss, off_ss = _gather_v_idxs_ss(
+    # Also test _gather_v_idxs_ss_numba directly against the golden value
+    v_ss, off_ss = _gather_v_idxs_ss_numba(
         geno_offset_idx, offsets_2d[0], offsets_2d[1], geno_v_idxs
     )
     np.testing.assert_array_equal(
diff --git a/tests/unit/dataset/test_intervals_dispatch.py b/tests/unit/dataset/test_intervals_dispatch.py
index e82f56fa..51097f3c 100644
--- a/tests/unit/dataset/test_intervals_dispatch.py
+++ b/tests/unit/dataset/test_intervals_dispatch.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pytest
 from genvarloader._dataset._intervals import intervals_to_tracks
 
 
@@ -23,9 +22,7 @@ def _known_case():
     )
 
 
-@pytest.mark.parametrize("backend", ["numba", "rust"])
-def test_wrapper_matches_known_result(backend, monkeypatch):
-    monkeypatch.setenv("GVL_BACKEND", backend)
+def test_wrapper_matches_known_result():
     (
         offset_idxs,
         starts,
@@ -47,9 +44,3 @@ def test_wrapper_matches_known_result(backend, monkeypatch):
         out_offsets,
     )
     np.testing.assert_array_equal(out, np.array([0, 2, 2, 0, 0], np.float32))
-
-
-def test_wrapper_is_registered():
-    from genvarloader import _dispatch
-
-    assert "intervals_to_tracks" in _dispatch.registered_names()
diff --git a/tests/unit/dataset/test_reconstruct_trailing_fill.py b/tests/unit/dataset/test_reconstruct_trailing_fill.py
new file mode 100644
index 00000000..ca457984
--- /dev/null
+++ b/tests/unit/dataset/test_reconstruct_trailing_fill.py
@@ -0,0 +1,31 @@
+"""Correctness of the trailing-fill clause when a deletion exhausts the contig.
+
+The overshoot sub-domain (ref_idx past contig end with output unfilled) was
+historically excluded from parity because numba and rust diverged AND both were
+wrong. Correct behavior: pad the entire unfilled tail (no reference left).
+"""
+
+import numpy as np
+
+from genvarloader._dataset._genotypes import reconstruct_haplotype_from_sparse
+
+
+def test_overshoot_pads_full_tail():
+    # ref=[1,2,3,4], deletion at pos 2 (ilen=-5) -> ref_idx advances to 8 (>4).
+    # out_len=8: [1,2] ref + [50] allele, then ref exhausted -> pad rest with 0.
+    out = np.full(8, 255, dtype=np.uint8)  # 0xFF sentinel: catches unwritten positions
+    reconstruct_haplotype_from_sparse(
+        np.array([0], dtype=np.int32),  # v_idxs
+        np.array([2], dtype=np.int32),  # v_starts
+        np.array([-5], dtype=np.int32),  # ilens
+        0,  # shift
+        np.array([50], dtype=np.uint8),  # alt_alleles
+        np.array([0, 1], dtype=np.int64),  # alt_offsets
+        np.array([1, 2, 3, 4], dtype=np.uint8),  # ref
+        0,  # ref_start
+        out,  # out
+        0,  # pad_char
+    )
+    np.testing.assert_array_equal(
+        out, np.array([1, 2, 50, 0, 0, 0, 0, 0], dtype=np.uint8)
+    )
diff --git a/tests/unit/dataset/test_ref_fetch_dispatch.py b/tests/unit/dataset/test_ref_fetch_dispatch.py
index 949861e8..74d25479 100644
--- a/tests/unit/dataset/test_ref_fetch_dispatch.py
+++ b/tests/unit/dataset/test_ref_fetch_dispatch.py
@@ -2,33 +2,11 @@
 from seqpro.rag import lengths_to_offsets
 
 from genvarloader._dataset._reference import (
-    _fetch_impl_ser,
-    _fetch_impl_par,
     _get_reference_ser,
     _get_reference_par,
 )
 
 
-def _run(kernel, c_idxs, starts, ends, reference, ref_offsets, pad_char):
-    out_offsets = lengths_to_offsets(ends - starts)
-    out = np.empty(int(out_offsets[-1]), np.uint8)
-    kernel(c_idxs, starts, ends, reference, ref_offsets, pad_char, out, out_offsets)
-    return out
-
-
-def test_serial_and_parallel_kernels_agree():
-    rng = np.random.default_rng(0)
-    reference = rng.integers(65, 85, size=500, dtype=np.uint8)  # ascii A..T
-    ref_offsets = np.array([0, 200, 500], dtype=np.int64)  # 2 contigs
-    c_idxs = np.array([0, 1, 0, 1], dtype=np.int64)
-    starts = np.array([-5, 10, 190, 0], dtype=np.int64)  # includes OOB left
-    ends = np.array([10, 30, 205, 300], dtype=np.int64)  # includes OOB right
-    pad = ord("N")
-    ser = _run(_fetch_impl_ser, c_idxs, starts, ends, reference, ref_offsets, pad)
-    par = _run(_fetch_impl_par, c_idxs, starts, ends, reference, ref_offsets, pad)
-    np.testing.assert_array_equal(ser, par)
-
-
 def test_get_reference_kernels_agree():
     rng = np.random.default_rng(1)
     reference = rng.integers(65, 85, size=500, dtype=np.uint8)
diff --git a/tests/unit/dataset/test_table_max_mem.py b/tests/unit/dataset/test_table_max_mem.py
index 112d42f5..3fb20f98 100644
--- a/tests/unit/dataset/test_table_max_mem.py
+++ b/tests/unit/dataset/test_table_max_mem.py
@@ -35,5 +35,7 @@ def test_write_track_table_succeeds_within_budget(tmp_path):
     t = _dense_table(1000)
     bed = pl.DataFrame({"chrom": ["chr1"], "chromStart": [0], "chromEnd": [10_000]})
     _write_track_table(tmp_path, bed, t, ["s0"], max_mem=1 << 20)
-    assert (tmp_path / "intervals.npy").exists()
+    assert (tmp_path / "starts.npy").exists()
+    assert (tmp_path / "ends.npy").exists()
+    assert (tmp_path / "values.npy").exists()
     assert (tmp_path / "offsets.npy").exists()
diff --git a/tests/unit/dataset/test_write.py b/tests/unit/dataset/test_write.py
new file mode 100644
index 00000000..f8166621
--- /dev/null
+++ b/tests/unit/dataset/test_write.py
@@ -0,0 +1,12 @@
+from pathlib import Path
+
+import polars as pl
+import pytest
+
+from genvarloader._dataset._write import _write_track
+
+
+def test_write_track_rejects_unsupported_type():
+    """Custom IntervalTrack types are unsupported now that the legacy path is gone."""
+    with pytest.raises(TypeError, match="BigWigs.*Table"):
+        _write_track(Path("/tmp/unused"), pl.DataFrame(), object(), None, 1)
diff --git a/tests/unit/dataset/test_write_atomic.py b/tests/unit/dataset/test_write_atomic.py
index 11eee170..eeef14bc 100644
--- a/tests/unit/dataset/test_write_atomic.py
+++ b/tests/unit/dataset/test_write_atomic.py
@@ -16,8 +16,8 @@ def test_metadata_has_format_version_field():
     assert m.format_version is None
 
 
-def test_dataset_format_version_is_1_0_0():
-    assert str(DATASET_FORMAT_VERSION) == "1.0.0"
+def test_dataset_format_version_is_2_0_0():
+    assert str(DATASET_FORMAT_VERSION) == "2.0.0"
 
 
 def test_write_stamps_format_version():
@@ -28,7 +28,7 @@ def test_write_stamps_format_version():
         format_version=DATASET_FORMAT_VERSION,
     ).model_dump_json()
     back = Metadata.model_validate_json(raw)
-    assert str(back.format_version) == "1.0.0"
+    assert str(back.format_version) == "2.0.0"
 
 
 def test_write_is_atomic_no_temp_left(phased_vcf_gvl):
@@ -87,7 +87,7 @@ def test_format_version_stamped_on_disk(synthetic_case, tmp_path):
     )
 
     meta = json.loads((dest / "metadata.json").read_text())
-    assert meta["format_version"] == "1.0.0"
+    assert meta["format_version"] == "2.0.0"
 
 
 def test_failure_leaves_no_partial_artifacts(synthetic_case, tmp_path):
diff --git a/tests/unit/test_bigwig_write_binding.py b/tests/unit/test_bigwig_write_binding.py
index 996ce413..ce20d0bc 100644
--- a/tests/unit/test_bigwig_write_binding.py
+++ b/tests/unit/test_bigwig_write_binding.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 
-from genvarloader._ragged import INTERVAL_DTYPE
 from genvarloader.genvarloader import bigwig_write_track
 
 
@@ -16,10 +15,15 @@ def test_bigwig_write_binding_roundtrip(tmp_path):
     out = tmp_path
     bigwig_write_track(paths, contigs, starts, ends, 1 << 30, str(out), False)
 
-    itvs = np.memmap(out / "intervals.npy", dtype=INTERVAL_DTYPE, mode="r")
+    starts_arr = np.memmap(out / "starts.npy", dtype=np.int32, mode="r")
+    ends_arr = np.memmap(out / "ends.npy", dtype=np.int32, mode="r")
+    values_arr = np.memmap(out / "values.npy", dtype=np.float32, mode="r")
     offsets = np.memmap(out / "offsets.npy", dtype=np.int64, mode="r")
     # 2 regions x 2 samples -> offsets length 5
     assert len(offsets) == 2 * 2 + 1
     assert offsets[0] == 0
-    assert offsets[-1] == len(itvs)
-    assert itvs.dtype == INTERVAL_DTYPE
+    assert offsets[-1] == len(starts_arr)
+    assert len(starts_arr) == len(ends_arr) == len(values_arr)
+    assert starts_arr.dtype == np.int32
+    assert ends_arr.dtype == np.int32
+    assert values_arr.dtype == np.float32
diff --git a/tests/unit/test_dispatch.py b/tests/unit/test_dispatch.py
deleted file mode 100644
index 882e148f..00000000
--- a/tests/unit/test_dispatch.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import pytest
-from genvarloader import _dispatch
-
-
-@pytest.fixture(autouse=True)
-def _clean_registry(monkeypatch):
-    # Isolate each test: fresh registry + no inherited GVL_BACKEND.
-    monkeypatch.setattr(_dispatch, "_REGISTRY", {})
-    monkeypatch.delenv("GVL_BACKEND", raising=False)
-    yield
-
-
-def _reg():
-    _dispatch.register("k", numba=lambda: "numba", rust=lambda: "rust", default="numba")
-
-
-def test_get_returns_default_backend():
-    _reg()
-    assert _dispatch.get("k")() == "numba"
-
-
-def test_get_respects_per_kernel_rust_default():
-    _dispatch.register("k", numba=lambda: "n", rust=lambda: "r", default="rust")
-    assert _dispatch.get("k")() == "r"
-
-
-def test_env_override_forces_all_kernels(monkeypatch):
-    _reg()
-    monkeypatch.setenv("GVL_BACKEND", "rust")
-    assert _dispatch.get("k")() == "rust"
-
-
-def test_backends_returns_both_regardless_of_default():
-    _reg()
-    numba_fn, rust_fn = _dispatch.backends("k")
-    assert numba_fn() == "numba" and rust_fn() == "rust"
-
-
-def test_unknown_name_raises_keyerror_listing_names():
-    _reg()
-    with pytest.raises(KeyError, match="k"):
-        _dispatch.get("missing")
-
-
-def test_invalid_env_backend_raises(monkeypatch):
-    _reg()
-    monkeypatch.setenv("GVL_BACKEND", "julia")
-    with pytest.raises(ValueError, match="GVL_BACKEND"):
-        _dispatch.get("k")
diff --git a/tests/unit/test_rc_alleles_ffi.py b/tests/unit/test_rc_alleles_ffi.py
new file mode 100644
index 00000000..73e7ddfc
--- /dev/null
+++ b/tests/unit/test_rc_alleles_ffi.py
@@ -0,0 +1,12 @@
+import numpy as np
+import genvarloader.genvarloader as _gvl  # compiled rust extension module
+
+
+def test_rc_alleles_ffi_inplace():
+    # 2 rows. row0 (masked): alleles "AC","G". row1 (unmasked): "TT".
+    data = np.frombuffer(b"ACGTT", np.uint8).copy()
+    seq_offsets = np.array([0, 2, 3, 5], np.int64)
+    var_offsets = np.array([0, 2, 3], np.int64)
+    to_rc_row = np.array([True, False], np.bool_)
+    _gvl.rc_alleles(data, seq_offsets, var_offsets, to_rc_row)
+    assert data.tobytes() == b"GTCTT"
diff --git a/tests/unit/test_threads.py b/tests/unit/test_threads.py
index 4a48f33a..f28350a9 100644
--- a/tests/unit/test_threads.py
+++ b/tests/unit/test_threads.py
@@ -1,7 +1,5 @@
 import os
 
-import numba
-
 import genvarloader._threads as th
 
 
@@ -20,21 +18,17 @@ def _constrain_detected_cpus(monkeypatch, n: int) -> None:
 
 def test_resolve_honors_env_override(monkeypatch):
     monkeypatch.setenv("GVL_NUM_THREADS", "7")
-    # env wins, clamped to >= 1 and <= numba hard max
-    monkeypatch.setattr(numba, "get_num_threads", lambda: 64)
     assert th._resolve_num_threads() == 7
 
 
-def test_resolve_env_clamped_to_numba_max(monkeypatch):
+def test_resolve_env_not_clamped(monkeypatch):
+    # New behavior: env is NOT clamped to any numba limit; user is responsible.
     monkeypatch.setenv("GVL_NUM_THREADS", "9999")
-    monkeypatch.setattr(numba, "get_num_threads", lambda: 64)
-    assert th._resolve_num_threads() == 64
+    assert th._resolve_num_threads() == 9999
 
 
 def test_resolve_uses_cgroup_affinity(monkeypatch):
     monkeypatch.delenv("GVL_NUM_THREADS", raising=False)
-    # host reports 208 logical CPUs, cgroup allows 52 -> min wins
-    monkeypatch.setattr(numba, "get_num_threads", lambda: 208)
     _constrain_detected_cpus(monkeypatch, 52)
     assert th._resolve_num_threads() == 52
 
@@ -42,13 +36,15 @@ def test_resolve_uses_cgroup_affinity(monkeypatch):
 def test_resolve_malformed_env_falls_back_to_affinity(monkeypatch):
     # a non-integer override must not break import; fall through to detection
     monkeypatch.setenv("GVL_NUM_THREADS", "auto")
-    monkeypatch.setattr(numba, "get_num_threads", lambda: 208)
     _constrain_detected_cpus(monkeypatch, 52)
     assert th._resolve_num_threads() == 52
 
 
 def test_should_parallelize_threshold(monkeypatch):
-    monkeypatch.setattr(numba, "get_num_threads", lambda: 4)
+    # Reset cached thread count so monkeypatch takes effect.
+    monkeypatch.setattr(th, "_NUM_THREADS", None)
+    monkeypatch.delenv("GVL_NUM_THREADS", raising=False)
+    _constrain_detected_cpus(monkeypatch, 4)
     thresh = 4 * th._MIN_BYTES_PER_THREAD
     assert th.should_parallelize(thresh - 1) is False
     assert th.should_parallelize(thresh) is True
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index b51dd18f..b0bfd560 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,7 +1,7 @@
 import numpy as np
 import polars as pl
 from genoray._utils import ContigNormalizer
-from genvarloader._dataset._utils import bed_to_regions, splits_sum_le_value
+from genvarloader._dataset._utils import bed_to_regions
 from genvarloader._utils import normalize_contig_name
 from pytest_cases import parametrize_with_cases
 
@@ -60,14 +60,6 @@ def test_bed_to_regions_no_strand_defaults_to_plus() -> None:
     np.testing.assert_array_equal(regions, np.array([[0, 100, 200, 1]], np.int32))
 
 
-def test_splits_sum_le_value():
-    max_size = 10
-    sizes = np.array([3, 5, 2, 4, 7, 5, 2], np.int32)
-    splits = splits_sum_le_value(sizes, max_size)
-    np.testing.assert_equal(splits, np.array([0, 3, 4, 5, 7], np.intp))
-    np.testing.assert_array_less(np.add.reduceat(sizes, splits[:-1]), max_size + 1)
-
-
 def contig_match():
     unnormed = "chr1"
     source = ["chr1", "chr2"]
diff --git a/tests/unit/test_write_annot_bigwig.py b/tests/unit/test_write_annot_bigwig.py
index 7158573d..4a5cce99 100644
--- a/tests/unit/test_write_annot_bigwig.py
+++ b/tests/unit/test_write_annot_bigwig.py
@@ -36,9 +36,7 @@ def test_write_annot_track_rust_byte_matches_legacy(tmp_path):
     # rust
     _write._write_annot_track_rust(rust_dir, regions, bw, max_mem=2**30)
 
-    assert (legacy_dir / "intervals.npy").read_bytes() == (
-        rust_dir / "intervals.npy"
-    ).read_bytes()
-    assert (legacy_dir / "offsets.npy").read_bytes() == (
-        rust_dir / "offsets.npy"
-    ).read_bytes()
+    for name in ("starts.npy", "ends.npy", "values.npy", "offsets.npy"):
+        assert (legacy_dir / name).read_bytes() == (rust_dir / name).read_bytes(), (
+            f"{name} bytes mismatch between legacy and rust writers"
+        )