AdaWorldAPI · AdaWorldAPI · Jul 2, 2026 · Jul 2, 2026 · chatgpt-codex-connector · Jul 2, 2026
diff --git a/crates/onebrc-probe/src/lane_b.rs b/crates/onebrc-probe/src/lane_b.rs
@@ -5,18 +5,23 @@
 //! parse + accumulate logic. The workspace's SIMD iron rule is "all SIMD from
 //! `ndarray::simd`" (`simd-savant` agent,
 //! `.claude/knowledge/ndarray-vertical-simd-alien-magic.md`) — this module
-//! uses `ndarray::simd::U8x32::cmpeq_mask` exclusively, never a raw
+//! uses `ndarray::simd::{U8x32, U8x64}::cmpeq_mask` exclusively, never a raw
 //! `core::arch` intrinsic, `pulp`, `wide`, or `memchr`.
 //!
-//! `U8x32` is the AVX2-native byte width (one `__m256i` = 32 bytes; see
-//! `ndarray/src/simd_avx2.rs` module doc). The corpus is scanned in 32-byte
-//! strides: for each block, `cmpeq_mask(U8x32::splat(b'\n'))` and
-//! `cmpeq_mask(U8x32::splat(b';'))` produce 32-bit masks with bit `i` set
-//! iff `block[i]` matches. The set bits of the combined mask (newline and
-//! semicolon bytes never coincide, so `nl_mask | semi_mask` has no lost
-//! information) are walked in ascending order via the classic
-//! `mask & (mask - 1)` "clear lowest set bit" trick, recovering the ordered
-//! sequence of delimiter events for the block.
+//! **Width is compile-time-dispatched** (`SimdByte`): under AVX-512
+//! (`target-cpu=x86-64-v4` / `native`) the scan strides 64-byte `zmm`
+//! blocks via `U8x64` (`cmpeq_mask -> u64`); under the AVX2 default
+//! (`x86-64-v3`, the CI baseline) it strides 32-byte `ymm` blocks via
+//! `U8x32` (`cmpeq_mask -> u32`; one `__m256i`, see
+//! `ndarray/src/simd_avx2.rs`). For each block,
+//! `cmpeq_mask(SimdByte::splat(b'\n'))` and `cmpeq_mask(SimdByte::splat(b';'))`
+//! produce a `SimdByte::LANES`-bit mask with bit `i` set iff `block[i]`
+//! matches. The set bits of the combined mask (newline and semicolon bytes
+//! never coincide, so `nl_mask | semi_mask` has no lost information) are
+//! walked in ascending order via the classic `mask & (mask - 1)` "clear
+//! lowest set bit" trick, recovering the ordered sequence of delimiter
+//! events for the block. The walk is generic over the `u32`/`u64` mask
+//! width, so the same body serves both dispatched widths.
 //!
 //! **Parse remains scalar** (SWAR/branchless parse deliberately deferred —
 //! see `README.md` §1 "NOT reimplemented here"): `parse_temp_tenths` is the
@@ -27,27 +32,49 @@
 //! ## Cross-block record state
 //!
 //! A record's `;` and its `\n` are not guaranteed to land in the same
-//! 32-byte block (short station names put the `;` near a block's end and
-//! the `\n` in the next block, or vice versa). Two scalars carry state
-//! across block boundaries:
+//! block (short station names put the `;` near a block's end and the `\n`
+//! in the next block, or vice versa). Two scalars carry state across block
+//! boundaries:
 //!
 //! - `line_start: usize` — the byte offset where the current (in-progress)
 //!   station name begins.
 //! - `pending_semi: Option<usize>` — `Some(offset)` once this record's `;`
 //!   has been seen but its `\n` has not yet arrived; `None` while still
 //!   scanning for the `;`.
 //!
-//! The tail (fewer than 32 bytes remaining after the last full block) is
-//! finished with a plain byte-wise scalar loop — the same station-name /
-//! temp-field extraction shape as `lane_a_scalar`, continuing from whatever
-//! `line_start` / `pending_semi` state the SIMD pass left behind.
+//! The tail (fewer than one full `SimdByte`-wide block remaining after the
+//! last full block) is finished with a plain byte-wise scalar loop — the
+//! same station-name / temp-field extraction shape as `lane_a_scalar`,
+//! continuing from whatever `line_start` / `pending_semi` state the SIMD
+//! pass left behind.
 
 use crate::{parse_temp_tenths, Stats};
-use ndarray::simd::{array_chunks, U8x32};
+use ndarray::simd::array_chunks;
 use std::collections::BTreeMap;
 
-/// Lane B — SIMD delimiter scan. One pass over `data` in 32-byte strides
-/// using `ndarray::simd::U8x32::cmpeq_mask` to locate `;` and `\n` bytes;
+// Compile-time SIMD byte-width dispatch — both widths are `ndarray::simd`
+// types (the iron rule; never a raw `core::arch` intrinsic). AVX-512
+// targets (`target-cpu=x86-64-v4` / `native`) scan in 64-byte `zmm`
+// strides via `U8x64`; the AVX2 default (`x86-64-v3`, the CI baseline)
+// scans in 32-byte `ymm` strides via `U8x32`. The stride, the needle
+// width, and the `array_chunks` const-generic all key off
+// `SimdByte::LANES`, so the same body strides the widest lane the target
+// actually provides. `cmpeq_mask` returns a `u64` (avx512) / `u32`
+// (avx2) mask; the ascending set-bit walk below is generic over both.
+#[cfg(not(target_feature = "avx512f"))]
+use ndarray::simd::U8x32 as SimdByte;
+#[cfg(target_feature = "avx512f")]
+use ndarray::simd::U8x64 as SimdByte;
+
+/// The SIMD block width lane B actually strides, in bytes — the dispatched
+/// `SimdByte::LANES` (64 under avx512, 32 under avx2). Exposed so tests can
+/// assert against the real block boundary instead of a hardcoded width.
+#[cfg(test)]
+pub(crate) const SIMD_LANES: usize = SimdByte::LANES;
+
+/// Lane B — SIMD delimiter scan. One pass over `data` in `SimdByte::LANES`-wide
+/// strides (64-byte `zmm` under avx512, 32-byte `ymm` under avx2) using
+/// `ndarray::simd::{U8x64, U8x32}::cmpeq_mask` to locate `;` and `\n` bytes;
 /// scalar integer temp parse (see module doc); `BTreeMap<String, Stats>`
 /// accumulation identical in shape to `lane_a_scalar`.
 pub fn lane_b_simd(data: &[u8]) -> BTreeMap<String, Stats> {
@@ -60,18 +87,22 @@ pub fn lane_b_simd(data: &[u8]) -> BTreeMap<String, Stats> {
     let mut line_start = 0usize;
     let mut pending_semi: Option<usize> = None;
 
-    let nl_needle = U8x32::splat(b'\n');
-    let semi_needle = U8x32::splat(b';');
+    let nl_needle = SimdByte::splat(b'\n');
+    let semi_needle = SimdByte::splat(b';');
 
-    // The non-overlapping 32-byte stride walk routes through
+    // The non-overlapping `SimdByte::LANES`-wide stride walk routes through
     // `ndarray::simd::array_chunks` (simd_ops.rs) — the W1a batch-walk
     // primitive; `array_windows` is its OVERLAPPING sibling (GEMM-style
     // row windows) and is deliberately NOT used here: delimiter scanning
-    // never re-reads bytes.
-    let aligned_end = (len / U8x32::LANES) * U8x32::LANES;
-    for (chunk_idx, chunk) in array_chunks::<u8, 32>(&data[..aligned_end]).enumerate() {
-        let pos = chunk_idx * U8x32::LANES;
-        let block = U8x32::from_slice(chunk);
+    // never re-reads bytes. The const-generic is `{ SimdByte::LANES }`, so
+    // the chunk width tracks the dispatched lane width (64 under avx512,
+    // 32 under avx2) with no hardcoded stride.
+    let aligned_end = (len / SimdByte::LANES) * SimdByte::LANES;
+    for (chunk_idx, chunk) in
+        array_chunks::<u8, { SimdByte::LANES }>(&data[..aligned_end]).enumerate()
+    {
+        let pos = chunk_idx * SimdByte::LANES;
+        let block = SimdByte::from_slice(chunk);
         let nl_mask = block.cmpeq_mask(nl_needle);
         let semi_mask = block.cmpeq_mask(semi_needle);
         // `;` and `\n` never occupy the same byte, so OR-ing loses no
@@ -105,9 +136,10 @@ pub fn lane_b_simd(data: &[u8]) -> BTreeMap<String, Stats> {
         }
     }
 
-    // Tail — fewer than 32 bytes remain. Finish with a plain scalar scan,
-    // continuing from whatever `line_start` / `pending_semi` state the SIMD
-    // pass left behind (mirrors `lane_a_scalar`'s per-record shape).
+    // Tail — fewer than `SimdByte::LANES` bytes remain. Finish with a plain
+    // scalar scan, continuing from whatever `line_start` / `pending_semi`
+    // state the SIMD pass left behind (mirrors `lane_a_scalar`'s per-record
+    // shape).
     let mut i = aligned_end;
     while i < len {
         match pending_semi {

diff --git a/crates/onebrc-probe/src/lib.rs b/crates/onebrc-probe/src/lib.rs
@@ -404,31 +404,36 @@ mod tests {
     }
 
     /// Hand-built corpus, crafted so at least one record's `;`/`\n` land in
-    /// DIFFERENT 32-byte SIMD blocks (block0=[0,32), block1=[32,64),
-    /// tail=[64,..)) — exercises the cross-iteration `line_start` /
-    /// `pending_semi` carry in `lane_b_simd`, not just the common in-block
-    /// case a random generated corpus would mostly hit.
+    /// DIFFERENT SIMD blocks at BOTH dispatched widths — the 68-byte corpus
+    /// has a record straddling a 32-byte boundary (`long_name`, `;`@29 /
+    /// `\n`@33) AND one straddling a 64-byte boundary (`Vv`, `;`@63 /
+    /// `\n`@67), so the cross-iteration `line_start` / `pending_semi` carry
+    /// in `lane_b_simd` is exercised whether the build strides 32-byte `ymm`
+    /// (avx2) or 64-byte `zmm` (avx512), not just the common in-block case a
+    /// random generated corpus would mostly hit.
     #[cfg(feature = "lane-b")]
     #[test]
-    fn lane_b_handles_records_that_straddle_32_byte_block_boundaries() {
+    fn lane_b_handles_records_that_straddle_block_boundaries() {
+        let lanes = crate::lane_b::SIMD_LANES; // dispatched block width (32 or 64)
         let long_name = "N".repeat(22);
         let mut corpus = String::new();
         corpus.push_str("Ab;1.0\n"); // fully inside block0
-        corpus.push_str(&format!("{long_name};9.9\n")); // straddles block0/block1
-        corpus.push_str("Zz;3.3\n"); // fully inside block1
-        corpus.push_str("QqRrSsTt;2.2\n"); // fully inside block1
-        corpus.push_str("Uu;4.4\n"); // fully inside block1
-        corpus.push_str("Vv;5.5\n"); // straddles block1/tail
+        corpus.push_str(&format!("{long_name};9.9\n")); // straddles a 32-byte boundary
+        corpus.push_str("Zz;3.3\n");
+        corpus.push_str("QqRrSsTt;2.2\n");
+        corpus.push_str("Uu;4.4\n");
+        corpus.push_str("Vv;5.5\n"); // straddles the 64-byte boundary (block0/tail)
 
         let data = corpus.as_bytes();
         assert!(
-            data.len() > 64,
-            "test corpus must span block0, block1, AND a tail region"
+            data.len() > lanes,
+            "test corpus must span at least one full SIMD block AND a tail region"
         );
 
         // Confirm (rather than assume) that at least one record's `;` and
-        // `\n` land in different 32-byte blocks — otherwise this test
-        // would silently degrade into testing only the non-crossing case.
+        // `\n` land in different SIMD blocks AT THE DISPATCHED WIDTH —
+        // otherwise this test would silently degrade into testing only the
+        // non-crossing case.
         let find_all = |needle: u8| -> Vec<usize> {
             data.iter()
                 .enumerate()
@@ -442,10 +447,10 @@ mod tests {
         let crosses_a_block = semis
             .iter()
             .zip(newlines.iter())
-            .any(|(&s, &n)| s / 32 != n / 32);
+            .any(|(&s, &n)| s / lanes != n / lanes);
         assert!(
             crosses_a_block,
-            "test corpus must contain a record whose `;`/`\\n` land in different 32-byte blocks"
+            "test corpus must contain a record whose `;`/`\\n` land in different {lanes}-byte blocks"
         );
 
         let a = lane_a_scalar(data);