From 45623c2993f4c119a01bd5ddad348d6f30171da7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Jul 2026 20:17:55 +0000 Subject: [PATCH] fix(onebrc lane B): dispatch SIMD width instead of hardcoding 32-byte stride MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scan hardcoded `array_chunks::` + `U8x32` throughout, pinning the delimiter walk to 32-byte `ymm` (AVX2) regardless of target-cpu — so under `target-cpu=x86-64-v4`/`native` it strided `ymm`, never the 64-byte `zmm` the AVX-512 build provides. (The probe's `.cargo/config.toml` v3 pin is a deliberate CI-parity choice; this is about honoring native/v4 when a run opts into it — "here v4 or native is a must".) - `SimdByte` = compile-time width alias: `U8x64` under `cfg(target_feature = "avx512f")`, `U8x32` otherwise. Both are `ndarray::simd` types (iron rule; no raw intrinsic). `cmpeq_mask` returns `u64`/`u32` respectively; the set-bit walk was already generic over the mask width, so the body is unchanged apart from the alias. - `array_chunks::` — the const-generic tracks the dispatched width; `aligned_end`, `pos`, needles, and `from_slice` all key off `SimdByte::LANES`. No literal stride remains. - Module + fn docs rewritten to describe the dispatch (64-byte zmm avx512 / 32-byte ymm avx2) instead of asserting a fixed 32. - Test `..._straddle_32_byte_block_boundaries` → `..._straddle_block_boundaries`, now asserts crossing at the dispatched `lane_b::SIMD_LANES` (test-gated const) instead of a literal `/ 32`; the 68-byte corpus straddles a boundary at BOTH widths (`long_name` @32, `Vv` @64), so coverage holds either way. Verified both arms: v3 default (U8x32, 32B) and `RUSTFLAGS=-Ctarget-cpu=native` (U8x64, 64B zmm on this avx512f host) — 16/16 lane-b tests byte-parity with lane A, clippy `-D warnings` clean (lib + all-targets) on both, fmt clean. README/FINDINGS narrative on the v3-pin correction is deferred to the parallel session's §5.5 to avoid clobbering its in-flight README edits. Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01MLBnPuScZy6w9di2QEjsXM --- crates/onebrc-probe/src/lane_b.rs | 94 +++++++++++++++++++++---------- crates/onebrc-probe/src/lib.rs | 37 ++++++------ 2 files changed, 84 insertions(+), 47 deletions(-) diff --git a/crates/onebrc-probe/src/lane_b.rs b/crates/onebrc-probe/src/lane_b.rs index 0e7679b2..8a5b4315 100644 --- a/crates/onebrc-probe/src/lane_b.rs +++ b/crates/onebrc-probe/src/lane_b.rs @@ -5,18 +5,23 @@ //! parse + accumulate logic. The workspace's SIMD iron rule is "all SIMD from //! `ndarray::simd`" (`simd-savant` agent, //! `.claude/knowledge/ndarray-vertical-simd-alien-magic.md`) — this module -//! uses `ndarray::simd::U8x32::cmpeq_mask` exclusively, never a raw +//! uses `ndarray::simd::{U8x32, U8x64}::cmpeq_mask` exclusively, never a raw //! `core::arch` intrinsic, `pulp`, `wide`, or `memchr`. //! -//! `U8x32` is the AVX2-native byte width (one `__m256i` = 32 bytes; see -//! `ndarray/src/simd_avx2.rs` module doc). The corpus is scanned in 32-byte -//! strides: for each block, `cmpeq_mask(U8x32::splat(b'\n'))` and -//! `cmpeq_mask(U8x32::splat(b';'))` produce 32-bit masks with bit `i` set -//! iff `block[i]` matches. The set bits of the combined mask (newline and -//! semicolon bytes never coincide, so `nl_mask | semi_mask` has no lost -//! information) are walked in ascending order via the classic -//! `mask & (mask - 1)` "clear lowest set bit" trick, recovering the ordered -//! sequence of delimiter events for the block. +//! **Width is compile-time-dispatched** (`SimdByte`): under AVX-512 +//! (`target-cpu=x86-64-v4` / `native`) the scan strides 64-byte `zmm` +//! blocks via `U8x64` (`cmpeq_mask -> u64`); under the AVX2 default +//! (`x86-64-v3`, the CI baseline) it strides 32-byte `ymm` blocks via +//! `U8x32` (`cmpeq_mask -> u32`; one `__m256i`, see +//! `ndarray/src/simd_avx2.rs`). For each block, +//! `cmpeq_mask(SimdByte::splat(b'\n'))` and `cmpeq_mask(SimdByte::splat(b';'))` +//! produce a `SimdByte::LANES`-bit mask with bit `i` set iff `block[i]` +//! matches. The set bits of the combined mask (newline and semicolon bytes +//! never coincide, so `nl_mask | semi_mask` has no lost information) are +//! walked in ascending order via the classic `mask & (mask - 1)` "clear +//! lowest set bit" trick, recovering the ordered sequence of delimiter +//! events for the block. The walk is generic over the `u32`/`u64` mask +//! width, so the same body serves both dispatched widths. //! //! **Parse remains scalar** (SWAR/branchless parse deliberately deferred — //! see `README.md` §1 "NOT reimplemented here"): `parse_temp_tenths` is the @@ -27,9 +32,9 @@ //! ## Cross-block record state //! //! A record's `;` and its `\n` are not guaranteed to land in the same -//! 32-byte block (short station names put the `;` near a block's end and -//! the `\n` in the next block, or vice versa). Two scalars carry state -//! across block boundaries: +//! block (short station names put the `;` near a block's end and the `\n` +//! in the next block, or vice versa). Two scalars carry state across block +//! boundaries: //! //! - `line_start: usize` — the byte offset where the current (in-progress) //! station name begins. @@ -37,17 +42,39 @@ //! has been seen but its `\n` has not yet arrived; `None` while still //! scanning for the `;`. //! -//! The tail (fewer than 32 bytes remaining after the last full block) is -//! finished with a plain byte-wise scalar loop — the same station-name / -//! temp-field extraction shape as `lane_a_scalar`, continuing from whatever -//! `line_start` / `pending_semi` state the SIMD pass left behind. +//! The tail (fewer than one full `SimdByte`-wide block remaining after the +//! last full block) is finished with a plain byte-wise scalar loop — the +//! same station-name / temp-field extraction shape as `lane_a_scalar`, +//! continuing from whatever `line_start` / `pending_semi` state the SIMD +//! pass left behind. use crate::{parse_temp_tenths, Stats}; -use ndarray::simd::{array_chunks, U8x32}; +use ndarray::simd::array_chunks; use std::collections::BTreeMap; -/// Lane B — SIMD delimiter scan. One pass over `data` in 32-byte strides -/// using `ndarray::simd::U8x32::cmpeq_mask` to locate `;` and `\n` bytes; +// Compile-time SIMD byte-width dispatch — both widths are `ndarray::simd` +// types (the iron rule; never a raw `core::arch` intrinsic). AVX-512 +// targets (`target-cpu=x86-64-v4` / `native`) scan in 64-byte `zmm` +// strides via `U8x64`; the AVX2 default (`x86-64-v3`, the CI baseline) +// scans in 32-byte `ymm` strides via `U8x32`. The stride, the needle +// width, and the `array_chunks` const-generic all key off +// `SimdByte::LANES`, so the same body strides the widest lane the target +// actually provides. `cmpeq_mask` returns a `u64` (avx512) / `u32` +// (avx2) mask; the ascending set-bit walk below is generic over both. +#[cfg(not(target_feature = "avx512f"))] +use ndarray::simd::U8x32 as SimdByte; +#[cfg(target_feature = "avx512f")] +use ndarray::simd::U8x64 as SimdByte; + +/// The SIMD block width lane B actually strides, in bytes — the dispatched +/// `SimdByte::LANES` (64 under avx512, 32 under avx2). Exposed so tests can +/// assert against the real block boundary instead of a hardcoded width. +#[cfg(test)] +pub(crate) const SIMD_LANES: usize = SimdByte::LANES; + +/// Lane B — SIMD delimiter scan. One pass over `data` in `SimdByte::LANES`-wide +/// strides (64-byte `zmm` under avx512, 32-byte `ymm` under avx2) using +/// `ndarray::simd::{U8x64, U8x32}::cmpeq_mask` to locate `;` and `\n` bytes; /// scalar integer temp parse (see module doc); `BTreeMap` /// accumulation identical in shape to `lane_a_scalar`. pub fn lane_b_simd(data: &[u8]) -> BTreeMap { @@ -60,18 +87,22 @@ pub fn lane_b_simd(data: &[u8]) -> BTreeMap { let mut line_start = 0usize; let mut pending_semi: Option = None; - let nl_needle = U8x32::splat(b'\n'); - let semi_needle = U8x32::splat(b';'); + let nl_needle = SimdByte::splat(b'\n'); + let semi_needle = SimdByte::splat(b';'); - // The non-overlapping 32-byte stride walk routes through + // The non-overlapping `SimdByte::LANES`-wide stride walk routes through // `ndarray::simd::array_chunks` (simd_ops.rs) — the W1a batch-walk // primitive; `array_windows` is its OVERLAPPING sibling (GEMM-style // row windows) and is deliberately NOT used here: delimiter scanning - // never re-reads bytes. - let aligned_end = (len / U8x32::LANES) * U8x32::LANES; - for (chunk_idx, chunk) in array_chunks::(&data[..aligned_end]).enumerate() { - let pos = chunk_idx * U8x32::LANES; - let block = U8x32::from_slice(chunk); + // never re-reads bytes. The const-generic is `{ SimdByte::LANES }`, so + // the chunk width tracks the dispatched lane width (64 under avx512, + // 32 under avx2) with no hardcoded stride. + let aligned_end = (len / SimdByte::LANES) * SimdByte::LANES; + for (chunk_idx, chunk) in + array_chunks::(&data[..aligned_end]).enumerate() + { + let pos = chunk_idx * SimdByte::LANES; + let block = SimdByte::from_slice(chunk); let nl_mask = block.cmpeq_mask(nl_needle); let semi_mask = block.cmpeq_mask(semi_needle); // `;` and `\n` never occupy the same byte, so OR-ing loses no @@ -105,9 +136,10 @@ pub fn lane_b_simd(data: &[u8]) -> BTreeMap { } } - // Tail — fewer than 32 bytes remain. Finish with a plain scalar scan, - // continuing from whatever `line_start` / `pending_semi` state the SIMD - // pass left behind (mirrors `lane_a_scalar`'s per-record shape). + // Tail — fewer than `SimdByte::LANES` bytes remain. Finish with a plain + // scalar scan, continuing from whatever `line_start` / `pending_semi` + // state the SIMD pass left behind (mirrors `lane_a_scalar`'s per-record + // shape). let mut i = aligned_end; while i < len { match pending_semi { diff --git a/crates/onebrc-probe/src/lib.rs b/crates/onebrc-probe/src/lib.rs index 0c687146..9a41204d 100644 --- a/crates/onebrc-probe/src/lib.rs +++ b/crates/onebrc-probe/src/lib.rs @@ -404,31 +404,36 @@ mod tests { } /// Hand-built corpus, crafted so at least one record's `;`/`\n` land in - /// DIFFERENT 32-byte SIMD blocks (block0=[0,32), block1=[32,64), - /// tail=[64,..)) — exercises the cross-iteration `line_start` / - /// `pending_semi` carry in `lane_b_simd`, not just the common in-block - /// case a random generated corpus would mostly hit. + /// DIFFERENT SIMD blocks at BOTH dispatched widths — the 68-byte corpus + /// has a record straddling a 32-byte boundary (`long_name`, `;`@29 / + /// `\n`@33) AND one straddling a 64-byte boundary (`Vv`, `;`@63 / + /// `\n`@67), so the cross-iteration `line_start` / `pending_semi` carry + /// in `lane_b_simd` is exercised whether the build strides 32-byte `ymm` + /// (avx2) or 64-byte `zmm` (avx512), not just the common in-block case a + /// random generated corpus would mostly hit. #[cfg(feature = "lane-b")] #[test] - fn lane_b_handles_records_that_straddle_32_byte_block_boundaries() { + fn lane_b_handles_records_that_straddle_block_boundaries() { + let lanes = crate::lane_b::SIMD_LANES; // dispatched block width (32 or 64) let long_name = "N".repeat(22); let mut corpus = String::new(); corpus.push_str("Ab;1.0\n"); // fully inside block0 - corpus.push_str(&format!("{long_name};9.9\n")); // straddles block0/block1 - corpus.push_str("Zz;3.3\n"); // fully inside block1 - corpus.push_str("QqRrSsTt;2.2\n"); // fully inside block1 - corpus.push_str("Uu;4.4\n"); // fully inside block1 - corpus.push_str("Vv;5.5\n"); // straddles block1/tail + corpus.push_str(&format!("{long_name};9.9\n")); // straddles a 32-byte boundary + corpus.push_str("Zz;3.3\n"); + corpus.push_str("QqRrSsTt;2.2\n"); + corpus.push_str("Uu;4.4\n"); + corpus.push_str("Vv;5.5\n"); // straddles the 64-byte boundary (block0/tail) let data = corpus.as_bytes(); assert!( - data.len() > 64, - "test corpus must span block0, block1, AND a tail region" + data.len() > lanes, + "test corpus must span at least one full SIMD block AND a tail region" ); // Confirm (rather than assume) that at least one record's `;` and - // `\n` land in different 32-byte blocks — otherwise this test - // would silently degrade into testing only the non-crossing case. + // `\n` land in different SIMD blocks AT THE DISPATCHED WIDTH — + // otherwise this test would silently degrade into testing only the + // non-crossing case. let find_all = |needle: u8| -> Vec { data.iter() .enumerate() @@ -442,10 +447,10 @@ mod tests { let crosses_a_block = semis .iter() .zip(newlines.iter()) - .any(|(&s, &n)| s / 32 != n / 32); + .any(|(&s, &n)| s / lanes != n / lanes); assert!( crosses_a_block, - "test corpus must contain a record whose `;`/`\\n` land in different 32-byte blocks" + "test corpus must contain a record whose `;`/`\\n` land in different {lanes}-byte blocks" ); let a = lane_a_scalar(data);