From c6fce9a7a771db8532245052a3471b1575d0c3c0 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 25 Jun 2026 16:28:57 -0400
Subject: [PATCH 1/3] Add basic operator cuda testing

---
 .../boost/safe_numbers/detail/float_basis.hpp | 18 +++-
 test/cuda_jamfile                             | 28 ++++++
 test/test_cuda_f32_add.cu                     | 82 ++++++++++++++++++
 test/test_cuda_f32_add_error.cu               | 62 ++++++++++++++
 test/test_cuda_f32_div.cu                     | 85 +++++++++++++++++++
 test/test_cuda_f32_div_error.cu               | 63 ++++++++++++++
 test/test_cuda_f32_mod.cu                     | 85 +++++++++++++++++++
 test/test_cuda_f32_mod_error.cu               | 63 ++++++++++++++
 test/test_cuda_f32_mul.cu                     | 84 ++++++++++++++++++
 test/test_cuda_f32_mul_error.cu               | 64 ++++++++++++++
 test/test_cuda_f32_sub.cu                     | 84 ++++++++++++++++++
 test/test_cuda_f32_sub_error.cu               | 64 ++++++++++++++
 test/test_cuda_f64_add.cu                     | 82 ++++++++++++++++++
 test/test_cuda_f64_add_error.cu               | 62 ++++++++++++++
 test/test_cuda_f64_div.cu                     | 85 +++++++++++++++++++
 test/test_cuda_f64_div_error.cu               | 63 ++++++++++++++
 test/test_cuda_f64_mod.cu                     | 85 +++++++++++++++++++
 test/test_cuda_f64_mod_error.cu               | 63 ++++++++++++++
 test/test_cuda_f64_mul.cu                     | 84 ++++++++++++++++++
 test/test_cuda_f64_mul_error.cu               | 64 ++++++++++++++
 test/test_cuda_f64_sub.cu                     | 84 ++++++++++++++++++
 test/test_cuda_f64_sub_error.cu               | 64 ++++++++++++++
 22 files changed, 1517 insertions(+), 1 deletion(-)
 create mode 100644 test/test_cuda_f32_add.cu
 create mode 100644 test/test_cuda_f32_add_error.cu
 create mode 100644 test/test_cuda_f32_div.cu
 create mode 100644 test/test_cuda_f32_div_error.cu
 create mode 100644 test/test_cuda_f32_mod.cu
 create mode 100644 test/test_cuda_f32_mod_error.cu
 create mode 100644 test/test_cuda_f32_mul.cu
 create mode 100644 test/test_cuda_f32_mul_error.cu
 create mode 100644 test/test_cuda_f32_sub.cu
 create mode 100644 test/test_cuda_f32_sub_error.cu
 create mode 100644 test/test_cuda_f64_add.cu
 create mode 100644 test/test_cuda_f64_add_error.cu
 create mode 100644 test/test_cuda_f64_div.cu
 create mode 100644 test/test_cuda_f64_div_error.cu
 create mode 100644 test/test_cuda_f64_mod.cu
 create mode 100644 test/test_cuda_f64_mod_error.cu
 create mode 100644 test/test_cuda_f64_mul.cu
 create mode 100644 test/test_cuda_f64_mul_error.cu
 create mode 100644 test/test_cuda_f64_sub.cu
 create mode 100644 test/test_cuda_f64_sub_error.cu
diff --git a/include/boost/safe_numbers/detail/float_basis.hpp b/include/boost/safe_numbers/detail/float_basis.hpp
index ddfb48c..3a639ee 100644
--- a/include/boost/safe_numbers/detail/float_basis.hpp
+++ b/include/boost/safe_numbers/detail/float_basis.hpp
@@ -23,10 +23,24 @@
 #include <cstdlib>
 #include <utility>
 #include <optional>
+
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+#include <cuda/std/cmath>
+#else
 #include <cmath>
+#endif
 
 #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE
 
+// Selects the namespace that provides the <cmath> functions so device builds use
+// the libcu++ implementation. Mirrors the switch in cmath.hpp, which is not
+// visible here because that header undefines the macro before this one is reused.
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+#  define BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS cuda::std
+#else
+#  define BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS std
+#endif
+
 namespace boost::safe_numbers::detail {
 
 template <compatible_float_type BasisType>
@@ -1748,7 +1762,7 @@ namespace impl {
 template <compatible_float_type T>
 BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_modulo(const T lhs, const T rhs, T& res) -> error_category
 {
-    res = std::fmod(lhs, rhs);
+    res = BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS::fmod(lhs, rhs);
 
     // The hot path is that our modulo has nothing funny happening
     if (!constexpr_isinf(res) && !constexpr_isnan(res)) [[likely]]
@@ -2008,4 +2022,6 @@ BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("modulo", operator%)
 
 #undef BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP
 
+#undef BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS
+
 #endif // BOOST_SAFE_NUMBERS_DETAIL_FLOAT_BASIS_HPP
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 3678fe8..7d2af82 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -344,6 +344,34 @@ run test_cuda_i128_mul.cu ;
 run test_cuda_i128_div.cu ;
 run test_cuda_i128_mod.cu ;
 
+# f32 tests
+run test_cuda_f32_add.cu ;
+run test_cuda_f32_sub.cu ;
+run test_cuda_f32_mul.cu ;
+run test_cuda_f32_div.cu ;
+run test_cuda_f32_mod.cu ;
+
+# f64 tests
+run test_cuda_f64_add.cu ;
+run test_cuda_f64_sub.cu ;
+run test_cuda_f64_mul.cu ;
+run test_cuda_f64_div.cu ;
+run test_cuda_f64_mod.cu ;
+
+# f32 error tests
+run test_cuda_f32_add_error.cu ;
+run test_cuda_f32_sub_error.cu ;
+run test_cuda_f32_mul_error.cu ;
+run test_cuda_f32_div_error.cu ;
+run test_cuda_f32_mod_error.cu ;
+
+# f64 error tests
+run test_cuda_f64_add_error.cu ;
+run test_cuda_f64_sub_error.cu ;
+run test_cuda_f64_mul_error.cu ;
+run test_cuda_f64_div_error.cu ;
+run test_cuda_f64_mod_error.cu ;
+
 # Examples
 run ../examples/cuda.cu ;
 run ../examples/cuda_error_handling.cu ;
diff --git a/test/test_cuda_f32_add.cu b/test/test_cuda_f32_add.cu
new file mode 100644
index 0000000..c171564
--- /dev/null
+++ b/test/test_cuda_f32_add.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements) // kernel: out[i] = in[i] + in[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] + in[i]; // test_type's operator+ (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // filled on the host below, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Inputs span [-1e18, 1e18), so x + x (i.e. 2 * x) stays finite, well within the float range
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e18), static_cast<basis_type>(1e18)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] + input_vector[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f32_add_error.cu b/test/test_cuda_f32_add_error.cu
new file mode 100644
index 0000000..d8d2ecf
--- /dev/null
+++ b/test/test_cuda_f32_add_error.cu
@@ -0,0 +1,62 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements) // kernel: out[i] = in[i] + in[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] + in[i]; // test_type's operator+ (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: the test passes only if synchronize() throws std::overflow_error
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // filled on the host below, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // kernel destination; never inspected by this test
+
+    // Every element is max(), so max + max saturates to +infinity -> overflow
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{(std::numeric_limits<basis_type>::max)()}; // parenthesised so a function-like max macro cannot expand
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize(); // expected to throw; falling through to the next line is the failure case
+        std::cerr << "Expected exception from overflow but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::overflow_error& e) // only std::overflow_error (or a type derived from it) is handled here
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f32_div.cu b/test/test_cuda_f32_div.cu
new file mode 100644
index 0000000..ef5d8da
--- /dev/null
+++ b/test/test_cuda_f32_div.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] / in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] / in2[i]; // test_type's operator/ (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // numerators: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // divisors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Numerators span [-1e15, 1e15); divisors span [1, 1e15), i.e. bounded away from zero, so x / y stays finite
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    std::uniform_real_distribution<basis_type> dist2{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist2(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] / input_vector2[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f32_div_error.cu b/test/test_cuda_f32_div_error.cu
new file mode 100644
index 0000000..b761742
--- /dev/null
+++ b/test/test_cuda_f32_div_error.cu
@@ -0,0 +1,63 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] / in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] / in2[i]; // test_type's operator/ (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: the test passes only if synchronize() throws std::domain_error
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // numerators: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // divisors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // kernel destination; never inspected by this test
+
+    // Every element computes 1 / 0 (finite non-zero / zero) -> division by zero
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(1)};
+        input_vector2[i] = test_type{static_cast<basis_type>(0)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize(); // expected to throw; falling through to the next line is the failure case
+        std::cerr << "Expected exception from division by zero but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::domain_error& e) // only std::domain_error (or a type derived from it) is handled here
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f32_mod.cu b/test/test_cuda_f32_mod.cu
new file mode 100644
index 0000000..0f258e6
--- /dev/null
+++ b/test/test_cuda_f32_mod.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] % in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] % in2[i]; // test_type's operator% (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // dividends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // divisors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Dividends span [-1e15, 1e15); divisors span [1, 1e15), i.e. bounded away from zero, so fmod is always well defined
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    std::uniform_real_distribution<basis_type> dist2{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist2(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] % input_vector2[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f32_mod_error.cu b/test/test_cuda_f32_mod_error.cu
new file mode 100644
index 0000000..48a6c8e
--- /dev/null
+++ b/test/test_cuda_f32_mod_error.cu
@@ -0,0 +1,63 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] % in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] % in2[i]; // test_type's operator% (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: the test passes only if synchronize() throws std::domain_error
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // dividends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // divisors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // kernel destination; never inspected by this test
+
+    // Every element computes 1 % 0 (finite non-zero modulo zero) -> modulo by zero
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(1)};
+        input_vector2[i] = test_type{static_cast<basis_type>(0)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize(); // expected to throw; falling through to the next line is the failure case
+        std::cerr << "Expected exception from modulo by zero but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::domain_error& e) // only std::domain_error (or a type derived from it) is handled here
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f32_mul.cu b/test/test_cuda_f32_mul.cu
new file mode 100644
index 0000000..99a4ef5
--- /dev/null
+++ b/test/test_cuda_f32_mul.cu
@@ -0,0 +1,84 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] * in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] * in2[i]; // test_type's operator* (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // left factors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // right factors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Both factors span [-1e15, 1e15), so x * y has magnitude at most 1e30, well within the float range
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] * input_vector2[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f32_mul_error.cu b/test/test_cuda_f32_mul_error.cu
new file mode 100644
index 0000000..0db00bd
--- /dev/null
+++ b/test/test_cuda_f32_mul_error.cu
@@ -0,0 +1,64 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] * in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] * in2[i]; // test_type's operator* (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: the test passes only if synchronize() throws std::overflow_error
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // left factors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // right factors: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // kernel destination; never inspected by this test
+
+    // Every element computes max() * 2, which saturates to +infinity -> overflow
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{(std::numeric_limits<basis_type>::max)()}; // parenthesised so a function-like max macro cannot expand
+        input_vector2[i] = test_type{static_cast<basis_type>(2)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize(); // expected to throw; falling through to the next line is the failure case
+        std::cerr << "Expected exception from overflow but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::overflow_error& e) // only std::overflow_error (or a type derived from it) is handled here
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f32_sub.cu b/test/test_cuda_f32_sub.cu
new file mode 100644
index 0000000..75d060e
--- /dev/null
+++ b/test/test_cuda_f32_sub.cu
@@ -0,0 +1,84 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] - in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] - in2[i]; // test_type's operator- (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // minuends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // subtrahends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Both operands span [-1e18, 1e18), so x - y stays finite, well within the float range
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e18), static_cast<basis_type>(1e18)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] - input_vector2[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f32_sub_error.cu b/test/test_cuda_f32_sub_error.cu
new file mode 100644
index 0000000..82293b2
--- /dev/null
+++ b/test/test_cuda_f32_sub_error.cu
@@ -0,0 +1,64 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) // kernel: out[i] = in[i] - in2[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] - in2[i]; // test_type's operator- (boost::safe_numbers::f32)
+    }
+}
+
+int main(void) // host driver: the test passes only if synchronize() throws std::underflow_error
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // minuends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> input_vector2(numElements); // subtrahends: filled on the host, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // kernel destination; never inspected by this test
+
+    // Every element computes lowest() - max(), which saturates to -infinity -> underflow
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{std::numeric_limits<basis_type>::lowest()};
+        input_vector2[i] = test_type{(std::numeric_limits<basis_type>::max)()}; // parenthesised so a function-like max macro cannot expand
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize(); // expected to throw; falling through to the next line is the failure case
+        std::cerr << "Expected exception from underflow but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::underflow_error& e) // only std::underflow_error (or a type derived from it) is handled here
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f64_add.cu b/test/test_cuda_f64_add.cu
new file mode 100644
index 0000000..516f2a1
--- /dev/null
+++ b/test/test_cuda_f64_add.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements) // kernel: out[i] = in[i] + in[i], one element per thread
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: block offset plus thread offset
+
+    if (i < numElements) // bounds guard: ignore any thread past the end of the arrays
+    {
+        out[i] = in[i] + in[i]; // test_type's operator+ (boost::safe_numbers::f64)
+    }
+}
+
+int main(void) // host driver: run the kernel, redo the work on the CPU, compare element by element
+{
+    std::mt19937_64 rng{42}; // fixed seed, so every run uses the same inputs
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // filled on the host below, read by the kernel through get()
+    cuda_managed_ptr<test_type> output_vector(numElements); // written by the kernel, read back on the host below
+
+    // Inputs span [-1e18, 1e18), so x + x (i.e. 2 * x) stays finite, well within the double range
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e18), static_cast<basis_type>(1e18)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division: enough blocks to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx; // NOTE(review): presumably collects errors raised on the device - confirm in cuda_error_reporting.hpp
+    watch w; // timer type from stopwatch.hpp
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and throws if the device reported an error - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-side reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the timer so that t covers only the host loop - confirm in stopwatch.hpp
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] + input_vector[i]); // same expression as the kernel, evaluated on the host
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device and host results must compare equal
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f64_add_error.cu b/test/test_cuda_f64_add_error.cu
new file mode 100644
index 0000000..197e598
--- /dev/null
+++ b/test/test_cuda_f64_add_error.cu
@@ -0,0 +1,62 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] + in[idx];
+}
+
+int main(void)
+{
+    const int count = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Every operand is the largest finite value: x + x saturates to +infinity -> overflow
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{(std::numeric_limits<basis_type>::max)()};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+
+    boost::safe_numbers::device_error_context ctx;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+
+    try
+    {
+        ctx.synchronize();   // the test passes only if this throws std::overflow_error
+    }
+    catch (const std::overflow_error& e)
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+
+    std::cerr << "Expected exception from overflow but none was thrown!" << std::endl;
+    return EXIT_FAILURE;
+}
diff --git a/test/test_cuda_f64_div.cu b/test/test_cuda_f64_div.cu
new file mode 100644
index 0000000..fafa358
--- /dev/null
+++ b/test/test_cuda_f64_div.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] / in2[idx];
+}
+
+int main(void)
+{
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Fixed seed keeps the operands reproducible. Divisors are drawn from
+    // [1, 1e15), i.e. bounded away from zero, so x / y stays finite.
+    std::mt19937_64 gen{42};
+    std::uniform_real_distribution<basis_type> dividend{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    std::uniform_real_distribution<basis_type> divisor{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{dividend(gen)};
+        input_vector2[idx] = test_type{divisor(gen)};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;   // round up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference results; storage is reserved before timer.reset().
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(input_vector[idx] / input_vector2[idx]);
+    }
+    const double hostSeconds = timer.elapsed();
+
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_f64_div_error.cu b/test/test_cuda_f64_div_error.cu
new file mode 100644
index 0000000..cebe472
--- /dev/null
+++ b/test/test_cuda_f64_div_error.cu
@@ -0,0 +1,63 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] / in2[idx];
+}
+
+int main(void)
+{
+    const int count = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Every quotient is 1 / 0: finite non-zero dividend over zero -> division by zero
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{static_cast<basis_type>(1)};
+        input_vector2[idx] = test_type{static_cast<basis_type>(0)};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+
+    boost::safe_numbers::device_error_context ctx;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+
+    try
+    {
+        ctx.synchronize();   // the test passes only if this throws std::domain_error
+    }
+    catch (const std::domain_error& e)
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+
+    std::cerr << "Expected exception from division by zero but none was thrown!" << std::endl;
+    return EXIT_FAILURE;
+}
diff --git a/test/test_cuda_f64_mod.cu b/test/test_cuda_f64_mod.cu
new file mode 100644
index 0000000..140d477
--- /dev/null
+++ b/test/test_cuda_f64_mod.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = in[i] % in2[i];
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Divisor is bounded away from zero (>= 1) so fmod is always well defined
+    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    std::uniform_real_distribution<basis_type> dist2{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist2(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] % input_vector2[i]);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_f64_mod_error.cu b/test/test_cuda_f64_mod_error.cu
new file mode 100644
index 0000000..2e22757
--- /dev/null
+++ b/test/test_cuda_f64_mod_error.cu
@@ -0,0 +1,63 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = in[i] % in2[i];
+    }
+}
+
+int main(void)
+{
+    int numElements = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // finite non-zero modulo zero -> modulo by zero
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(1)};
+        input_vector2[i] = test_type{static_cast<basis_type>(0)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+
+    boost::safe_numbers::device_error_context ctx;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+
+    try
+    {
+        ctx.synchronize();
+        std::cerr << "Expected exception from modulo by zero but none was thrown!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    catch (const std::domain_error& e)
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+}
diff --git a/test/test_cuda_f64_mul.cu b/test/test_cuda_f64_mul.cu
new file mode 100644
index 0000000..2e3fc2a
--- /dev/null
+++ b/test/test_cuda_f64_mul.cu
@@ -0,0 +1,84 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] * in2[idx];
+}
+
+int main(void)
+{
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Fixed seed keeps the operands reproducible. Both factors are drawn from
+    // [-1e15, 1e15), so |x * y| is at most 1e30, well within the double range.
+    std::mt19937_64 gen{42};
+    std::uniform_real_distribution<basis_type> factor{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{factor(gen)};
+        input_vector2[idx] = test_type{factor(gen)};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;   // round up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference results; storage is reserved before timer.reset().
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(input_vector[idx] * input_vector2[idx]);
+    }
+    const double hostSeconds = timer.elapsed();
+
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_f64_mul_error.cu b/test/test_cuda_f64_mul_error.cu
new file mode 100644
index 0000000..13f950b
--- /dev/null
+++ b/test/test_cuda_f64_mul_error.cu
@@ -0,0 +1,64 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] * in2[idx];
+}
+
+int main(void)
+{
+    const int count = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Every product is (largest finite value) * 2, which saturates to +infinity -> overflow
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{(std::numeric_limits<basis_type>::max)()};
+        input_vector2[idx] = test_type{static_cast<basis_type>(2)};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+
+    boost::safe_numbers::device_error_context ctx;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+
+    try
+    {
+        ctx.synchronize();   // the test passes only if this throws std::overflow_error
+    }
+    catch (const std::overflow_error& e)
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+
+    std::cerr << "Expected exception from overflow but none was thrown!" << std::endl;
+    return EXIT_FAILURE;
+}
diff --git a/test/test_cuda_f64_sub.cu b/test/test_cuda_f64_sub.cu
new file mode 100644
index 0000000..2f1430f
--- /dev/null
+++ b/test/test_cuda_f64_sub.cu
@@ -0,0 +1,84 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] - in2[idx];
+}
+
+int main(void)
+{
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Fixed seed keeps the operands reproducible. Both operands are drawn from
+    // [-1e18, 1e18), so x - y stays finite well within the double range.
+    std::mt19937_64 gen{42};
+    std::uniform_real_distribution<basis_type> operand{static_cast<basis_type>(-1e18), static_cast<basis_type>(1e18)};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{operand(gen)};
+        input_vector2[idx] = test_type{operand(gen)};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;   // round up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference results; storage is reserved before timer.reset().
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(input_vector[idx] - input_vector2[idx]);
+    }
+    const double hostSeconds = timer.elapsed();
+
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_f64_sub_error.cu b/test/test_cuda_f64_sub_error.cu
new file mode 100644
index 0000000..a40dd7f
--- /dev/null
+++ b/test/test_cuda_f64_sub_error.cu
@@ -0,0 +1,64 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <boost/safe_numbers/floats.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::f64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;   // flat global thread index
+    if (idx >= numElements)
+    {
+        return;   // this thread has no element to process
+    }
+    out[idx] = in[idx] - in2[idx];
+}
+
+int main(void)
+{
+    const int count = 1024;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Every difference is lowest - max, which saturates to -infinity -> underflow
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{std::numeric_limits<basis_type>::lowest()};
+        input_vector2[idx] = test_type{(std::numeric_limits<basis_type>::max)()};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+
+    boost::safe_numbers::device_error_context ctx;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), input_vector2.get(), output_vector.get(), count);
+
+    try
+    {
+        ctx.synchronize();   // the test passes only if this throws std::underflow_error
+    }
+    catch (const std::underflow_error& e)
+    {
+        std::cerr << "Caught expected error: " << e.what() << std::endl;
+        std::cerr << "Test PASSED\n";
+        return 0;
+    }
+
+    std::cerr << "Expected exception from underflow but none was thrown!" << std::endl;
+    return EXIT_FAILURE;
+}

From 4a8bf860060eecef55a5c14e9e532af420f6208c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 25 Jun 2026 17:10:26 -0400
Subject: [PATCH 2/3] Remove operator%

---
 include/boost/safe_numbers/bounded_floats.hpp |  27 --
 .../boost/safe_numbers/detail/float_basis.hpp | 290 +----------------
 test/Jamfile                                  |   2 -
 test/cuda_jamfile                             |   4 -
 test/test_bounded_float_modulo.cpp            |  71 ----
 test/test_cuda_f32_mod.cu                     |  85 -----
 test/test_cuda_f32_mod_error.cu               |  63 ----
 test/test_cuda_f64_mod.cu                     |  85 -----
 test/test_cuda_f64_mod_error.cu               |  63 ----
 test/test_float_mod.cpp                       | 305 ------------------
 10 files changed, 4 insertions(+), 991 deletions(-)
 delete mode 100644 test/test_bounded_float_modulo.cpp
 delete mode 100644 test/test_cuda_f32_mod.cu
 delete mode 100644 test/test_cuda_f32_mod_error.cu
 delete mode 100644 test/test_cuda_f64_mod.cu
 delete mode 100644 test/test_cuda_f64_mod_error.cu
 delete mode 100644 test/test_float_mod.cpp

diff --git a/include/boost/safe_numbers/bounded_floats.hpp b/include/boost/safe_numbers/bounded_floats.hpp
index 6401df5..0790f63 100644
--- a/include/boost/safe_numbers/bounded_floats.hpp
+++ b/include/boost/safe_numbers/bounded_floats.hpp
@@ -144,8 +144,6 @@ class bounded_float
     constexpr auto operator*=(bounded_float<Min, Max> rhs) -> bounded_float&;
 
     constexpr auto operator/=(bounded_float<Min, Max> rhs) -> bounded_float&;
-
-    constexpr auto operator%=(bounded_float<Min, Max> rhs) -> bounded_float&;
 };
 
 // ------------------------------
@@ -200,17 +198,6 @@ template <auto Min, auto Max>
     return bounded_float<Min, Max>{lhs_b / rhs_b};
 }
 
-template <auto Min, auto Max>
-[[nodiscard]] constexpr auto operator%(const bounded_float<Min, Max> lhs,
-                                       const bounded_float<Min, Max> rhs) -> bounded_float<Min, Max>
-{
-    using basis = typename bounded_float<Min, Max>::basis_type;
-    using underlying = detail::underlying_type_t<basis>;
-    const basis lhs_b {static_cast<underlying>(lhs)};
-    const basis rhs_b {static_cast<underlying>(rhs)};
-    return bounded_float<Min, Max>{lhs_b % rhs_b};
-}
-
 // ------------------------------
 // Compound assignment
 // ------------------------------
@@ -267,19 +254,6 @@ constexpr auto bounded_float<Min, Max>::operator/=(bounded_float<Min, Max> rhs)
     return *this;
 }
 
-template <auto Min, auto Max>
-    requires (detail::valid_float_bound<decltype(Min)> &&
-              detail::valid_float_bound<decltype(Max)> &&
-              std::is_same_v<decltype(Min), decltype(Max)> &&
-              detail::float_raw_value(Min) == detail::float_raw_value(Min) &&
-              detail::float_raw_value(Max) == detail::float_raw_value(Max) &&
-              detail::float_raw_value(Max) > detail::float_raw_value(Min))
-constexpr auto bounded_float<Min, Max>::operator%=(bounded_float<Min, Max> rhs) -> bounded_float&
-{
-    *this = *this % rhs;
-    return *this;
-}
-
 } // namespace boost::safe_numbers
 
 // Mixed-bounds blocking for bounded_float
@@ -308,7 +282,6 @@ BOOST_SAFE_NUMBERS_DEFINE_MIXED_BOUNDED_FLOAT_OP("addition", operator+)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_BOUNDED_FLOAT_OP("subtraction", operator-)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_BOUNDED_FLOAT_OP("multiplication", operator*)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_BOUNDED_FLOAT_OP("division", operator/)
-BOOST_SAFE_NUMBERS_DEFINE_MIXED_BOUNDED_FLOAT_OP("modulo", operator%)
 
 } // namespace boost::safe_numbers
 
diff --git a/include/boost/safe_numbers/detail/float_basis.hpp b/include/boost/safe_numbers/detail/float_basis.hpp
index 3a639ee..ed593bd 100644
--- a/include/boost/safe_numbers/detail/float_basis.hpp
+++ b/include/boost/safe_numbers/detail/float_basis.hpp
@@ -23,24 +23,10 @@
 #include <cstdlib>
 #include <utility>
 #include <optional>
-
-#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
-#include <cuda/std/cmath>
-#else
 #include <cmath>
-#endif
 
 #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE
 
-// Selects the namespace that provides the <cmath> functions so device builds use
-// the libcu++ implementation. Mirrors the switch in cmath.hpp, which is not
-// visible here because that header undefines the macro before this one is reused.
-#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
-#  define BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS cuda::std
-#else
-#  define BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS std
-#endif
-
 namespace boost::safe_numbers::detail {
 
 template <compatible_float_type BasisType>
@@ -333,45 +319,6 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto divbyzero_div_msg()
     }
 }
 
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto nan_mod_msg() noexcept -> const char*
-{
-    if constexpr (std::is_same_v<BasisType, float>)
-    {
-        return "Operation with NAN detected in f32 modulo";
-    }
-    else
-    {
-        return "Operation with NAN detected in f64 modulo";
-    }
-}
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto invalid_mod_msg() noexcept -> const char*
-{
-    if constexpr (std::is_same_v<BasisType, float>)
-    {
-        return "Invalid operation (IEEE 754-2008 section 7.2) detected in f32 modulo";
-    }
-    else
-    {
-        return "Invalid operation (IEEE 754-2008 section 7.2) detected in f64 modulo";
-    }
-}
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto modbyzero_mod_msg() noexcept -> const char*
-{
-    if constexpr (std::is_same_v<BasisType, float>)
-    {
-        return "Modulo by zero detected in f32 modulo";
-    }
-    else
-    {
-        return "Modulo by zero detected in f64 modulo";
-    }
-}
-
 // ------------------------------
 // Helper <cmath> functions
 // ------------------------------
@@ -686,7 +633,7 @@ namespace impl {
 //   4) Add infs of differing sign      -> Invalid Op
 //   5) Any operations with an SNAN     -> Invalid Op
 template <compatible_float_type T>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_addition(const T lhs, const T rhs, T& res) -> error_category
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto checked_float_addition(const T lhs, const T rhs, T& res) -> error_category
 {
     res = lhs + rhs;
 
@@ -934,7 +881,7 @@ namespace impl {
 
 // See comment above on checked_float_add
 template <compatible_float_type T>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_subtraction(const T lhs, const T rhs, T& res) -> error_category
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto checked_float_subtraction(const T lhs, const T rhs, T& res) -> error_category
 {
     res = lhs - rhs;
 
@@ -1188,7 +1135,7 @@ namespace impl {
 
 // See comment above on checked_float_addition
 template <compatible_float_type T>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_multiplication(const T lhs, const T rhs, T& res) -> error_category
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto checked_float_multiplication(const T lhs, const T rhs, T& res) -> error_category
 {
     res = lhs * rhs;
 
@@ -1446,7 +1393,7 @@ namespace impl {
 
 // See comment above on checked_float_addition
 template <compatible_float_type T>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_division(const T lhs, const T rhs, T& res) -> error_category
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto checked_float_division(const T lhs, const T rhs, T& res) -> error_category
 {
     res = lhs / rhs;
 
@@ -1738,232 +1685,6 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE
     return float_basis<BasisType>{res};
 }
 
-// ------------------------------
-// Modulo
-// ------------------------------
-
-namespace impl {
-
-// Our comparison to zero is fine
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wfloat-equal"
-#elif defined(__GNUC__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-
-
-// std::fmod produces either a finite value with |result| < |rhs| or NaN.
-// It cannot produce a true infinity, so overflow/underflow categories are not reachable.
-// IEEE 754 7.2.f: remainder is invalid when the divisor is zero or the dividend
-// is infinite (and neither operand is NaN). For consistency with operator/, we
-// peel off the finite_nonzero / 0 case as divide_by_zero.
-template <compatible_float_type T>
-BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] auto checked_float_modulo(const T lhs, const T rhs, T& res) -> error_category
-{
-    res = BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS::fmod(lhs, rhs);
-
-    // The hot path is that our modulo has nothing funny happening
-    if (!constexpr_isinf(res) && !constexpr_isnan(res)) [[likely]]
-    {
-        return error_category::no_error;
-    }
-
-    // Start with section 7.2 invalid ops
-    // 7.2.a: any general computation on a signaling NAN
-    if (constexpr_issignaling(lhs) || constexpr_issignaling(rhs))
-    {
-        return error_category::invalid_op;
-    }
-    // 7.2.f sub-case 1: zero modulo zero (matches operator/ treatment of 0/0).
-    // is_true_zero so a denormal flushed to zero by DAZ does not match here.
-    if (is_true_zero(lhs) && is_true_zero(rhs))
-    {
-        return error_category::invalid_op;
-    }
-    // Modulo by zero with a finite non-zero dividend. Strict IEEE 7.2.f
-    // classifies this as invalid_op, but we surface it separately to mirror
-    // operator/'s divide-by-zero behavior.
-    if (is_true_zero(rhs) && !constexpr_isinf(lhs) && !constexpr_isnan(lhs))
-    {
-        return error_category::divide_by_zero;
-    }
-    // 7.2.f sub-case 2: dividend is infinite, divisor is not NaN
-    if (constexpr_isinf(lhs) && !constexpr_isnan(rhs))
-    {
-        return error_category::invalid_op;
-    }
-
-    // Section 6.2: Operations with NAN yield NAN
-    if (constexpr_isnan(lhs) || constexpr_isnan(rhs))
-    {
-        return error_category::nan_op;
-    }
-
-    BOOST_SAFE_NUMBERS_UNREACHABLE; // LCOV_EXCL_LINE
-}
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#pragma GCC diagnostic pop
-#endif
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto throw_nan_mod() -> void
-{
-    #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-    if (std::is_constant_evaluated())
-    {
-        if constexpr (std::is_same_v<BasisType, float>)
-        {
-            throw std::domain_error("Operation with NAN detected in f32 modulo");
-        }
-        else
-        {
-            throw std::domain_error("Operation with NAN detected in f64 modulo");
-        }
-    }
-    else
-    #endif
-    {
-        BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, nan_mod_msg<BasisType>());
-    }
-}
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto throw_invalid_mod() -> void
-{
-    #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-    if (std::is_constant_evaluated())
-    {
-        if constexpr (std::is_same_v<BasisType, float>)
-        {
-            throw std::domain_error("Invalid operation (IEEE 754-2008 section 7.2) detected in f32 modulo");
-        }
-        else
-        {
-            throw std::domain_error("Invalid operation (IEEE 754-2008 section 7.2) detected in f64 modulo");
-        }
-    }
-    else
-    #endif
-    {
-        BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, invalid_mod_msg<BasisType>());
-    }
-}
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto throw_modbyzero_mod() -> void
-{
-    #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-    if (std::is_constant_evaluated())
-    {
-        if constexpr (std::is_same_v<BasisType, float>)
-        {
-            throw std::domain_error("Modulo by zero detected in f32 modulo");
-        }
-        else
-        {
-            throw std::domain_error("Modulo by zero detected in f64 modulo");
-        }
-    }
-    else
-    #endif
-    {
-        BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, modbyzero_mod_msg<BasisType>());
-    }
-}
-
-} // namespace impl
-
-template <compatible_float_type BasisType>
-BOOST_SAFE_NUMBERS_HOST_DEVICE
-[[nodiscard]] constexpr auto operator%(const float_basis<BasisType> lhs,
-                                       const float_basis<BasisType> rhs) -> float_basis<BasisType>
-{
-    const auto lhs_basis {static_cast<BasisType>(lhs)};
-    const auto rhs_basis {static_cast<BasisType>(rhs)};
-    [[maybe_unused]] BasisType res {};
-
-    // The throw branches are inlined here (rather than calling impl::throw_*_mod)
-    // so BOOST_THROW_EXCEPTION captures operator% as the source location of the throw.
-    switch (impl::checked_float_modulo(lhs_basis, rhs_basis, res))
-    {
-        case impl::error_category::no_error:
-            break;
-        case impl::error_category::nan_op:
-            #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-            if (std::is_constant_evaluated())
-            {
-                if constexpr (std::is_same_v<BasisType, float>)
-                {
-                    throw std::domain_error("Operation with NAN detected in f32 modulo");
-                }
-                else
-                {
-                    throw std::domain_error("Operation with NAN detected in f64 modulo");
-                }
-            }
-            else
-            #endif
-            {
-                BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, nan_mod_msg<BasisType>());
-            }
-            break;
-        case impl::error_category::invalid_op:
-            #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-            if (std::is_constant_evaluated())
-            {
-                if constexpr (std::is_same_v<BasisType, float>)
-                {
-                    throw std::domain_error("Invalid operation (IEEE 754-2008 section 7.2) detected in f32 modulo");
-                }
-                else
-                {
-                    throw std::domain_error("Invalid operation (IEEE 754-2008 section 7.2) detected in f64 modulo");
-                }
-            }
-            else
-            #endif
-            {
-                BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, invalid_mod_msg<BasisType>());
-            }
-            break;
-        case impl::error_category::divide_by_zero:
-            #if !(defined(__CUDACC__) && defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA))
-            if (std::is_constant_evaluated())
-            {
-                if constexpr (std::is_same_v<BasisType, float>)
-                {
-                    throw std::domain_error("Modulo by zero detected in f32 modulo");
-                }
-                else
-                {
-                    throw std::domain_error("Modulo by zero detected in f64 modulo");
-                }
-            }
-            else
-            #endif
-            {
-                BOOST_SAFE_NUMBERS_THROW_EXCEPTION(std::domain_error, modbyzero_mod_msg<BasisType>());
-            }
-            break;
-        case impl::error_category::overflow:
-            BOOST_SAFE_NUMBERS_UNREACHABLE; // LCOV_EXCL_LINE
-            break;                          // LCOV_EXCL_LINE
-        case impl::error_category::underflow:
-            BOOST_SAFE_NUMBERS_UNREACHABLE; // LCOV_EXCL_LINE
-            break;                          // LCOV_EXCL_LINE
-        default:
-            BOOST_SAFE_NUMBERS_UNREACHABLE; // LCOV_EXCL_LINE
-            break;                          // LCOV_EXCL_LINE
-    }
-
-    return float_basis<BasisType>{res};
-}
-
 } // namespace boost::safe_numbers::detail
 
 // Block any mixed floating point type operation (e.g. f32 and f64) with a
@@ -2016,12 +1737,9 @@ BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("addition", operator+)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("subtraction", operator-)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("multiplication", operator*)
 BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("division", operator/)
-BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP("modulo", operator%)
 
 } // namespace boost::safe_numbers::detail
 
 #undef BOOST_SAFE_NUMBERS_DEFINE_MIXED_FLOAT_OP
 
-#undef BOOST_SAFE_NUMBERS_DETAIL_CMATH_NS
-
 #endif // BOOST_SAFE_NUMBERS_DETAIL_FLOAT_BASIS_HPP
diff --git a/test/Jamfile b/test/Jamfile
index 0b115cc..b6f5171 100644
--- a/test/Jamfile
+++ b/test/Jamfile
@@ -56,7 +56,6 @@ run test_float_addition.cpp ;
 run test_float_subtraction.cpp ;
 run test_float_mul.cpp ;
 run test_float_div.cpp ;
-run test_float_mod.cpp ;
 run test_float_streaming.cpp ;
 run test_float_std_format.cpp ;
 run test_float_fmt_format.cpp ;
@@ -182,7 +181,6 @@ run test_bounded_float_addition.cpp ;
 run test_bounded_float_subtraction.cpp ;
 run test_bounded_float_multiplication.cpp ;
 run test_bounded_float_division.cpp ;
-run test_bounded_float_modulo.cpp ;
 run test_bounded_float_conversions.cpp ;
 run test_bounded_float_streaming.cpp ;
 run test_bounded_float_std_format.cpp ;
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 7d2af82..5b6c91b 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -349,28 +349,24 @@ run test_cuda_f32_add.cu ;
 run test_cuda_f32_sub.cu ;
 run test_cuda_f32_mul.cu ;
 run test_cuda_f32_div.cu ;
-run test_cuda_f32_mod.cu ;
 
 # f64 tests
 run test_cuda_f64_add.cu ;
 run test_cuda_f64_sub.cu ;
 run test_cuda_f64_mul.cu ;
 run test_cuda_f64_div.cu ;
-run test_cuda_f64_mod.cu ;
 
 # f32 error tests
 run test_cuda_f32_add_error.cu ;
 run test_cuda_f32_sub_error.cu ;
 run test_cuda_f32_mul_error.cu ;
 run test_cuda_f32_div_error.cu ;
-run test_cuda_f32_mod_error.cu ;
 
 # f64 error tests
 run test_cuda_f64_add_error.cu ;
 run test_cuda_f64_sub_error.cu ;
 run test_cuda_f64_mul_error.cu ;
 run test_cuda_f64_div_error.cu ;
-run test_cuda_f64_mod_error.cu ;
 
 # Examples
 run ../examples/cuda.cu ;
diff --git a/test/test_bounded_float_modulo.cpp b/test/test_bounded_float_modulo.cpp
deleted file mode 100644
index d700b02..0000000
--- a/test/test_bounded_float_modulo.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2026 Matt Borland
-// Distributed under the Boost Software License, Version 1.0.
-// https://www.boost.org/LICENSE_1_0.txt
-
-#include <boost/safe_numbers/detail/config.hpp>
-#include <boost/core/lightweight_test.hpp>
-
-#if BOOST_SAFE_NUMBERS_HAS_BOUNDED_FLOAT
-
-#ifdef BOOST_SAFE_NUMBERS_BUILD_MODULE
-
-import boost.safe_numbers;
-
-#else
-
-#include <boost/safe_numbers/bounded_floats.hpp>
-#include <boost/safe_numbers/floats.hpp>
-
-#endif
-
-#include <stdexcept>
-
-using namespace boost::safe_numbers;
-
-void test_modulo_in_bounds()
-{
-    const bounded_float<-100.0f, 100.0f> a {f32{7.5f}};
-    const bounded_float<-100.0f, 100.0f> b {f32{3.0f}};
-    const auto r {a % b};
-    const bounded_float<-100.0f, 100.0f> expected {f32{1.5f}};
-    BOOST_TEST(r == expected);
-}
-
-void test_modulo_by_zero()
-{
-    const bounded_float<-100.0f, 100.0f> a {f32{7.5f}};
-    const bounded_float<-100.0f, 100.0f> b {f32{0.0f}};
-    BOOST_TEST_THROWS((void)(a % b), std::domain_error);
-}
-
-void test_modulo_post_op_out_of_range()
-{
-    const bounded_float<5.0f, 10.0f> a {f32{7.5f}};
-    const bounded_float<5.0f, 10.0f> b {f32{6.0f}};
-    BOOST_TEST_THROWS((void)(a % b), std::domain_error);
-}
-
-void test_modulo_compound_assignment()
-{
-    bounded_float<-100.0f, 100.0f> a {f32{7.5f}};
-    const bounded_float<-100.0f, 100.0f> b {f32{3.0f}};
-    a %= b;
-    const bounded_float<-100.0f, 100.0f> expected {f32{1.5f}};
-    BOOST_TEST(a == expected);
-}
-
-int main()
-{
-    test_modulo_in_bounds();
-    test_modulo_by_zero();
-    test_modulo_post_op_out_of_range();
-    test_modulo_compound_assignment();
-
-    return boost::report_errors();
-}
-
-#else // BOOST_SAFE_NUMBERS_HAS_BOUNDED_FLOAT
-
-int main() { return 0; }
-
-#endif
diff --git a/test/test_cuda_f32_mod.cu b/test/test_cuda_f32_mod.cu
deleted file mode 100644
index 0f258e6..0000000
--- a/test/test_cuda_f32_mod.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-//  Copyright Matt Borland 2026.
-//  Use, modification and distribution are subject to the
-//  Boost Software License, Version 1.0. (See accompanying file
-//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-
-#include <iostream>
-#include <iomanip>
-#include <vector>
-#include <random>
-#include <limits>
-#include <boost/safe_numbers/floats.hpp>
-#include <boost/safe_numbers/cuda_error_reporting.hpp>
-#include "cuda_managed_ptr.hpp"
-#include "stopwatch.hpp"
-
-#include <cuda_runtime.h>
-
-using test_type = boost::safe_numbers::f32;
-using basis_type = test_type::basis_type;
-
-__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < numElements)
-    {
-        out[i] = in[i] % in2[i];
-    }
-}
-
-int main(void)
-{
-    std::mt19937_64 rng{42};
-
-    int numElements = 50000;
-    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
-
-    cuda_managed_ptr<test_type> input_vector(numElements);
-    cuda_managed_ptr<test_type> input_vector2(numElements);
-    cuda_managed_ptr<test_type> output_vector(numElements);
-
-    // Divisor is bounded away from zero (>= 1) so fmod is always well defined
-    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
-    std::uniform_real_distribution<basis_type> dist2{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
-    for (int i = 0; i < numElements; ++i)
-    {
-        input_vector[i] = test_type{dist(rng)};
-        input_vector2[i] = test_type{dist2(rng)};
-    }
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
-
-    boost::safe_numbers::device_error_context ctx;
-    watch w;
-
-    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
-    ctx.synchronize();
-
-    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
-
-    std::vector<test_type> results;
-    results.reserve(numElements);
-    w.reset();
-    for (int i = 0; i < numElements; ++i)
-    {
-        results.push_back(input_vector[i] % input_vector2[i]);
-    }
-    double t = w.elapsed();
-
-    for (int i = 0; i < numElements; ++i)
-    {
-        if (output_vector[i] != results[i])
-        {
-            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-            return EXIT_FAILURE;
-        }
-    }
-
-    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
-    std::cout << "Done\n";
-
-    return 0;
-}
diff --git a/test/test_cuda_f32_mod_error.cu b/test/test_cuda_f32_mod_error.cu
deleted file mode 100644
index 48a6c8e..0000000
--- a/test/test_cuda_f32_mod_error.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-//  Copyright Matt Borland 2026.
-//  Use, modification and distribution are subject to the
-//  Boost Software License, Version 1.0. (See accompanying file
-//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-
-
-
-#include <iostream>
-#include <stdexcept>
-#include <boost/safe_numbers/floats.hpp>
-#include <boost/safe_numbers/cuda_error_reporting.hpp>
-#include "cuda_managed_ptr.hpp"
-
-#include <cuda_runtime.h>
-
-using test_type = boost::safe_numbers::f32;
-using basis_type = test_type::basis_type;
-
-__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < numElements)
-    {
-        out[i] = in[i] % in2[i];
-    }
-}
-
-int main(void)
-{
-    int numElements = 1024;
-
-    cuda_managed_ptr<test_type> input_vector(numElements);
-    cuda_managed_ptr<test_type> input_vector2(numElements);
-    cuda_managed_ptr<test_type> output_vector(numElements);
-
-    // finite non-zero modulo zero -> modulo by zero
-    for (int i = 0; i < numElements; ++i)
-    {
-        input_vector[i] = test_type{static_cast<basis_type>(1)};
-        input_vector2[i] = test_type{static_cast<basis_type>(0)};
-    }
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-
-    boost::safe_numbers::device_error_context ctx;
-
-    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
-
-    try
-    {
-        ctx.synchronize();
-        std::cerr << "Expected exception from modulo by zero but none was thrown!" << std::endl;
-        return EXIT_FAILURE;
-    }
-    catch (const std::domain_error& e)
-    {
-        std::cerr << "Caught expected error: " << e.what() << std::endl;
-        std::cerr << "Test PASSED\n";
-        return 0;
-    }
-}
diff --git a/test/test_cuda_f64_mod.cu b/test/test_cuda_f64_mod.cu
deleted file mode 100644
index 140d477..0000000
--- a/test/test_cuda_f64_mod.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-//  Copyright Matt Borland 2026.
-//  Use, modification and distribution are subject to the
-//  Boost Software License, Version 1.0. (See accompanying file
-//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-
-#include <iostream>
-#include <iomanip>
-#include <vector>
-#include <random>
-#include <limits>
-#include <boost/safe_numbers/floats.hpp>
-#include <boost/safe_numbers/cuda_error_reporting.hpp>
-#include "cuda_managed_ptr.hpp"
-#include "stopwatch.hpp"
-
-#include <cuda_runtime.h>
-
-using test_type = boost::safe_numbers::f64;
-using basis_type = test_type::basis_type;
-
-__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < numElements)
-    {
-        out[i] = in[i] % in2[i];
-    }
-}
-
-int main(void)
-{
-    std::mt19937_64 rng{42};
-
-    int numElements = 50000;
-    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
-
-    cuda_managed_ptr<test_type> input_vector(numElements);
-    cuda_managed_ptr<test_type> input_vector2(numElements);
-    cuda_managed_ptr<test_type> output_vector(numElements);
-
-    // Divisor is bounded away from zero (>= 1) so fmod is always well defined
-    std::uniform_real_distribution<basis_type> dist{static_cast<basis_type>(-1e15), static_cast<basis_type>(1e15)};
-    std::uniform_real_distribution<basis_type> dist2{static_cast<basis_type>(1), static_cast<basis_type>(1e15)};
-    for (int i = 0; i < numElements; ++i)
-    {
-        input_vector[i] = test_type{dist(rng)};
-        input_vector2[i] = test_type{dist2(rng)};
-    }
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
-
-    boost::safe_numbers::device_error_context ctx;
-    watch w;
-
-    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
-    ctx.synchronize();
-
-    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
-
-    std::vector<test_type> results;
-    results.reserve(numElements);
-    w.reset();
-    for (int i = 0; i < numElements; ++i)
-    {
-        results.push_back(input_vector[i] % input_vector2[i]);
-    }
-    double t = w.elapsed();
-
-    for (int i = 0; i < numElements; ++i)
-    {
-        if (output_vector[i] != results[i])
-        {
-            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-            return EXIT_FAILURE;
-        }
-    }
-
-    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
-    std::cout << "Done\n";
-
-    return 0;
-}
diff --git a/test/test_cuda_f64_mod_error.cu b/test/test_cuda_f64_mod_error.cu
deleted file mode 100644
index 2e22757..0000000
--- a/test/test_cuda_f64_mod_error.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-//  Copyright Matt Borland 2026.
-//  Use, modification and distribution are subject to the
-//  Boost Software License, Version 1.0. (See accompanying file
-//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-
-
-
-#include <iostream>
-#include <stdexcept>
-#include <boost/safe_numbers/floats.hpp>
-#include <boost/safe_numbers/cuda_error_reporting.hpp>
-#include "cuda_managed_ptr.hpp"
-
-#include <cuda_runtime.h>
-
-using test_type = boost::safe_numbers::f64;
-using basis_type = test_type::basis_type;
-
-__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < numElements)
-    {
-        out[i] = in[i] % in2[i];
-    }
-}
-
-int main(void)
-{
-    int numElements = 1024;
-
-    cuda_managed_ptr<test_type> input_vector(numElements);
-    cuda_managed_ptr<test_type> input_vector2(numElements);
-    cuda_managed_ptr<test_type> output_vector(numElements);
-
-    // finite non-zero modulo zero -> modulo by zero
-    for (int i = 0; i < numElements; ++i)
-    {
-        input_vector[i] = test_type{static_cast<basis_type>(1)};
-        input_vector2[i] = test_type{static_cast<basis_type>(0)};
-    }
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-
-    boost::safe_numbers::device_error_context ctx;
-
-    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
-
-    try
-    {
-        ctx.synchronize();
-        std::cerr << "Expected exception from modulo by zero but none was thrown!" << std::endl;
-        return EXIT_FAILURE;
-    }
-    catch (const std::domain_error& e)
-    {
-        std::cerr << "Caught expected error: " << e.what() << std::endl;
-        std::cerr << "Test PASSED\n";
-        return 0;
-    }
-}
diff --git a/test/test_float_mod.cpp b/test/test_float_mod.cpp
deleted file mode 100644
index 4865682..0000000
--- a/test/test_float_mod.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-// Copyright 2026 Matt Borland
-// Distributed under the Boost Software License, Version 1.0.
-// https://www.boost.org/LICENSE_1_0.txt
-
-#include <boost/core/lightweight_test.hpp>
-
-// Ignore [[nodiscard]] on the tests that we know are going to throw
-#ifdef __clang__
-#  pragma clang diagnostic push
-#  pragma clang diagnostic ignored "-Wunused-result"
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wunused-result"
-#elif defined(_MSC_VER)
-#  pragma warning (push)
-#  pragma warning (disable: 4834)
-#endif
-
-#ifdef BOOST_SAFE_NUMBERS_BUILD_MODULE
-
-import boost.safe_numbers;
-
-#else
-
-#include <boost/safe_numbers/floats.hpp>
-
-#endif
-
-#include <bit>
-#include <cstdint>
-#include <limits>
-#include <stdexcept>
-#include <type_traits>
-
-using namespace boost::safe_numbers;
-
-// -----------------------------------------------
-// No error: regular finite modulo
-// IEEE 754 section 6 hot path
-// -----------------------------------------------
-
-template <typename T>
-void test_finite_modulo()
-{
-    using basis_type = typename T::basis_type;
-
-    BOOST_TEST(T{static_cast<basis_type>(7.0)} % T{static_cast<basis_type>(3.0)} == T{static_cast<basis_type>(1.0)});
-    BOOST_TEST(T{static_cast<basis_type>(6.0)} % T{static_cast<basis_type>(3.0)} == T{static_cast<basis_type>(0.0)});
-    BOOST_TEST(T{static_cast<basis_type>(5.5)} % T{static_cast<basis_type>(2.0)} == T{static_cast<basis_type>(1.5)});
-    BOOST_TEST(T{static_cast<basis_type>(0.0)} % T{static_cast<basis_type>(1.0)} == T{static_cast<basis_type>(0.0)});
-
-    // Sign of fmod result follows the sign of the dividend
-    BOOST_TEST(T{static_cast<basis_type>(-7.0)} % T{static_cast<basis_type>(3.0)} == T{static_cast<basis_type>(-1.0)});
-    BOOST_TEST(T{static_cast<basis_type>(7.0)} % T{static_cast<basis_type>(-3.0)} == T{static_cast<basis_type>(1.0)});
-    BOOST_TEST(T{static_cast<basis_type>(-7.0)} % T{static_cast<basis_type>(-3.0)} == T{static_cast<basis_type>(-1.0)});
-
-    // Dividend smaller than divisor returns the dividend
-    BOOST_TEST(T{static_cast<basis_type>(1.0)} % T{static_cast<basis_type>(5.0)} == T{static_cast<basis_type>(1.0)});
-
-    // x % +inf returns x for any finite x
-    const T pos_inf {std::numeric_limits<basis_type>::infinity()};
-    const T neg_inf {-std::numeric_limits<basis_type>::infinity()};
-    BOOST_TEST(T{static_cast<basis_type>(5.0)} % pos_inf == T{static_cast<basis_type>(5.0)});
-    BOOST_TEST(T{static_cast<basis_type>(5.0)} % neg_inf == T{static_cast<basis_type>(5.0)});
-    BOOST_TEST(T{std::numeric_limits<basis_type>::max()} % pos_inf == T{std::numeric_limits<basis_type>::max()});
-    BOOST_TEST(T{std::numeric_limits<basis_type>::lowest()} % pos_inf == T{std::numeric_limits<basis_type>::lowest()});
-}
-
-// -----------------------------------------------
-// NaN op: any operand is a quiet NaN
-// IEEE 754 section 6.2
-// -----------------------------------------------
-
-template <typename T>
-void test_qnan_operand()
-{
-    using basis_type = typename T::basis_type;
-
-    const T qnan {std::numeric_limits<basis_type>::quiet_NaN()};
-    const T finite {static_cast<basis_type>(2.0)};
-    const T pos_inf {std::numeric_limits<basis_type>::infinity()};
-    const T neg_inf {-std::numeric_limits<basis_type>::infinity()};
-    const T zero {static_cast<basis_type>(0.0)};
-
-    // QNaN % finite, in either order
-    BOOST_TEST_THROWS(qnan % finite, std::domain_error);
-    BOOST_TEST_THROWS(finite % qnan, std::domain_error);
-
-    // QNaN % QNaN
-    BOOST_TEST_THROWS(qnan % qnan, std::domain_error);
-
-    // QNaN with +inf and -inf, in either order
-    // Per IEEE 7.2.f, "neither is NaN" excludes invalid_op when an operand is NaN,
-    // so these stay as nan_op.
-    BOOST_TEST_THROWS(qnan % pos_inf, std::domain_error);
-    BOOST_TEST_THROWS(pos_inf % qnan, std::domain_error);
-    BOOST_TEST_THROWS(qnan % neg_inf, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % qnan, std::domain_error);
-
-    // QNaN % zero and zero % QNaN (NaN takes priority over divide-by-zero)
-    BOOST_TEST_THROWS(qnan % zero, std::domain_error);
-    BOOST_TEST_THROWS(zero % qnan, std::domain_error);
-}
-
-// -----------------------------------------------
-// Invalid op: zero modulo zero
-// IEEE 754 section 7.2.f (sub-case where divisor is zero)
-// (0 % 0) yields NaN regardless of sign, classified as invalid_op
-// to match operator/'s 0/0 treatment
-// -----------------------------------------------
-
-template <typename T>
-void test_zero_mod_zero_invalid()
-{
-    using basis_type = typename T::basis_type;
-
-    const T pos_zero {static_cast<basis_type>(0.0)};
-    const T neg_zero {static_cast<basis_type>(-0.0)};
-
-    BOOST_TEST_THROWS(pos_zero % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(pos_zero % neg_zero, std::domain_error);
-    BOOST_TEST_THROWS(neg_zero % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(neg_zero % neg_zero, std::domain_error);
-}
-
-// -----------------------------------------------
-// Invalid op: infinite dividend
-// IEEE 754 section 7.2.f (sub-case where dividend is infinite)
-// (inf % y) yields NaN for any non-NaN y
-// -----------------------------------------------
-
-template <typename T>
-void test_inf_dividend_invalid()
-{
-    using basis_type = typename T::basis_type;
-
-    const T pos_inf {std::numeric_limits<basis_type>::infinity()};
-    const T neg_inf {-std::numeric_limits<basis_type>::infinity()};
-    const T finite_pos {static_cast<basis_type>(2.0)};
-    const T finite_neg {static_cast<basis_type>(-2.0)};
-    const T pos_zero {static_cast<basis_type>(0.0)};
-    const T neg_zero {static_cast<basis_type>(-0.0)};
-
-    BOOST_TEST_THROWS(pos_inf % finite_pos, std::domain_error);
-    BOOST_TEST_THROWS(pos_inf % finite_neg, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % finite_pos, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % finite_neg, std::domain_error);
-
-    // inf % 0 and inf % -0
-    BOOST_TEST_THROWS(pos_inf % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(pos_inf % neg_zero, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % neg_zero, std::domain_error);
-
-    // inf % inf
-    BOOST_TEST_THROWS(pos_inf % pos_inf, std::domain_error);
-    BOOST_TEST_THROWS(pos_inf % neg_inf, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % pos_inf, std::domain_error);
-    BOOST_TEST_THROWS(neg_inf % neg_inf, std::domain_error);
-}
-
-// -----------------------------------------------
-// Modulo by zero: finite non-zero dividend % zero
-// Surfaced separately from invalid_op to mirror operator/'s
-// divide-by-zero behavior.
-// -----------------------------------------------
-
-template <typename T>
-void test_modulo_by_zero()
-{
-    using basis_type = typename T::basis_type;
-
-    const T finite_pos {static_cast<basis_type>(1.0)};
-    const T finite_neg {static_cast<basis_type>(-1.0)};
-    const T pos_zero {static_cast<basis_type>(0.0)};
-    const T neg_zero {static_cast<basis_type>(-0.0)};
-
-    BOOST_TEST_THROWS(finite_pos % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(finite_pos % neg_zero, std::domain_error);
-    BOOST_TEST_THROWS(finite_neg % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(finite_neg % neg_zero, std::domain_error);
-
-    // max and lowest modulo zero
-    const T max_val {std::numeric_limits<basis_type>::max()};
-    const T lowest_val {std::numeric_limits<basis_type>::lowest()};
-    BOOST_TEST_THROWS(max_val % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(lowest_val % pos_zero, std::domain_error);
-
-    // Subnormal modulo zero
-    const T tiny {std::numeric_limits<basis_type>::denorm_min()};
-    BOOST_TEST_THROWS(tiny % pos_zero, std::domain_error);
-    BOOST_TEST_THROWS(tiny % neg_zero, std::domain_error);
-}
-
-// -----------------------------------------------
-// Invalid op: any operand is a signaling NaN
-// IEEE 754 section 7.2.a
-//
-// We construct the SNaN via std::bit_cast from the integer bit pattern
-// of std::numeric_limits<basis_type>::signaling_NaN(). Going through bit_cast
-// avoids any FP ops that could quiet the value before it reaches our checker.
-// -----------------------------------------------
-
-template <typename T>
-void test_snan_operand()
-{
-    using basis_type = typename T::basis_type;
-
-    if constexpr (std::numeric_limits<basis_type>::has_signaling_NaN)
-    {
-        using bit_type = std::conditional_t<std::is_same_v<basis_type, float>, std::uint32_t, std::uint64_t>;
-
-        constexpr auto snan_bits {std::bit_cast<bit_type>(std::numeric_limits<basis_type>::signaling_NaN())};
-        constexpr auto qnan_bits {std::bit_cast<bit_type>(std::numeric_limits<basis_type>::quiet_NaN())};
-
-        // On platforms that do not actually distinguish between SNaN and QNaN at
-        // the bit-pattern level there is nothing to test here.
-        if constexpr (snan_bits != qnan_bits)
-        {
-            const auto snan_val {std::bit_cast<basis_type>(snan_bits)};
-            const T snan {snan_val};
-            const T finite {static_cast<basis_type>(2.0)};
-            const T pos_inf {std::numeric_limits<basis_type>::infinity()};
-            const T neg_inf {-std::numeric_limits<basis_type>::infinity()};
-            const T zero {static_cast<basis_type>(0.0)};
-
-            // SNaN % finite, in either order
-            BOOST_TEST_THROWS(snan % finite, std::domain_error);
-            BOOST_TEST_THROWS(finite % snan, std::domain_error);
-
-            // SNaN % SNaN
-            BOOST_TEST_THROWS(snan % snan, std::domain_error);
-
-            // SNaN with +inf and -inf, in either order
-            BOOST_TEST_THROWS(snan % pos_inf, std::domain_error);
-            BOOST_TEST_THROWS(pos_inf % snan, std::domain_error);
-            BOOST_TEST_THROWS(snan % neg_inf, std::domain_error);
-            BOOST_TEST_THROWS(neg_inf % snan, std::domain_error);
-
-            // SNaN % zero and zero % SNaN (SNaN takes priority over divide-by-zero)
-            BOOST_TEST_THROWS(snan % zero, std::domain_error);
-            BOOST_TEST_THROWS(zero % snan, std::domain_error);
-
-            // Negative-sign SNaN: flip the sign bit on the canonical SNaN
-            // pattern. Detection must work for both signs.
-            constexpr bit_type sign_mask {bit_type{1} << (std::numeric_limits<bit_type>::digits - 1)};
-            const auto neg_snan_bits {static_cast<bit_type>(snan_bits ^ sign_mask)};
-            const auto neg_snan_val {std::bit_cast<basis_type>(neg_snan_bits)};
-            const T neg_snan {neg_snan_val};
-            BOOST_TEST_THROWS(neg_snan % finite, std::domain_error);
-            BOOST_TEST_THROWS(finite % neg_snan, std::domain_error);
-
-            // SNaN with a non-default payload. The lowest SNaN bit pattern is
-            // inf_bits + 1 (any payload, mantissa MSB still 0), and the highest
-            // is one below the canonical quiet_NaN(). Both must be detected.
-            constexpr auto inf_bits {std::bit_cast<bit_type>(std::numeric_limits<basis_type>::infinity())};
-            const auto low_snan_val {std::bit_cast<basis_type>(static_cast<bit_type>(inf_bits + bit_type{1}))};
-            const T low_snan {low_snan_val};
-            BOOST_TEST_THROWS(low_snan % finite, std::domain_error);
-
-            const auto high_snan_val {std::bit_cast<basis_type>(static_cast<bit_type>(qnan_bits - bit_type{1}))};
-            const T high_snan {high_snan_val};
-            BOOST_TEST_THROWS(high_snan % finite, std::domain_error);
-
-            // QNaN (not SNaN) at the boundary should still be reported as
-            // nan_op rather than invalid_op. The exception type is the same
-            // (std::domain_error) but this exercises that the upper bound of
-            // the SNaN range is exclusive.
-            const auto qnan_boundary {std::bit_cast<basis_type>(qnan_bits)};
-            const T qnan_b {qnan_boundary};
-            BOOST_TEST_THROWS(qnan_b % finite, std::domain_error);
-        }
-    }
-}
-
-int main()
-{
-    test_finite_modulo<f32>();
-    test_finite_modulo<f64>();
-
-    test_qnan_operand<f32>();
-    test_qnan_operand<f64>();
-
-    test_zero_mod_zero_invalid<f32>();
-    test_zero_mod_zero_invalid<f64>();
-
-    test_inf_dividend_invalid<f32>();
-    test_inf_dividend_invalid<f64>();
-
-    test_modulo_by_zero<f32>();
-    test_modulo_by_zero<f64>();
-
-    test_snan_operand<f32>();
-    test_snan_operand<f64>();
-
-    return boost::report_errors();
-}
-
-#ifdef __clang__
-#  pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic pop
-#elif defined(_MSC_VER)
-#  pragma warning (pop)
-#endif

From 89dd0e9895ff2ad7bbd4426add9f09af3b1467c4 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 25 Jun 2026 17:10:35 -0400
Subject: [PATCH 3/3] Remove mod from docs

---
 doc/modules/ROOT/pages/bounded_float.adoc |  4 ----
 doc/modules/ROOT/pages/floats.adoc        | 18 ++++--------------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/doc/modules/ROOT/pages/bounded_float.adoc b/doc/modules/ROOT/pages/bounded_float.adoc
index 732da43..f8370ef 100644
--- a/doc/modules/ROOT/pages/bounded_float.adoc
+++ b/doc/modules/ROOT/pages/bounded_float.adoc
@@ -88,14 +88,12 @@ public:
     friend constexpr auto operator-(bounded_float, bounded_float) -> bounded_float;
     friend constexpr auto operator*(bounded_float, bounded_float) -> bounded_float;
     friend constexpr auto operator/(bounded_float, bounded_float) -> bounded_float;
-    friend constexpr auto operator%(bounded_float, bounded_float) -> bounded_float;
 
     // Compound assignment
     constexpr auto operator+=(bounded_float) -> bounded_float&;
     constexpr auto operator-=(bounded_float) -> bounded_float&;
     constexpr auto operator*=(bounded_float) -> bounded_float&;
     constexpr auto operator/=(bounded_float) -> bounded_float&;
-    constexpr auto operator%=(bounded_float) -> bounded_float&;
 };
 
 } // namespace boost::safe_numbers
@@ -117,8 +115,6 @@ public:
 | Multiplication underflow | `std::underflow_error`
 | Division producing NaN (e.g., 0/0, inf/inf) | `std::domain_error`
 | Division by zero (finite numerator) | `std::domain_error`
-| Modulo with zero divisor | `std::domain_error`
-| Modulo with infinite numerator | `std::domain_error`
 | Narrowing conversion (e.g., f64 -> f32) overflowing to infinity | `std::overflow_error`
 |===
 
diff --git a/doc/modules/ROOT/pages/floats.adoc b/doc/modules/ROOT/pages/floats.adoc
index 4df9b6b..cafc510 100644
--- a/doc/modules/ROOT/pages/floats.adoc
+++ b/doc/modules/ROOT/pages/floats.adoc
@@ -75,10 +75,6 @@ template <compatible_float_type BasisType>
 constexpr auto operator/(float_basis<BasisType> lhs,
                          float_basis<BasisType> rhs) -> float_basis<BasisType>;
 
-template <compatible_float_type BasisType>
-constexpr auto operator%(float_basis<BasisType> lhs,
-                         float_basis<BasisType> rhs) -> float_basis<BasisType>;
-
 } // namespace boost::safe_numbers
 ----
 
@@ -179,10 +175,6 @@ constexpr auto operator*(float_basis<BasisType> lhs,
 template <compatible_float_type BasisType>
 constexpr auto operator/(float_basis<BasisType> lhs,
                          float_basis<BasisType> rhs) -> float_basis<BasisType>;
-
-template <compatible_float_type BasisType>
-constexpr auto operator%(float_basis<BasisType> lhs,
-                         float_basis<BasisType> rhs) -> float_basis<BasisType>;
 ----
 
 Each arithmetic operator computes the result, classifies it according to IEEE 754-2008 sections 6 and 7, and throws when the result is an exceptional value:
@@ -190,10 +182,10 @@ Each arithmetic operator computes the result, classifies it according to IEEE 75
 - `pass:[+]`, `-`: Throw `std::overflow_error` on saturation to positive infinity and `std::underflow_error` on saturation to negative infinity. Subtracting like-signed infinities, or adding opposite-signed infinities, is an invalid operation and throws `std::domain_error`.
 - `pass:[*]`: Throws `std::overflow_error` or `std::underflow_error` on saturation to an infinity. Multiplying zero by an infinity is an invalid operation and throws `std::domain_error`.
 - `/`: Throws `std::overflow_error` or `std::underflow_error` on saturation to an infinity. Dividing zero by zero or infinity by infinity is an invalid operation and throws `std::domain_error`. Dividing a finite non-zero value by zero throws `std::domain_error`.
-- `%`: Computes the IEEE 754 remainder via `std::fmod`. Modulo by zero, or modulo of an infinite dividend, throws `std::domain_error`. The remainder cannot overflow or underflow.
 
 In every operation, an operand that is a quiet or signaling NaN causes the operation to throw `std::domain_error`.
-The `%` operator borrows the spelling of the integer modulo operator but performs floating-point remainder, mirroring `std::fmod` rather than truncated integer division.
+
+NOTE: There is no `operator%` for `f32`/`f64`. C++ has no built-in floating-point remainder operator; use the `xref:cmath.adoc[fmod]` function instead.
 
 == Exception Behavior
 
@@ -204,14 +196,12 @@ The following table summarizes the exceptional conditions and the exception each
 
 | Result saturates to positive infinity | `pass:[+]` `-` `pass:[*]` `/` | `std::overflow_error`
 | Result saturates to negative infinity | `pass:[+]` `-` `pass:[*]` `/` | `std::underflow_error`
-| Either operand is a quiet NaN | `pass:[+]` `-` `pass:[*]` `/` `%` | `std::domain_error`
-| Either operand is a signaling NaN | `pass:[+]` `-` `pass:[*]` `/` `%` | `std::domain_error`
+| Either operand is a quiet NaN | `pass:[+]` `-` `pass:[*]` `/` | `std::domain_error`
+| Either operand is a signaling NaN | `pass:[+]` `-` `pass:[*]` `/` | `std::domain_error`
 | Addition of opposite-signed infinities, or subtraction of like-signed infinities | `pass:[+]` `-` | `std::domain_error`
 | Multiplication of zero by an infinity | `pass:[*]` | `std::domain_error`
 | Division of zero by zero, or infinity by infinity | `/` | `std::domain_error`
 | Division of a finite non-zero value by zero | `/` | `std::domain_error`
-| Modulo of zero by zero, or with an infinite dividend | `%` | `std::domain_error`
-| Modulo by zero with a finite non-zero dividend | `%` | `std::domain_error`
 |===
 
 NOTE: Saturation to an infinity (IEEE 754 section 6.1) maps to `std::overflow_error` or `std::underflow_error` according to sign. The invalid-operation cases (section 7.2), NaN propagation (section 6.2), and division by zero of a finite numerator (section 7.3) are all reported as `std::domain_error`.