From a396bff3a94076778926cfe0bafb3b832b591814 Mon Sep 17 00:00:00 2001 From: Jony Castagna Date: Thu, 29 Jan 2026 10:04:44 +0000 Subject: [PATCH 1/5] Fixed issues with CUDA compilation --- CMakeLists.txt | 2 +- include/bout/bout_types.hxx | 2 ++ include/bout/fieldops.hxx | 33 +++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5beb75898..7cb66b8fc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ message(STATUS "Configuring BOUT++ version ${BOUT_FULL_VERSION}") project(BOUT++ DESCRIPTION "Fluid PDE solver framework" VERSION ${BOUT_CMAKE_ACCEPTABLE_VERSION} - LANGUAGES CXX) + LANGUAGES C CXX) include(CMakeDependentOption) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index 03bc4dcee4..7c68132835 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -25,6 +25,8 @@ #include #include +#include + /// Size of real numbers using BoutReal = double; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 114793ebc8..a837f6f1b0 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -5,10 +5,16 @@ #include "bout/array.hxx" #include "bout/bout_types.hxx" -#include #include #include #include +#include + +// CUDA: only pull in the *runtime API* (types + function prototypes) when CUDA is enabled. +// Do NOT include in a public header. +#if defined(__CUDACC__) || defined(CAMP_HAVE_CUDA) || defined(BOUT_ENABLE_CUDA) + #include +#endif class Mesh; class Field3D; @@ -102,6 +108,11 @@ struct Add { }; }; +// ---------------------------------------------------------------------------- +// CUDA kernel: only visible to NVCC (or a CUDA-capable compilation frontend). +// This prevents g++ from seeing __global__/__launch_bounds__/threadIdx/etc. +// ---------------------------------------------------------------------------- +#if defined(__CUDACC__) template __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -127,9 +138,14 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex // out[idx] = expr(idx); // single‐pass fusion //} } +#endif inline std::unordered_map> regionIndicesCache; +// ---------------------------------------------------------------------------- +// Streams: only for CUDA compilation. Host builds get a fallback path in evaluate(). +// ---------------------------------------------------------------------------- +#if defined(__CUDACC__) struct StreamsRAII { std::vector streams; @@ -163,6 +179,7 @@ struct StreamsRAII { StreamsRAII& operator=(StreamsRAII&&) = delete; }; inline struct StreamsRAII streams; +#endif template struct BinaryExpr { @@ -254,18 +271,18 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { -#if 1 + +#if defined(__CUDACC__) cudaStream_t stream = streams.get(); int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaStreamSynchronize(stream); streams.put(stream); -#endif -#if 0 - // OpenMP impl. - int e = size(); - //#pragma omp parallel for +#else + // Host fallback: keep header parsable/compilable by g++ in pylib builds. + // (Optional: add OpenMP pragma if desired.) + const int e = size(); for (int i = 0; i < e; ++i) { int idx = regionIdx(i); data[idx] = operator()(idx); // single‐pass fusion @@ -279,4 +296,4 @@ struct BinaryExpr { std::optional getRegionID() const { return regionID; }; }; -#endif // BOUT_FIELDSOPS_HXX +#endif // BOUT_FIELDOPS_HXX From 546ff265c6b576c891b6241eecd1ced58a870219 Mon Sep 17 00:00:00 2001 From: Jony Castagna Date: Mon, 16 Feb 2026 12:26:15 +0000 Subject: [PATCH 2/5] Added porting of ddt(U). Part I --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 86 +++++++++++++++++-- 1 file changed, 81 insertions(+), 5 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 96c28bab12..360fd9b0da 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -50,6 +50,27 @@ #include // Defines BOUT_FOR_RAJA +#include +inline void nvtxPushColor(const char* name, uint32_t argb) +{ + nvtxEventAttributes_t ev{}; + ev.version = NVTX_VERSION; + ev.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + ev.messageType = NVTX_MESSAGE_TYPE_ASCII; + ev.message.ascii = name; + ev.colorType = NVTX_COLOR_ARGB; + ev.color = argb; + + nvtxRangePushEx(&ev); +} + +inline void nvtxPop() +{ + nvtxRangePop(); +} + + + #if BOUT_HAS_HYPRE #include #endif @@ -1275,13 +1296,18 @@ class ELMpb : public PhysicsModel { bool first_run = true; // For printing out some diagnostics first time around int rhs(BoutReal t) override { + + nvtxPushColor("rhs_communicate", 0xFFFFFF00); // Perform communications mesh->communicate(comms); + nvtxPop(); Coordinates* metric = mesh->getCoordinates(); //////////////////////////////////////////// // Transitions from 0 in core to 1 in vacuum + + nvtxPushColor("rhs_nonlinear", 0xFFFFFF00); if (nonlinear) { vac_mask = (1.0 - tanh(((P0 + P) - vacuum_pressure) / vacuum_trans)) / 2.0; @@ -1302,10 +1328,12 @@ class ELMpb : public PhysicsModel { } eta = interp_to(eta, loc); } + nvtxPop(); //////////////////////////////////////////// // Resonant Magnetic Perturbation code + nvtxPushColor("rhs_include_rmp", 0xFFFFFF00); if (include_rmp) { if ((rmp_ramp > 0.0) || (rmp_freq > 0.0) || (rmp_rotate != 0.0)) { @@ -1356,16 +1384,20 @@ class ELMpb : public PhysicsModel { mesh->communicate(rmp_Psi); } + nvtxPop(); //////////////////////////////////////////// // Inversion + nvtxPushColor("BOUT_FOR_RAJA_aparSolver", 0xFFFFFF00); if (evolve_jpar) { // Invert laplacian for Psi Psi = aparSolver->solve(Jpar); mesh->communicate(Psi); } + nvtxPop(); + nvtxPushColor("rhs_phi_constraint", 0xFFFFFF00); if (phi_constraint) { // Phi being solved as a constraint @@ -1417,7 +1449,9 @@ class ELMpb : public PhysicsModel { phi.applyBoundary(); mesh->communicate(phi); } + nvtxPop(); + nvtxPushColor("rhs_evolve_jpar", 0xFFFFFF00); if (!evolve_jpar) { // Get J from Psi Jpar = Delp2(Psi); @@ -1479,6 +1513,7 @@ class ELMpb : public PhysicsModel { } } } + nvtxPop(); //////////////////////////////////////////////////// // Sheath boundary conditions @@ -1487,6 +1522,7 @@ class ELMpb : public PhysicsModel { // to Jpar so that Jpar = sqrt(mi/me)/(2*pi) * phi // + nvtxPushColor("rhs_sheath_boundaries", 0xFFFFFF00); if (sheath_boundaries) { // At y = ystart (lower boundary) @@ -1533,6 +1569,7 @@ class ELMpb : public PhysicsModel { } } } + nvtxPop(); //////////////////////////////////////////////////// // Check that settings match compile-time switches @@ -1604,6 +1641,7 @@ class ELMpb : public PhysicsModel { // Note: Capture all class member variables into local scope // or an illegal memory access may occur on GPUs + nvtxPushColor("BOUT_FOR_RAJA_ddt_Jpar_acc", 0xFFFFFF00); BOUT_FOR_RAJA( i, Jpar.getRegion("RGN_NOBNDRY"), CAPTURE(delta_i, hyperresist, relax_j_tconst, dnorm, ehyperviscos, viscos_perp)) { @@ -1680,6 +1718,7 @@ class ELMpb : public PhysicsModel { ddt(P_acc)[i] = 0.0; #endif }; + nvtxPop(); // Terms which are not yet single index operators // Note: Terms which are included in the single index loop @@ -1750,45 +1789,78 @@ class ELMpb : public PhysicsModel { // ddt(U) += SQ(B0) * b0xGrad_dot_Grad(rmp_Psi, J0, CELL_CENTRE); // } - ddt(U) += b0xcv * Grad(P); // curvature term + nvtxPushColor("rhs_ddt", 0xFFFFFF00); + + // nvtxPushColor("rhs_ddt_U_acc", 0xFFFFFF00); + // ddt(U) += b0xcv * Grad(P); // curvature term + // nvtxPop(); + + mesh->communicate(P); + P.applyBoundary(); + + auto region = U.getRegion("RGN_NOBNDRY"); + + auto b0xcv_x_acc = FieldAccessor<>(b0xcv.x); + auto b0xcv_y_acc = FieldAccessor<>(b0xcv.y); + auto b0xcv_z_acc = FieldAccessor<>(b0xcv.z); + + nvtxPushColor("rhs_ddt_U_acc", 0xFFFFFF00); + BOUT_FOR_RAJA(i, region, CAPTURE(P_acc, U_acc, b0xcv_x_acc, b0xcv_y_acc, b0xcv_z_acc)) { + BoutReal dPdx = DDX(P_acc, i); + BoutReal dPdy = DDY(P_acc, i); + BoutReal dPdz = DDZ(P_acc, i); + + ddt(U_acc)[i] += b0xcv_x_acc[i] * dPdx + + b0xcv_y_acc[i] * dPdy + + b0xcv_z_acc[i] * dPdz; + }; + nvtxPop(); // if (!nogradparj) { // Parallel current term // ddt(U) -= SQ(B0) * Grad_parP(Jpar, CELL_CENTRE); // b dot grad j // } + nvtxPushColor("rhs_ddt_withflow_K_H_term", 0xFFFFFF00); if (withflow && K_H_term) { // K_H_term ddt(U) -= b0xGrad_dot_Grad(phi, U0); } + nvtxPop(); // if (diamag_phi0) { // Equilibrium flow // ddt(U) -= b0xGrad_dot_Grad(phi0, U); // } + nvtxPushColor("rhs_ddt_withflow", 0xFFFFFF00); if (withflow) { // net flow ddt(U) -= V_dot_Grad(V0net, U); } + nvtxPop(); // if (nonlinear) { // Advection // ddt(U) -= bracket(phi, U, bm_exb) * B0; // } // Viscosity terms - + nvtxPushColor("rhs_ddt_viscos_par", 0xFFFFFF00); if (viscos_par > 0.0) { // Parallel viscosity ddt(U) += viscos_par * Grad2_par2(U); } + nvtxPop(); + nvtxPushColor("rhs_ddt_diffusion_u4", 0xFFFFFF00); if (diffusion_u4 > 0.0) { tmpU2 = D2DY2(U); mesh->communicate(tmpU2); tmpU2.applyBoundary(); ddt(U) -= diffusion_u4 * D2DY2(tmpU2); } + nvtxPop(); // if (viscos_perp > 0.0) { // Perpendicular viscosity // ddt(U) += viscos_perp * Delp2(U); // } - + nvtxPop(); + nvtxPushColor("rhs_hyperviscos", 0xFFFFFF00); // Hyper-viscosity if (hyperviscos > 0.0) { // Calculate coefficient. @@ -1874,7 +1946,8 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////////////// // Pressure equation - + nvtxPop(); + nvtxPushColor("rhs_evolve_pressure", 0xFFFFFF00); if (evolve_pressure) { // ddt(P) -= b0xGrad_dot_Grad(phi, P0); @@ -1921,7 +1994,8 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////////////// // Compressional effects - + nvtxPop(); + nvtxPushColor("rhs_compress", 0xFFFFFF00); if (compress) { ddt(P) -= beta * Div_par(Vpar, CELL_CENTRE); @@ -1980,6 +2054,8 @@ class ELMpb : public PhysicsModel { first_run = false; + nvtxPop(); + return 0; } From d2d151e64410d68e36c8b43fde2644bb510686dd Mon Sep 17 00:00:00 2001 From: Jony Castagna Date: Fri, 27 Feb 2026 21:21:08 +0000 Subject: [PATCH 3/5] Ported Delp2(Jpar), curvature term and diffusion in elm-pb-outerloop --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 61 ++++++++++++++++++- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 360fd9b0da..2293dcbf34 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1490,7 +1490,19 @@ class ELMpb : public PhysicsModel { } // Get Delp2(J) from J - Jpar2 = Delp2(Jpar); + nvtxPushColor("rhs_evolve_jpar", 0xFFFFFF00); + mesh->communicate(Jpar); + Jpar.applyBoundary(); + + Jpar2 = 0.0; + auto Jpar_acc = FieldAccessor<>(Jpar); + auto Jpar2_acc = FieldAccessor<>(Jpar2); + + BOUT_FOR_RAJA(i, Jpar.getRegion("RGN_NOBNDRY"), + CAPTURE(Jpar_acc, Jpar2_acc)) { + Jpar2_acc[i] = Delp2(Jpar_acc, i); + }; + nvtxPop(); Jpar2.applyBoundary(); mesh->communicate(Jpar2); @@ -1795,14 +1807,18 @@ class ELMpb : public PhysicsModel { // ddt(U) += b0xcv * Grad(P); // curvature term // nvtxPop(); + nvtxPushColor("rhs_communicate", 0xFFFFFF00); mesh->communicate(P); P.applyBoundary(); + nvtxPop(); + nvtxPushColor("rhs_accessor", 0xFFFFFF00); auto region = U.getRegion("RGN_NOBNDRY"); auto b0xcv_x_acc = FieldAccessor<>(b0xcv.x); auto b0xcv_y_acc = FieldAccessor<>(b0xcv.y); auto b0xcv_z_acc = FieldAccessor<>(b0xcv.z); + nvtxPop(); nvtxPushColor("rhs_ddt_U_acc", 0xFFFFFF00); BOUT_FOR_RAJA(i, region, CAPTURE(P_acc, U_acc, b0xcv_x_acc, b0xcv_y_acc, b0xcv_z_acc)) { @@ -1848,11 +1864,50 @@ class ELMpb : public PhysicsModel { nvtxPop(); nvtxPushColor("rhs_ddt_diffusion_u4", 0xFFFFFF00); + // if (diffusion_u4 > 0.0) { + // tmpU2 = D2DY2(U); + // mesh->communicate(tmpU2); + // tmpU2.applyBoundary(); + // ddt(U) -= diffusion_u4 * D2DY2(tmpU2); + // } + // nvtxPop(); + + if (diffusion_u4 > 0.0) { - tmpU2 = D2DY2(U); + + // Kernel 1: tmpU2 = D2DY2(U) + nvtxPushColor("rhs_diff_initTmpU2", 0xFFFFFF00); + tmpU2 = 0.0; + auto tmpU2a_acc = FieldAccessor<>(tmpU2); + nvtxPop(); + + nvtxPushColor("rhs_diff_loop1", 0xFFFFFF00); + BOUT_FOR_RAJA(i, region, CAPTURE(U_acc, tmpU2a_acc)) { + tmpU2a_acc[i] = D2DY2(U_acc, i); + }; + nvtxPop(); + + nvtxPushColor("rhs_diff_communicate", 0xFFFFFF00); + // Host: halo exchange + BCs mesh->communicate(tmpU2); tmpU2.applyBoundary(); - ddt(U) -= diffusion_u4 * D2DY2(tmpU2); + nvtxPop(); + + nvtxPushColor("rhs_diff_acce", 0xFFFFFF00); + // Kernel 2: ddt(U) -= diffusion_u4 * D2DY2(tmpU2) + auto tmpU2b_acc = FieldAccessor<>(tmpU2); + + const BoutReal diffusion = diffusion_u4; + + auto& ddtU = ddt(U); // keep alive (avoid proxy lifetime) + auto ddtU_acc = FieldAccessor<>(ddtU); + nvtxPop(); + + nvtxPushColor("rhs_diff_loop2", 0xFFFFFF00); + BOUT_FOR_RAJA(i, region, CAPTURE(diffusion, ddtU_acc, tmpU2b_acc)) { + ddtU_acc[i] -= diffusion * D2DY2(tmpU2b_acc, i); + }; + nvtxPop(); } nvtxPop(); From c76dc901763add39fb370b5c9ada6fd173fb202a Mon Sep 17 00:00:00 2001 From: Jony Castagna Date: Thu, 5 Mar 2026 10:28:14 +0000 Subject: [PATCH 4/5] Corrected 2D index in curvature term --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 2293dcbf34..ba50afc85b 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1807,28 +1807,23 @@ class ELMpb : public PhysicsModel { // ddt(U) += b0xcv * Grad(P); // curvature term // nvtxPop(); - nvtxPushColor("rhs_communicate", 0xFFFFFF00); - mesh->communicate(P); - P.applyBoundary(); - nvtxPop(); - - nvtxPushColor("rhs_accessor", 0xFFFFFF00); + // nvtxPushColor("rhs_accessor", 0xFFFFFF00); auto region = U.getRegion("RGN_NOBNDRY"); - auto b0xcv_x_acc = FieldAccessor<>(b0xcv.x); - auto b0xcv_y_acc = FieldAccessor<>(b0xcv.y); - auto b0xcv_z_acc = FieldAccessor<>(b0xcv.z); - nvtxPop(); + auto b0xcv_x = Field2DAccessor<>(b0xcv.x); + auto b0xcv_y = Field2DAccessor<>(b0xcv.y); + auto b0xcv_z = Field2DAccessor<>(b0xcv.z); + + BOUT_FOR_RAJA(i, region, CAPTURE(P_acc, U_acc, b0xcv_x, b0xcv_y, b0xcv_z)) { + const int i2d = i / P_acc.mesh_nz; // or U_acc.mesh_nz - nvtxPushColor("rhs_ddt_U_acc", 0xFFFFFF00); - BOUT_FOR_RAJA(i, region, CAPTURE(P_acc, U_acc, b0xcv_x_acc, b0xcv_y_acc, b0xcv_z_acc)) { - BoutReal dPdx = DDX(P_acc, i); - BoutReal dPdy = DDY(P_acc, i); - BoutReal dPdz = DDZ(P_acc, i); + const BoutReal dPdx = DDX(P_acc, i); + const BoutReal dPdy = DDY(P_acc, i); + const BoutReal dPdz = DDZ(P_acc, i); - ddt(U_acc)[i] += b0xcv_x_acc[i] * dPdx - + b0xcv_y_acc[i] * dPdy - + b0xcv_z_acc[i] * dPdz; + ddt(U_acc)[i] += b0xcv_x[i2d] * dPdx + + b0xcv_y[i2d] * dPdy + + b0xcv_z[i2d] * dPdz; }; nvtxPop(); From 8ba63ea72411bbe51e455ee828d4fc4ee1f9503e Mon Sep 17 00:00:00 2001 From: Jony Castagna Date: Thu, 11 Jun 2026 13:41:39 +0100 Subject: [PATCH 5/5] Fix for compability with Hermes-3. Use same branch support-gpu-field-operators-lazy-j --- include/bout/field.hxx | 1 - include/bout/field2d.hxx | 4 ++-- include/bout/field3d.hxx | 2 +- include/bout/fieldperp.hxx | 2 +- include/bout/options.hxx | 6 +++--- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index fe2b4767d2..ccd05c0a6a 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -546,7 +546,6 @@ T pow(BoutReal lhs, const T& rhs, const std::string& rgn = "RGN_ALL") { template > \ inline BinaryExpr name(const T& f, \ const std::string& rgn = "RGN_ALL") { \ - std::cout << "RUNNING " #name " with CUDA\n"; \ return BinaryExpr{static_cast(f), \ static_cast(f), \ bout::op::name{}, \ diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index b452df7fd6..ae63793733 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -108,7 +108,7 @@ public: || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { - std::cout << "RUNNING Field2D constructor with CUDA\n"; + //std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -192,7 +192,7 @@ public: template std::enable_if_t, Field2D&> operator=(const BinaryExpr& expr) { - std::cout << "RUNNING Field2D operator= with CUDA\n"; + //std::cout << "RUNNING Field2D operator= with CUDA\n"; if (isAllocated()) { expr.evaluate(&data[0]); } else { diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 9dc064d6e6..cf631bb642 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -460,7 +460,7 @@ public: template std::enable_if_t, Field3D&> operator=(BinaryExpr& expr) { - std::cout << "RUNNING operator= with CUDA\n"; + //std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); if(isAllocated()) { expr.evaluate(&data[0]); diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 49d4fec1b7..6b503747b4 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -90,7 +90,7 @@ public: typename ResT, typename L, typename R, typename Func, typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> FieldPerp(const BinaryExpr& expr) { - std::cout << "RUNNING FieldPerp constructor with CUDA\n"; + //std::cout << "RUNNING FieldPerp constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), diff --git a/include/bout/options.hxx b/include/bout/options.hxx index 9356326d0e..ac88aeb5d6 100644 --- a/include/bout/options.hxx +++ b/include/bout/options.hxx @@ -459,9 +459,9 @@ public: /// ------- /// A reference to `this`, for method chaining template - Options& force(T val, const std::string source = "") { - is_section = true; // Invalidates any existing setting - return assign(val, source); + Options& force(T&& val, const std::string& source = "") { + is_section = true; + return assign(std::forward(val), source); } /// Assign a value that is expected to vary in time.