MFlowCode · sbryngelson · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
@@ -22,6 +22,9 @@ TKE = "TKE"
 HSA = "HSA"
 infp = "infp"
 Sur = "Sur"
+thi = "thi"           # AMR clustering local: tagged-box hi index (tlo/thi)
+alo = "alo"           # AMR clustering local: accepted-box lo array (alo/ahi)
+thr = "thr"           # AMR clustering local: min-separation merge threshold
 chioces = "chioces"   # typo for "choices" - tests constraint key validation
 reqires = "reqires"   # typo for "requires" - tests dependency key validation
 choises = "choises"   # appears in comment explaining validation purpose

@@ -109,13 +109,15 @@ is equivalent to `"riemann_solver": 2`. Defined names appear in each parameter's
 | ---:             |    :----:      |          :---                             |
 | `run_time_info`  | Logical        | Output run-time information               |
 | `rdma_mpi`       | Logical        | (GPUs) Enable RDMA for MPI communication. |
+| `active_box`     | Logical        | Enable causal-envelope active-box restriction of the RHS compute window. |
 | `case_dir`       | String         | Case directory path                       |
 | `old_grid`       | Logical        | Use grid from previous simulation         |
 | `old_ic`         | Logical        | Use initial conditions from previous simulation |
 | `t_step_old`     | Integer        | Time step to restart from                 |
 | `n_start_old`    | Integer        | Starting index from previous simulation   |
 
 - `run_time_info` generates a text file that includes run-time information including the CFL number(s) at each time-step.
+- `active_box` enables the causal-envelope active-box optimization, restricting the RHS compute window to the region where the solution deviates from a uniform ambient state. Requires WENO reconstruction (`recon_type = 1`) and SSP-RK3 time stepping (`time_stepper = 3`). Incompatible with immersed boundaries, acoustic sources, body forces, Lagrangian bubbles, phase change, and the IGR solver.
 - `rdma_mpi` optimizes data transfers between GPUs using Remote Direct Memory Access (RDMA).
 The underlying MPI implementation and communication infrastructure must support this
 feature, detecting GPU pointers and performing RDMA accordingly.
@@ -670,6 +672,24 @@ To restart the simulation from $k$-th time step, see @ref running "Restarting Ca
 | `file_per_process`      | Logical | Whether or not to write one IO file per process |
 | `cons_vars_wrt`         | Logical | Write conservative variables |
 | `prim_vars_wrt`         | Logical | Write primitive variables	|
+| `load_weight_wrt`       | Logical | Write per-cell load-weight diagnostic field |
+| `sfc_partition_wrt`     | Logical | Report SFC-weighted load-balance partition |
+| `rank_time_wrt`         | Logical | Report per-rank RHS compute-time imbalance (max/mean) |
+| `load_balance`          | Logical | (Experimental/diagnostic) Weighted static Cartesian decomposition at init (requires `parallel_io = T`, >1 rank). Measured gain is small on CPU (~5%) and can be slower on GPU due to the occupancy floor; equal decomposition is near-optimal for uniform-cost workloads. |
+| `amr`                   | Logical | (Experimental) Enable block-structured AMR: a 2:1 refined level-1 block with gradient-based dynamic regrid, optional dt/2 subcycling, and conservative coupling with refluxing. Requires WENO reconstruction, SSP-RK3, model_eqns=2; num_fluids > 1 requires mpp_lim; supports physical viscosity. |
+| `amr_block_beg(i)`      | Integer | Refined-block start cell index in direction $i$ (level-0 index space) |
+| `amr_block_end(i)`      | Integer | Refined-block end cell index in direction $i$ (level-0 index space) |
+| `amr_regrid_int`        | Integer | Steps between AMR regrid events (0 = static block) |
+| `amr_tag_eps`           | Real    | Relative density-gradient threshold for AMR refinement tagging (default 0.1) |
+| `amr_buf`               | Integer | Coarse-cell padding around tagged cells when regridding (default 3) |
+| `amr_subcycle`          | Logical | Advance the coarse level at the case dt and the fine level at dt/2 (two substeps; Berger-Colella refluxing). Requires `amr`; incompatible with `cfl_dt`. |
+| `amr_max_blocks`       | Integer | Number of fixed refined-block slots preallocated (each max-block sized; ~N x device memory); must be >= 1 (default 4) |
+| `amr_cluster_eff`       | Real    | Berger-Rigoutsos min tag efficiency a clustered block box reaches before splitting stops; must satisfy 0 < eff <= 1 (default 0.7) |
+| `hybrid_weno`           | Logical | Use linear-optimal reconstruction in smooth cells, full WENO only at flagged discontinuities (requires WENO reconstruction) |
+| `hybrid_weno_eps`       | Real    | Smoothness threshold for hybrid WENO shock flagging; must be > 0 (default 1e-2) |
+| `hybrid_riemann`        | Logical | Use a cheap central/Rusanov flux in smooth cells, full HLLC only at flagged discontinuities (requires HLLC, 5eq/6eq) |
+| `hybrid_smooth_flux`    | Integer | Smooth-region flux for hybrid Riemann: 1 = central, 2 = Rusanov (default 2) |
+| `partition_tile_size`   | Integer | Tile side for the SFC partitioner (default 8) |
 | `alpha_rho_wrt(i)`      | Logical | Add the partial density of the fluid $i$ to the database \|
 | `rho_wrt`               | Logical | Add the mixture density to the database	 |
 | `mom_wrt(i)`            | Logical | Add the $i$-direction momentum to the database	 |
@@ -754,6 +774,104 @@ This is useful for large domains where only a portion of the domain is of intere
 It is not supported when `precision = 1` and `format = 1`.
 It also cannot be enabled with `flux_wrt`, `heat_ratio_wrt`, `pres_inf_wrt`, `c_wrt`, `omega_wrt`, `ib`, `schlieren_wrt`, `qm_wrt`, or 'liutex_wrt'.
 
+### 7.1. Adaptive Mesh Refinement (AMR) {#sec-amr}
+
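+MFC supports block-structured AMR (Experimental) via a single 2:1 refined level (level 1)
+that coexists with the base-level solve. The level starts as one block
+(`amr_block_beg`/`amr_block_end`); under dynamic regrid it may hold up to `amr_max_blocks`
+separated blocks.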
+The fine block is initialized from the base grid by piecewise-linear interpolation and
+remains continuously coupled to the base solve through conservative ghost-cell exchange
+and flux refluxing at the coarse–fine interface.
+
+**Restrictions.**
+AMR requires WENO reconstruction (`recon_type = 1`, any order), SSP-RK3 time-stepping
+(`time_stepper = 3`), and the 5-equation model (`model_eqns = 2`).
+Multiple fluids (`num_fluids > 1`) are supported and additionally require `mpp_lim`,
+whose volume-fraction clamp+renormalize maintains coarse/fine alpha consistency; the
+per-fluid masses are refluxed exactly, and volume fractions are prolonged with a
+sum-preserving closure (fine-level volume fractions sum to one by construction).
+Physical viscosity (`viscous = T`) is supported: the viscous stress/work travels through
+the momentum- and energy-equation source fluxes, which are captured into the same
+coarse–fine flux registers as the advective fluxes, so the interface is refluxed against
+the matched *total* (advective + viscous) flux and energy — including viscous work — is
+conserved. Fine-ghost velocity gradients at the coarse–fine boundary are taken from the
+conservative-linear prolongation of the coarse state (no special gradient reconstruction);
+that interface inconsistency is bounded and conservation is enforced by the flux-register
+matching. The density-gradient regrid tagger does not sense shear or boundary layers well,
+so viscous features may need a static or generously buffered block (error-estimator taggers
+are future work).
+Monodisperse (`nb = 1`) polytropic Euler-Euler bubbles (`bubbles_euler = T` with
+`polytropic = T`) are supported: the bubble moments are flux-based conserved variables
+refluxed through the same registers, and prolongation floors the radius moment so the
+reconstructed radius and number density stay positive (realizability). QBMM, non-polytropic,
+and polydisperse bubbles are not yet supported (their internal pressure / vapor-mass
+sub-fields and quadrature weights are not advanced on the fine level).
+Phase change (`relax`) is supported: the cell-local, mass/energy-conserving relaxation
+runs on the fine solution before restriction (matching the coarse once-per-step timing).
+AMR is incompatible with surface tension, Lagrangian bubbles, QBMM, non-polytropic bubbles,
+polydisperse bubbles, immersed boundaries, IGR, cylindrical coordinates, MHD, chemistry,
+`hybrid_weno`, `hybrid_riemann`, and `acoustic_source`.
+Multi-rank runs are supported: the fine level mirrors the base decomposition (each rank
+holds the fine cells covering the block's intersection with its own subdomain), so the
+block may span rank boundaries and move freely across them under dynamic regrid.
+The block may cover at most about half of any rank's subdomain per dimension (the fine
+advance reuses the rank-local solver scratch).
+
+**Static vs. dynamic block.**
+Setting `amr_regrid_int = 0` fixes the block at the initial `amr_block_beg`/`amr_block_end`
+position for the entire run (useful for convergence studies or GPU correctness testing).
+Setting `amr_regrid_int > 0` triggers a dynamic regrid every `amr_regrid_int` coarse steps:
+cells whose normalized density gradient exceeds `amr_tag_eps` are tagged, then clustered
+by a Berger–Rigoutsos recursive bisection into a list of separated block boxes (each grown
+by `amr_buf` coarse cells of buffer padding). Boxes whose padded extents would come within a
+ghost-cell buffer width of each other are merged, so separated features get their own refined
+box while nearby ones stay a single box (guaranteeing no fine–fine adjacency). Splitting stops once a
+box's tag efficiency (tagged/total cells) reaches `amr_cluster_eff`; the number of blocks
+is capped at `amr_max_blocks`.
+A positive `amr_tag_eps` and `amr_buf >= 1` are required whenever regridding is active.
+
+**Subcycling.**
+`amr_subcycle = T` enables Berger–Colella dt/2 subcycling: the coarse level advances
+one full step at the case `dt`, while the fine level takes two half-steps at `dt/2` with
+time-interpolated ghost values at the intermediate stage.
+Accumulated fine-level fluxes are applied back to the coarse level (reflux correction)
+after each coarse step.
+`amr_subcycle` is incompatible with `cfl_dt` (variable time step) and requires `amr = T`.
+
+**Block slots.**
+`amr_max_blocks` (default 4) sets the number of fixed refined-block slots preallocated
+for the run. Each slot is sized to the maximum block extent, so `N` slots require roughly
+`N` times the device memory of a single block. This design targets the compute savings of
+refining separated features independently; memory efficiency (compact per-block pools) is
+left as follow-up work. Dynamic regrid clusters the tagged cells into up to `amr_max_blocks` separated
+boxes (`amr_cluster_eff` sets the min tag efficiency each box reaches before splitting stops).
+
+**Restart.**
+Each save step writes a fine-level AMR restart file alongside the level-0 restart data
+(whose format is unchanged): the current — possibly regridded — block box and the fine
+solution, per rank (an `amr_fine.dat` in each rank's step directory, or a single shared
+`amr_*.dat` next to the level-0 MPI-IO restart file when `parallel_io` is on).
+Restarting (`t_step_start > 0`) restores the saved box and fine state seamlessly; it
+requires the same rank count (and decomposition) as the run that wrote the file, and
+aborts with a clear message otherwise.
+If the AMR file is absent (e.g., data from an older run), the run proceeds with a
+warning and re-initializes the fine level by prolongation from the coarse restart data,
+losing the accumulated fine-level accuracy.
+Note that level-0 output already contains the restricted (coarse-resolution) fine
+solution over the block, so existing visualization works unchanged; fine-resolution
+visualization output is future work.
+
+| Parameter               | Type    | Description                                    |
+| ---:                    | :----:  |          :---                                  |
+| `amr`                   | Logical | Enable AMR (see prose above for requirements and restrictions) |
+| `amr_block_beg(i)`      | Integer | Initial refined-block start cell index in direction $i$ (level-0 index space) |
+| `amr_block_end(i)`      | Integer | Initial refined-block end cell index in direction $i$ (level-0 index space); must satisfy $2\,(e_i - b_i + 1) - 1 \le N_i$, with $b_i$ = `amr_block_beg(i)` and $e_i$ = `amr_block_end(i)` |
+| `amr_regrid_int`        | Integer | Coarse steps between regrid events (0 = static block) |
+| `amr_tag_eps`           | Real    | Normalized density-gradient threshold for refinement tagging; must be > 0 when `amr_regrid_int > 0` (default 0.1) |
+| `amr_buf`               | Integer | Coarse-cell padding around tagged cells; must be >= 1 when `amr_regrid_int > 0` (default 3) |
+| `amr_subcycle`          | Logical | Advance fine level at dt/2 (two substeps per coarse step) with Berger–Colella refluxing |
+| `amr_max_blocks`       | Integer | Number of fixed refined-block slots preallocated (each max-block sized; ~N x device memory); must be >= 1 (default 4) |
+| `amr_cluster_eff`       | Real    | Berger-Rigoutsos min tag efficiency a clustered block box reaches before splitting stops; must satisfy 0 < eff <= 1 (default 0.7) |
+
 ### 8. Acoustic Source {#sec-acoustic-source}
 
 | Parameter                             | Type    | Description |

@@ -3,6 +3,7 @@
         "category": "Solver Core",
         "modules": [
             "m_rhs",
+            "m_active_box",
             "m_time_steppers",
             "m_weno",
             "m_riemann_solvers",
@@ -58,7 +59,11 @@
             "m_start_up",
             "m_data_output",
             "m_data_input",
-            "m_delay_file_access"
+            "m_delay_file_access",
+            "m_load_weight",
+            "m_load_balance",
+            "m_sfc_partition",
+            "m_rank_timing"
         ]
     },
     {
@@ -69,6 +74,9 @@
             "m_global_parameters_common",
             "m_mpi_common",
             "m_mpi_proxy",
+            "m_box",
+            "m_amr",
+            "m_amr_registers",
             "m_constants",
             "m_precision_select",
             "m_helper",

@@ -73,6 +73,10 @@ contains
         type(integer_field), dimension(1:num_dims,1:2), intent(in)                                           :: bc_type
         type(scalar_field), optional, intent(inout)                                                          :: q_T_sf
 
+#ifdef MFC_SIMULATION
+        if (amr_in_fine_advance) return  ! AMR fine block: ghosts pre-filled from the coarse level
+#endif
+
         call s_populate_bc_direction(1, -1, bc_x, bc_type(1, 1), q_prim_vf, pb_in, mv_in, q_T_sf)
         call s_populate_bc_direction(1, 1, bc_x, bc_type(1, 2), q_prim_vf, pb_in, mv_in, q_T_sf)
 

@@ -0,0 +1,85 @@
+!>
+!!@file
+!!@brief Contains module m_box
+
+#:include 'macros.fpp'
+
+!> @brief Owned domain-decomposition Box abstraction and partition arithmetic (v1: one box per rank).
+module m_box
+
+    use m_derived_types, only: t_box
+    use m_global_parameters, only: wp
+
+    implicit none
+
+    private
+    public :: t_box, f_equal_splits, f_weighted_splits, f_box_from_splits
+
+contains
+
+    !> Cumulative offsets that distribute g cells over n_parts contiguous parts as evenly as integer arithmetic allows: every
+    !! part gets g/n_parts cells and the first mod(g,n_parts) parts get one extra, i.e. off(r) = r*(g/n_parts) + min(r,
+    !! mod(g,n_parts)). Part r covers cells off(r) .. off(r+1)-1. n_parts must be nonzero (it is used as a divisor).
+    pure function f_equal_splits(g, n_parts) result(off)
+
+        integer, intent(in)           :: g, n_parts
+        integer, dimension(0:n_parts) :: off
+        integer                       :: base, extra, part
+
+        ! base cells per part, plus the number of leading parts that receive one additional cell
+        base = g/n_parts
+        extra = mod(g, n_parts)
+
+        off = [(part*base + min(part, extra), part=0, n_parts)]
+
+    end function f_equal_splits
+
+    !> Cumulative offsets splitting the marginal weight w into n_parts contiguous chunks of near-equal weight, each at least
+    !! l_min cells wide. off(0)=0, off(n_parts)=size(w); chunk r covers cells off(r) .. off(r+1)-1. Feasibility (size(w) >=
+    !! n_parts*l_min) is the caller's responsibility (pure; no abort) - the width guarantee only holds for feasible input.
+    !! @param w       Per-cell weight along one axis (0-based).
+    !! @param n_parts Number of chunks to produce.
+    !! @param l_min   Minimum number of cells per chunk.
+    pure function f_weighted_splits(w, n_parts, l_min) result(off)
+
+        real(wp), dimension(0:), intent(in) :: w
+        integer, intent(in)                 :: n_parts, l_min
+        integer, dimension(0:n_parts)       :: off
+        real(wp)                            :: csum, total
+        integer                             :: g, i, r
+
+        g = size(w)
+        off(0) = 0
+        off(n_parts) = g
+        if (n_parts == 1) return
+
+        ! Pass 1: place cut r right after the first cell at which the running weight reaches r/n_parts of the total. One cell
+        ! may satisfy several targets at once, hence the inner while loop.
+        total = sum(w)
+        r = 1
+        csum = 0._wp
+        do i = 0, g - 1
+            csum = csum + w(i)
+            do while (r < n_parts .and. csum >= real(r, wp)*total/real(n_parts, wp))
+                off(r) = i + 1
+                r = r + 1
+            end do
+        end do
+
+        ! Any cut whose target was never reached is parked at the end of the axis; pass 2 pulls it back.
+        do while (r < n_parts)
+            off(r) = g
+            r = r + 1
+        end do
+
+        ! Pass 2: enforce the minimum chunk width, left to right. The upper clamp leaves room for l_min cells in each of the
+        ! remaining chunks. The lower bound is measured from the previous (already final) cut, so chunk r-1 really gets >=
+        ! l_min cells; comparing only against off(r-1) would merely keep it non-empty. Since off(0) = 0 this also implies
+        ! off(r) >= r*l_min, so no separate cumulative lower clamp is needed.
+        do r = 1, n_parts - 1
+            off(r) = min(off(r), g - (n_parts - r)*l_min)
+            off(r) = max(off(r), off(r - 1) + l_min)
+        end do
+
+    end function f_weighted_splits
+
+    !> Build a rank's box from the three per-axis cumulative offset arrays and the rank's 0-based Cartesian coordinates. Along
+    !! axis d the box spans lo(d) = off_d(coords(d)) through hi(d) = off_d(coords(d)+1) - 1 (inclusive). A collapsed axis with
+    !! off_d = [0,1] yields lo = hi = 0.
+    pure function f_box_from_splits(off_x, off_y, off_z, coords) result(box)
+
+        integer, dimension(0:), intent(in) :: off_x, off_y, off_z
+        integer, intent(in)                :: coords(3)
+        type(t_box)                        :: box
+
+        ! Low corner: the offset at which this rank's chunk starts on each axis.
+        box%lo = [off_x(coords(1)), off_y(coords(2)), off_z(coords(3))]
+
+        ! High corner: one cell before the start of the next chunk on each axis.
+        box%hi = [off_x(coords(1) + 1), off_y(coords(2) + 1), off_z(coords(3) + 1)] - 1
+
+    end function f_box_from_splits
+
+end module m_box
@@ -537,4 +537,11 @@ module m_derived_types
         real(wp), dimension(1:num_fluids_max)   :: perturb_dens_scale
         real(wp), dimension(1:num_fluids_max,3) :: perturb_dens_offset
     end type simplex_noise_params
+
+    !> Axis-aligned rectangle in global cell-index space, described by its low and high cell index on each axis. In v1, one
+    !! t_box = one rank's subdomain. Deliberately a flat leaf type: no allocatable/pointer components, host-only, never
+    !! namelist/broadcast.
+    type t_box
+        integer, dimension(3) :: lo  !< global low cell index per axis (x,y,z)
+        integer, dimension(3) :: hi  !< global high cell index per axis (x,y,z)
+    end type t_box
 end module m_derived_types
@@ -80,6 +80,7 @@ module m_global_parameters_common
     $:GPU_DECLARE(create='[hyperelasticity, elasticity, low_Mach]')
     $:GPU_DECLARE(create='[cont_damage, hyper_cleaning]')
     $:GPU_DECLARE(create='[relax, relax_model, palpha_eps, ptgalpha_eps]')
+    $:GPU_DECLARE(create='[load_weight_wrt]')
     $:GPU_DECLARE(create='[down_sample]')
     $:GPU_DECLARE(create='[fd_order]')
     $:GPU_DECLARE(create='[rhoref, pref]')
@@ -348,6 +349,11 @@ contains
 
         allocate (proc_coords(1:num_dims))
 
+#ifdef MFC_MPI
+        ! start_idx is always needed (e.g. for sfc_partition_wrt); parallel I/O setup below is optional.
+        allocate (start_idx(1:num_dims))
+#endif
+
         if (parallel_io .neqv. .true.) return
 
 #ifdef MFC_MPI
@@ -359,8 +365,6 @@ contains
 
         ! Option for UNIX file system (Hooke/Thomson) WRITE(mpiiofs, '(A)') '/ufs_' mpiiofs = TRIM(mpiiofs) mpi_info_int =
         ! MPI_INFO_NULL
-
-        allocate (start_idx(1:num_dims))
 #endif
 
     end subroutine s_initialize_parallel_io_common
@@ -373,9 +377,7 @@ contains
         deallocate (proc_coords)
 
 #ifdef MFC_MPI
-        if (parallel_io) then
-            deallocate (start_idx)
-        end if
+        deallocate (start_idx)
 #endif
 
     end subroutine s_finalize_global_parameters_common