From bfd435dd41bcc61d1c07ea9a05e6f32df9e3b46d Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 9 Jan 2026 04:52:49 -0500 Subject: [PATCH 01/25] feat: use fast division on Nvidia GPUs in newton_div context Adds specialised implementation of the approximate `newton_div` for the CUDA backend. It allows to avoid slow-path of the `rcp` and `div` and provides few per-cent speed-up of advection kernels. --- ext/OceananigansCUDAExt.jl | 20 ++++++++++++++++++++ src/Utils/newton_div.jl | 3 +++ 2 files changed, 23 insertions(+) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index fd705879c04..f3a88a1fe90 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -132,4 +132,24 @@ end @inline UT.sync_device!(::CUDAGPU) = CUDA.synchronize() @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() +# Use faster versions of `newton_div` on Nvidia GPUs +CUDA.@device_override UT.newton_div(::Type{Float32}, a::Float64, b::Float64) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{Float64}, a::Float64, b::Float64) = a * fast_inv_cuda(b) + +function fast_inv_cuda(a::Float64) + # Get the approximate reciprocal + # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + # This instruction chops off last 32bits of mantissa and computes inverse + # while treating all subnormal numbers as 0.0 + # If reciprocal would be subnormal, underflows to 0.0 + # 32 least significant bits of the result are filled with 0s + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a) + + # Approximate the missing 32bits of mantissa with a single cubic iteration + e = fma(inv_a, -a, 1.0) + e = fma(e, e, e) + inv_a = fma(e, inv_a, inv_a) + return inv_a +end + end # module OceananigansCUDAExt diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index 86da3e1d04c..9afa56bdc8b 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -21,3 +21,6 @@ end # Fallback for no precision lowering @inline newton_div(::Type{FT}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) + +# Note that the implementation may be overridden for some specific backends +# Refer to extension modules (e.g. `OceananigansCUDAExt`) From a2ee0a5876ea5a652a099fbb6e42a2640f8ddead Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Thu, 15 Jan 2026 06:05:42 -0500 Subject: [PATCH 02/25] fix: do not specify the type of numerator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we just multiply reciprocal of denominator by the numerator we don't need to know the exact representation of the numerator. If specified can lead to unexpected dispatch to the fallback method if the numerator is e.g. π literal (of type Irrational). --- ext/OceananigansCUDAExt.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index f3a88a1fe90..5fbf2b999fe 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -133,8 +133,8 @@ end @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() # Use faster versions of `newton_div` on Nvidia GPUs -CUDA.@device_override UT.newton_div(::Type{Float32}, a::Float64, b::Float64) = a * fast_inv_cuda(b) -CUDA.@device_override UT.newton_div(::Type{Float64}, a::Float64, b::Float64) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{Float32}, a, b::Float64) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{Float64}, a, b::Float64) = a * fast_inv_cuda(b) function fast_inv_cuda(a::Float64) # Get the approximate reciprocal From 09305009cd4329e1ec859b96a1ecb0a587f85425 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 30 Jan 2026 11:51:37 -0500 Subject: [PATCH 03/25] refactor: select type of newton_div in a WENO scheme by type parameter --- ext/OceananigansCUDAExt.jl | 3 +- src/Advection/weno_interpolants.jl | 4 +-- src/Advection/weno_reconstruction.jl | 37 ++++++++++++----------- src/Utils/Utils.jl | 2 +- src/Utils/newton_div.jl | 44 ++++++++++++++++++++++------ 5 files changed, 59 insertions(+), 31 deletions(-) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index 5fbf2b999fe..8abecc71ece 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -133,8 +133,7 @@ end @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() # Use faster versions of `newton_div` on Nvidia GPUs -CUDA.@device_override UT.newton_div(::Type{Float32}, a, b::Float64) = a * fast_inv_cuda(b) -CUDA.@device_override UT.newton_div(::Type{Float64}, a, b::Float64) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedNewtonDiv}, a, b::Float64) = a * fast_inv_cuda(b) function fast_inv_cuda(a::Float64) # Get the approximate reciprocal diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl index dc4be2ddd71..85562e4e53b 100644 --- a/src/Advection/weno_interpolants.jl +++ b/src/Advection/weno_interpolants.jl @@ -291,7 +291,7 @@ end @inline function metaprogrammed_zweno_alpha_loop(buffer) elem = Vector(undef, buffer) for stencil = 1:buffer - elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(FT2, τ, β[$stencil] + ϵ))^2)) + elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(NDC, τ, β[$stencil] + ϵ))^2)) end return :($(elem...),) @@ -301,7 +301,7 @@ for buffer in advection_buffers[2:end] @eval begin @inline beta_sum(scheme::WENO{$buffer, FT}, β₁, β₂) where FT = @inbounds $(metaprogrammed_beta_sum(buffer)) @inline beta_loop(scheme::WENO{$buffer, FT}, ψ) where FT = @inbounds $(metaprogrammed_beta_loop(buffer)) - @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, FT2}, β, τ) where {FT, FT2} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) + @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, FT2, NDC}, β, τ) where {FT, FT2, NDC} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) end end diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 65a90346923..b05f1c45c8b 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -4,15 +4,15 @@ import Oceananigans ##### Weighted Essentially Non-Oscillatory (WENO) advection scheme ##### -struct WENO{N, FT, FT2, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} +struct WENO{N, FT, FT2, NDC, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} bounds :: PP buffer_scheme :: CA advecting_velocity_scheme :: SI - function WENO{N, FT, FT2}(bounds::PP, buffer_scheme::CA, - advecting_velocity_scheme :: SI) where {N, FT, FT2, PP, CA, SI} + function WENO{N, FT, FT2, NDC}(bounds::PP, buffer_scheme::CA, + advecting_velocity_scheme :: SI) where {N, FT, FT2, NDC, PP, CA, SI} - return new{N, FT, FT2, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) + return new{N, FT, FT2, NDC, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) end end @@ -32,7 +32,8 @@ Arguments Keyword arguments ================= - +- `newton_div`: The type of approximate division to use in performance-critical parts of the scheme. + Default: `Oceananigans.Utils.NewtonDivWithConversion{FT2}` - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -92,7 +93,9 @@ WENO{5, Float64, Float32}(order=9, bounds=(0.0, 1.0)) └── advecting_velocity_scheme: Centered(order=8) ``` """ -function WENO(FT::DataType=Oceananigans.defaults.FloatType, FT2::DataType=Float32; +function WENO(FT::DataType=Oceananigans.defaults.FloatType, + FT2::DataType=Float32; + newton_div::DataType=Oceananigans.Utils.NewtonDivWithConversion{FT2}, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, @@ -121,15 +124,15 @@ function WENO(FT::DataType=Oceananigans.defaults.FloatType, FT2::DataType=Float3 end N = Int((order + 1) ÷ 2) - return WENO{N, FT, FT2}(bounds, buffer_scheme, advecting_velocity_scheme) + return WENO{N, FT, FT2, newton_div}(bounds, buffer_scheme, advecting_velocity_scheme) end end weno_order(::WENO{N}) where N = 2N-1 Base.eltype(::WENO{N, FT}) where {N, FT} = FT eltype2(::WENO{N, FT, FT2}) where {N, FT, FT2} = FT2 -Base.summary(a::WENO{N, FT, FT2, Nothing}) where {N, FT, FT2} = string("WENO{$N, $FT, $FT2}(order=", 2N-1, ")") -Base.summary(a::WENO{N, FT, FT2, PP}) where {N, FT, FT2, PP} = string("WENO{$N, $FT, $FT2}(order=", 2N-1, ", bounds=", string(a.bounds), ")") +Base.summary(a::WENO{N, FT, FT2, NDC, Nothing}) where {N, FT, FT2, NDC} = string("WENO{$N, $FT, $FT2, $NDC}(order=", 2N-1, ")") +Base.summary(a::WENO{N, FT, FT2, NDC, PP}) where {N, FT, FT2, NDC, PP} = string("WENO{$N, $FT, $FT2, $NDC}(order=", 2N-1, ", bounds=", string(a.bounds), ")") function Base.show(io::IO, a::WENO) print(io, summary(a), '\n') @@ -145,12 +148,12 @@ function Base.show(io::IO, a::WENO) print(io, "└── advecting_velocity_scheme: ", summary(a.advecting_velocity_scheme)) end -Adapt.adapt_structure(to, scheme::WENO{N, FT, FT2}) where {N, FT, FT2} = - WENO{N, FT, FT2}(Adapt.adapt(to, scheme.bounds), - Adapt.adapt(to, scheme.buffer_scheme), - Adapt.adapt(to, scheme.advecting_velocity_scheme)) +Adapt.adapt_structure(to, scheme::WENO{N, FT, FT2, NDC}) where {N, FT, FT2, NDC} = + WENO{N, FT, FT2, NDC}(Adapt.adapt(to, scheme.bounds), + Adapt.adapt(to, scheme.buffer_scheme), + Adapt.adapt(to, scheme.advecting_velocity_scheme)) -on_architecture(to, scheme::WENO{N, FT, FT2}) where {N, FT, FT2} = - WENO{N, FT, FT2}(on_architecture(to, scheme.bounds), - on_architecture(to, scheme.buffer_scheme), - on_architecture(to, scheme.advecting_velocity_scheme)) +on_architecture(to, scheme::WENO{N, FT, FT2, NDC}) where {N, FT, FT2, NDC} = + WENO{N, FT, FT2, NDC}(on_architecture(to, scheme.bounds), + on_architecture(to, scheme.buffer_scheme), + on_architecture(to, scheme.advecting_velocity_scheme)) diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl index 9eb600c78fd..a12a7bb8085 100644 --- a/src/Utils/Utils.jl +++ b/src/Utils/Utils.jl @@ -11,7 +11,7 @@ export seconds_to_nanosecond, period_to_seconds, time_difference_seconds, add_ti export TimeInterval, IterationInterval, WallTimeInterval, SpecifiedTimes, AndSchedule, OrSchedule, ConsecutiveIterations export apply_regionally!, construct_regionally, @apply_regionally, MultiRegionObject export isregional, getregion, _getregion, regions, sync_device! -export newton_div +export newton_div, NoNewtonDiv, NewtonDivWithConversion, BackendOptimizedNewtonDiv export TabulatedFunction export interpolator, _interpolate export ϕ₁, ϕ₂, ϕ₃, ϕ₄, ϕ₅, ϕ₆, ϕ₇, ϕ₈ diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index 9afa56bdc8b..cd2f57756bc 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -1,11 +1,37 @@ """ - newton_div(inv_FT, a, b::FT) +Abstract supertype grouping configuration options for `newton_div`. +Use one of the concrete subtypes as the first argument to `newton_div` +to select the implementation. +""" +abstract type NewtonDivConfig end + +""" +Configuration selecting regular, full-precision division. +""" +struct NoNewtonDiv <: NewtonDivConfig end + +""" +Configuration selecting approximate division via one Newton iteration. +The reciprocal `1/b` is first evaluated in a lower-precision type `FT` +to obtain a fast initial guess, then refined with a single Newton step +in the original precision. +""" +struct NewtonDivWithConversion{FT} <: NewtonDivConfig end -Compute an approximation of `a / b` that uses `inv_FT` type to compute -`1/b`, and then performs a single Newton iteration to add a few more bits of precision -afterwards. """ -@inline function newton_div(inv_FT, a, b::FT) where FT +Configuration selecting a backend-optimized implementation of approximate division. +The actual algorithm may differ across CPU and different GPU backends. +""" +struct BackendOptimizedNewtonDiv <: NewtonDivConfig end + +""" + newton_div(::Type{T}, a, b) + +Compute an approximate division `a/b` using a method specified by selector type `T`. +""" +function newton_div end + +@inline function newton_div(::Type{NewtonDivWithConversion{inv_FT}}, a, b::FT) where {inv_FT, FT} # Low precision division: b_low = convert(inv_FT, b) inv_b = Base.FastMath.inv_fast(b_low) @@ -19,8 +45,8 @@ afterwards. return x end -# Fallback for no precision lowering -@inline newton_div(::Type{FT}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) +# Case of matching precisions +@inline newton_div(::Type{NewtonDivWithConversion{FT}}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) -# Note that the implementation may be overridden for some specific backends -# Refer to extension modules (e.g. `OceananigansCUDAExt`) +# Exact division if requested +@inline newton_div(::Type{NoNewtonDiv}, a, b) = a / b From 29bfd5d9bd3993282d4565a4fc974badab72d99d Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Mon, 2 Feb 2026 05:42:44 -0500 Subject: [PATCH 04/25] fix: newton_div type propagation into buffer schemes --- src/Advection/weno_reconstruction.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index b05f1c45c8b..81d82d7294a 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -119,7 +119,7 @@ function WENO(FT::DataType=Oceananigans.defaults.FloatType, # At minimum order, switch to Centered scheme buffer_scheme = Centered(FT; order=2) else - buffer_scheme = WENO(FT, FT2; order=order-2, bounds, minimum_buffer_upwind_order) + buffer_scheme = WENO(FT, FT2; order=order-2, bounds, minimum_buffer_upwind_order, newton_div) end end From d5fe7b84730b58c902beb3a1ec00fe5860b179bc Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 30 Jan 2026 18:19:59 -0500 Subject: [PATCH 05/25] test: update doctests --- src/Advection/vector_invariant_advection.jl | 8 ++--- src/Advection/weno_reconstruction.jl | 35 +++++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index d674568110e..e4e37abf93a 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -194,11 +194,11 @@ julia> using Oceananigans julia> WENOVectorInvariant() WENOVectorInvariant{5, Float64, Float32}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Float32}(order=9) +├── vorticity_scheme: WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── vertical_advection_scheme: WENO{3, Float64, Float32}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Float32}(order=5) -├── divergence_scheme: WENO{3, Float64, Float32}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── divergence_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 81d82d7294a..d2a30ef0903 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -52,8 +52,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Float32}(order=5) -├── buffer_scheme: WENO{2, Float64, Float32}(order=3) +WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -63,10 +63,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Float32}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Float32}(order=3) +WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -76,22 +76,31 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Float32}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5) +WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Float32}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Float32}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` + +To build a WENO scheme that uses approximate division on a GPU to execute faster: +```jldoctest weno +julia> WENO(;newton_div=Oceananigans.Utils.BackendOptimizedNewtonDiv) +WENO{3, Float64, Float32, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) +├── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=3) +│ └── buffer_scheme: Centered(order=2) +└── advecting_velocity_scheme: Centered(order=4) +``` """ function WENO(FT::DataType=Oceananigans.defaults.FloatType, FT2::DataType=Float32; From 2ab9cf5d4e48dd1089c03a685d853e3c9ce594e3 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 30 Jan 2026 18:46:22 -0500 Subject: [PATCH 06/25] feat: add CUDA fast division for f32 --- ext/OceananigansCUDAExt.jl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index 8abecc71ece..5f9a8ea194f 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -133,7 +133,7 @@ end @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() # Use faster versions of `newton_div` on Nvidia GPUs -CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedNewtonDiv}, a, b::Float64) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedNewtonDiv}, a, b) = a * fast_inv_cuda(b) function fast_inv_cuda(a::Float64) # Get the approximate reciprocal @@ -151,4 +151,12 @@ function fast_inv_cuda(a::Float64) return inv_a end +function fast_inv_cuda(a::Float32) + # This instruction just computes reciprocal flushing subnormals to 0.0 + # Hence for subnormal inputs it returns Inf + # For large number whose reciprocal is subnormal it underflows to 0.0 + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.f32", llvmcall, Float32, (Float32,), a) + return inv_a +end + end # module OceananigansCUDAExt From bbc4642f95ad0be4182c184f15365b93b384b318 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Thu, 5 Feb 2026 11:14:33 -0500 Subject: [PATCH 07/25] refactor: remove lower-precision WENO type parameter This is a breaking change! It requires requires minor version bump. It has been made since the 2nd floating precision type parameter is no longer required. It has been replaced with type-based specifier for the division type in WENO reconstruction scheme. --- src/Advection/vector_invariant_advection.jl | 13 ++--- src/Advection/weno_interpolants.jl | 2 +- src/Advection/weno_reconstruction.jl | 65 ++++++++++----------- 3 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index e4e37abf93a..f5e68d2163f 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -137,8 +137,7 @@ function Base.summary(a::WENOVectorInvariant{N}) where N vertical_order = weno_order(a.vertical_advection_scheme) order = weno_order(a.vorticity_scheme) FT = eltype(a.vorticity_scheme) - FT2 = eltype2(a.vorticity_scheme) - return string("WENOVectorInvariant{$N, $FT, $FT2}(vorticity_order=$vorticity_order, vertical_order=$vertical_order)") + return string("WENOVectorInvariant{$N, $FT}(vorticity_order=$vorticity_order, vertical_order=$vertical_order)") end function Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} @@ -193,12 +192,12 @@ Example julia> using Oceananigans julia> WENOVectorInvariant() -WENOVectorInvariant{5, Float64, Float32}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +WENOVectorInvariant{5, Float64}(vorticity_order=9, vertical_order=5) +├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── vertical_advection_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── divergence_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl index 85562e4e53b..ac1656b7476 100644 --- a/src/Advection/weno_interpolants.jl +++ b/src/Advection/weno_interpolants.jl @@ -301,7 +301,7 @@ for buffer in advection_buffers[2:end] @eval begin @inline beta_sum(scheme::WENO{$buffer, FT}, β₁, β₂) where FT = @inbounds $(metaprogrammed_beta_sum(buffer)) @inline beta_loop(scheme::WENO{$buffer, FT}, ψ) where FT = @inbounds $(metaprogrammed_beta_loop(buffer)) - @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, FT2, NDC}, β, τ) where {FT, FT2, NDC} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) + @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, NDC}, β, τ) where {FT, NDC} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) end end diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index d2a30ef0903..497862b20dc 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -4,20 +4,20 @@ import Oceananigans ##### Weighted Essentially Non-Oscillatory (WENO) advection scheme ##### -struct WENO{N, FT, FT2, NDC, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} +struct WENO{N, FT, NDC, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} bounds :: PP buffer_scheme :: CA advecting_velocity_scheme :: SI - function WENO{N, FT, FT2, NDC}(bounds::PP, buffer_scheme::CA, - advecting_velocity_scheme :: SI) where {N, FT, FT2, NDC, PP, CA, SI} + function WENO{N, FT, NDC}(bounds::PP, buffer_scheme::CA, + advecting_velocity_scheme :: SI) where {N, FT, NDC, PP, CA, SI} - return new{N, FT, FT2, NDC, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) + return new{N, FT, NDC, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) end end """ - WENO([FT=Float64, FT2=Float32;] + WENO([FT=Float64;] order = 5, bounds = nothing, minimum_buffer_upwind_order = 3) @@ -28,12 +28,11 @@ Arguments ========= - `FT`: The floating point type used in the scheme. Default: `Oceananigans.defaults.FloatType` -- `FT2`: The floating point type used in some performance-critical parts of the scheme. Default: `Float32` Keyword arguments ================= - `newton_div`: The type of approximate division to use in performance-critical parts of the scheme. - Default: `Oceananigans.Utils.NewtonDivWithConversion{FT2}` + Default: `Oceananigans.Utils.NewtonDivWithConversion{Float32}` - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -52,8 +51,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) +WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -63,10 +62,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) +WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -76,19 +75,19 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -96,15 +95,14 @@ WENO{5, Float64, Float32, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(o To build a WENO scheme that uses approximate division on a GPU to execute faster: ```jldoctest weno julia> WENO(;newton_div=Oceananigans.Utils.BackendOptimizedNewtonDiv) -WENO{3, Float64, Float32, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) -├── buffer_scheme: WENO{2, Float64, Float32, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=3) +WENO{3, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` """ -function WENO(FT::DataType=Oceananigans.defaults.FloatType, - FT2::DataType=Float32; - newton_div::DataType=Oceananigans.Utils.NewtonDivWithConversion{FT2}, +function WENO(FT::DataType=Oceananigans.defaults.FloatType; + newton_div::DataType=Oceananigans.Utils.NewtonDivWithConversion{Float32}, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, @@ -128,20 +126,19 @@ function WENO(FT::DataType=Oceananigans.defaults.FloatType, # At minimum order, switch to Centered scheme buffer_scheme = Centered(FT; order=2) else - buffer_scheme = WENO(FT, FT2; order=order-2, bounds, minimum_buffer_upwind_order, newton_div) + buffer_scheme = WENO(FT; order=order-2, bounds, minimum_buffer_upwind_order, newton_div) end end N = Int((order + 1) ÷ 2) - return WENO{N, FT, FT2, newton_div}(bounds, buffer_scheme, advecting_velocity_scheme) + return WENO{N, FT, newton_div}(bounds, buffer_scheme, advecting_velocity_scheme) end end weno_order(::WENO{N}) where N = 2N-1 Base.eltype(::WENO{N, FT}) where {N, FT} = FT -eltype2(::WENO{N, FT, FT2}) where {N, FT, FT2} = FT2 -Base.summary(a::WENO{N, FT, FT2, NDC, Nothing}) where {N, FT, FT2, NDC} = string("WENO{$N, $FT, $FT2, $NDC}(order=", 2N-1, ")") -Base.summary(a::WENO{N, FT, FT2, NDC, PP}) where {N, FT, FT2, NDC, PP} = string("WENO{$N, $FT, $FT2, $NDC}(order=", 2N-1, ", bounds=", string(a.bounds), ")") +Base.summary(a::WENO{N, FT, NDC, Nothing}) where {N, FT, NDC} = string("WENO{$N, $FT, $NDC}(order=", 2N-1, ")") +Base.summary(a::WENO{N, FT, NDC, PP}) where {N, FT, NDC, PP} = string("WENO{$N, $FT, $NDC}(order=", 2N-1, ", bounds=", string(a.bounds), ")") function Base.show(io::IO, a::WENO) print(io, summary(a), '\n') @@ -157,12 +154,12 @@ function Base.show(io::IO, a::WENO) print(io, "└── advecting_velocity_scheme: ", summary(a.advecting_velocity_scheme)) end -Adapt.adapt_structure(to, scheme::WENO{N, FT, FT2, NDC}) where {N, FT, FT2, NDC} = - WENO{N, FT, FT2, NDC}(Adapt.adapt(to, scheme.bounds), +Adapt.adapt_structure(to, scheme::WENO{N, FT, NDC}) where {N, FT, NDC} = + WENO{N, FT, NDC}(Adapt.adapt(to, scheme.bounds), Adapt.adapt(to, scheme.buffer_scheme), Adapt.adapt(to, scheme.advecting_velocity_scheme)) -on_architecture(to, scheme::WENO{N, FT, FT2, NDC}) where {N, FT, FT2, NDC} = - WENO{N, FT, FT2, NDC}(on_architecture(to, scheme.bounds), +on_architecture(to, scheme::WENO{N, FT, NDC}) where {N, FT, NDC} = + WENO{N, FT, NDC}(on_architecture(to, scheme.bounds), on_architecture(to, scheme.buffer_scheme), on_architecture(to, scheme.advecting_velocity_scheme)) From ecf4e8e29beeb59fe24a21a284d089c34f77c34e Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 27 Feb 2026 06:21:13 -0500 Subject: [PATCH 08/25] refactor: use `weight_computation` to refer to division type in WENO --- src/Advection/weno_interpolants.jl | 4 +-- src/Advection/weno_reconstruction.jl | 40 ++++++++++++++-------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl index ac1656b7476..bf38ac3fad3 100644 --- a/src/Advection/weno_interpolants.jl +++ b/src/Advection/weno_interpolants.jl @@ -291,7 +291,7 @@ end @inline function metaprogrammed_zweno_alpha_loop(buffer) elem = Vector(undef, buffer) for stencil = 1:buffer - elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(NDC, τ, β[$stencil] + ϵ))^2)) + elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(WCT, τ, β[$stencil] + ϵ))^2)) end return :($(elem...),) @@ -301,7 +301,7 @@ for buffer in advection_buffers[2:end] @eval begin @inline beta_sum(scheme::WENO{$buffer, FT}, β₁, β₂) where FT = @inbounds $(metaprogrammed_beta_sum(buffer)) @inline beta_loop(scheme::WENO{$buffer, FT}, ψ) where FT = @inbounds $(metaprogrammed_beta_loop(buffer)) - @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, NDC}, β, τ) where {FT, NDC} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) + @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, WCT}, β, τ) where {FT, WCT} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) end end diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 497862b20dc..7f2a774866a 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -4,15 +4,15 @@ import Oceananigans ##### Weighted Essentially Non-Oscillatory (WENO) advection scheme ##### -struct WENO{N, FT, NDC, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} +struct WENO{N, FT, WCT, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} bounds :: PP buffer_scheme :: CA advecting_velocity_scheme :: SI - function WENO{N, FT, NDC}(bounds::PP, buffer_scheme::CA, - advecting_velocity_scheme :: SI) where {N, FT, NDC, PP, CA, SI} + function WENO{N, FT, WCT}(bounds::PP, buffer_scheme::CA, + advecting_velocity_scheme :: SI) where {N, FT, WCT, PP, CA, SI} - return new{N, FT, NDC, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) + return new{N, FT, WCT, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) end end @@ -31,8 +31,8 @@ Arguments Keyword arguments ================= -- `newton_div`: The type of approximate division to use in performance-critical parts of the scheme. - Default: `Oceananigans.Utils.NewtonDivWithConversion{Float32}` +- `weight_computation`: The type of approximate division to used when computing WENO weights. + Default: `Oceananigans.Utils.NewtonDivWithConversion{Float32}` - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -94,7 +94,7 @@ WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9, b To build a WENO scheme that uses approximate division on a GPU to execute faster: ```jldoctest weno -julia> WENO(;newton_div=Oceananigans.Utils.BackendOptimizedNewtonDiv) +julia> WENO(;weight_computation=Oceananigans.Utils.BackendOptimizedNewtonDiv) WENO{3, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) ├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=3) │ └── buffer_scheme: Centered(order=2) @@ -102,7 +102,7 @@ WENO{3, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) ``` """ function WENO(FT::DataType=Oceananigans.defaults.FloatType; - newton_div::DataType=Oceananigans.Utils.NewtonDivWithConversion{Float32}, + weight_computation::DataType=Oceananigans.Utils.NewtonDivWithConversion{Float32}, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, @@ -126,19 +126,19 @@ function WENO(FT::DataType=Oceananigans.defaults.FloatType; # At minimum order, switch to Centered scheme buffer_scheme = Centered(FT; order=2) else - buffer_scheme = WENO(FT; order=order-2, bounds, minimum_buffer_upwind_order, newton_div) + buffer_scheme = WENO(FT; order=order-2, bounds, minimum_buffer_upwind_order, weight_computation) end end N = Int((order + 1) ÷ 2) - return WENO{N, FT, newton_div}(bounds, buffer_scheme, advecting_velocity_scheme) + return WENO{N, FT, weight_computation}(bounds, buffer_scheme, advecting_velocity_scheme) end end weno_order(::WENO{N}) where N = 2N-1 Base.eltype(::WENO{N, FT}) where {N, FT} = FT -Base.summary(a::WENO{N, FT, NDC, Nothing}) where {N, FT, NDC} = string("WENO{$N, $FT, $NDC}(order=", 2N-1, ")") -Base.summary(a::WENO{N, FT, NDC, PP}) where {N, FT, NDC, PP} = string("WENO{$N, $FT, $NDC}(order=", 2N-1, ", bounds=", string(a.bounds), ")") +Base.summary(a::WENO{N, FT, WCT, Nothing}) where {N, FT, WCT} = string("WENO{$N, $FT, $WCT}(order=", 2N-1, ")") +Base.summary(a::WENO{N, FT, WCT, PP}) where {N, FT, WCT, PP} = string("WENO{$N, $FT, $WCT}(order=", 2N-1, ", bounds=", string(a.bounds), ")") function Base.show(io::IO, a::WENO) print(io, summary(a), '\n') @@ -154,12 +154,12 @@ function Base.show(io::IO, a::WENO) print(io, "└── advecting_velocity_scheme: ", summary(a.advecting_velocity_scheme)) end -Adapt.adapt_structure(to, scheme::WENO{N, FT, NDC}) where {N, FT, NDC} = - WENO{N, FT, NDC}(Adapt.adapt(to, scheme.bounds), - Adapt.adapt(to, scheme.buffer_scheme), - Adapt.adapt(to, scheme.advecting_velocity_scheme)) +Adapt.adapt_structure(to, scheme::WENO{N, FT, WCT}) where {N, FT, WCT} = + WENO{N, FT, WCT}(Adapt.adapt(to, scheme.bounds), + Adapt.adapt(to, scheme.buffer_scheme), + Adapt.adapt(to, scheme.advecting_velocity_scheme)) -on_architecture(to, scheme::WENO{N, FT, NDC}) where {N, FT, NDC} = - WENO{N, FT, NDC}(on_architecture(to, scheme.bounds), - on_architecture(to, scheme.buffer_scheme), - on_architecture(to, scheme.advecting_velocity_scheme)) +on_architecture(to, scheme::WENO{N, FT, WCT}) where {N, FT, WCT} = + WENO{N, FT, WCT}(on_architecture(to, scheme.bounds), + on_architecture(to, scheme.buffer_scheme), + on_architecture(to, scheme.advecting_velocity_scheme)) From 3cd0d41ed87239101ada4cb34bc53b11d3365a46 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Fri, 27 Feb 2026 06:23:59 -0500 Subject: [PATCH 09/25] refactor: make `newton_div` type names less verbose --- ext/OceananigansCUDAExt.jl | 2 +- src/Advection/vector_invariant_advection.jl | 8 ++--- src/Advection/weno_reconstruction.jl | 36 ++++++++++----------- src/Utils/Utils.jl | 2 +- src/Utils/newton_div.jl | 12 +++---- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index 5f9a8ea194f..d659e63413c 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -133,7 +133,7 @@ end @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() # Use faster versions of `newton_div` on Nvidia GPUs -CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedNewtonDiv}, a, b) = a * fast_inv_cuda(b) +CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedDivision}, a, b) = a * fast_inv_cuda(b) function fast_inv_cuda(a::Float64) # Get the approximate reciprocal diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index f5e68d2163f..59186fedadc 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -193,11 +193,11 @@ julia> using Oceananigans julia> WENOVectorInvariant() WENOVectorInvariant{5, Float64}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) +├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 7f2a774866a..21629020843 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -32,7 +32,7 @@ Arguments Keyword arguments ================= - `weight_computation`: The type of approximate division to used when computing WENO weights. - Default: `Oceananigans.Utils.NewtonDivWithConversion{Float32}` + Default: `Oceananigans.Utils.ConvertingDivision{Float32}` - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -51,8 +51,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) +WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -62,10 +62,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3) +WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -75,34 +75,34 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5) +WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.NewtonDivWithConversion{Float32}}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` To build a WENO scheme that uses approximate division on a GPU to execute faster: ```jldoctest weno -julia> WENO(;weight_computation=Oceananigans.Utils.BackendOptimizedNewtonDiv) -WENO{3, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=5) -├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedNewtonDiv}(order=3) +julia> WENO(;weight_computation=Oceananigans.Utils.BackendOptimizedDivision) +WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` """ function WENO(FT::DataType=Oceananigans.defaults.FloatType; - weight_computation::DataType=Oceananigans.Utils.NewtonDivWithConversion{Float32}, + weight_computation::DataType=Oceananigans.Utils.ConvertingDivision{Float32}, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl index a12a7bb8085..a726206d952 100644 --- a/src/Utils/Utils.jl +++ b/src/Utils/Utils.jl @@ -11,7 +11,7 @@ export seconds_to_nanosecond, period_to_seconds, time_difference_seconds, add_ti export TimeInterval, IterationInterval, WallTimeInterval, SpecifiedTimes, AndSchedule, OrSchedule, ConsecutiveIterations export apply_regionally!, construct_regionally, @apply_regionally, MultiRegionObject export isregional, getregion, _getregion, regions, sync_device! -export newton_div, NoNewtonDiv, NewtonDivWithConversion, BackendOptimizedNewtonDiv +export newton_div, NormalDivision, ConvertingDivision, BackendOptimizedDivision export TabulatedFunction export interpolator, _interpolate export ϕ₁, ϕ₂, ϕ₃, ϕ₄, ϕ₅, ϕ₆, ϕ₇, ϕ₈ diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index cd2f57756bc..9607f3ba7f7 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -8,7 +8,7 @@ abstract type NewtonDivConfig end """ Configuration selecting regular, full-precision division. """ -struct NoNewtonDiv <: NewtonDivConfig end +struct NormalDivision <: NewtonDivConfig end """ Configuration selecting approximate division via one Newton iteration. @@ -16,13 +16,13 @@ The reciprocal `1/b` is first evaluated in a lower-precision type `FT` to obtain a fast initial guess, then refined with a single Newton step in the original precision. """ -struct NewtonDivWithConversion{FT} <: NewtonDivConfig end +struct ConvertingDivision{FT} <: NewtonDivConfig end """ Configuration selecting a backend-optimized implementation of approximate division. The actual algorithm may differ across CPU and different GPU backends. """ -struct BackendOptimizedNewtonDiv <: NewtonDivConfig end +struct BackendOptimizedDivision <: NewtonDivConfig end """ newton_div(::Type{T}, a, b) @@ -31,7 +31,7 @@ Compute an approximate division `a/b` using a method specified by selector type """ function newton_div end -@inline function newton_div(::Type{NewtonDivWithConversion{inv_FT}}, a, b::FT) where {inv_FT, FT} +@inline function newton_div(::Type{ConvertingDivision{inv_FT}}, a, b::FT) where {inv_FT, FT} # Low precision division: b_low = convert(inv_FT, b) inv_b = Base.FastMath.inv_fast(b_low) @@ -46,7 +46,7 @@ function newton_div end end # Case of matching precisions -@inline newton_div(::Type{NewtonDivWithConversion{FT}}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) +@inline newton_div(::Type{ConvertingDivision{FT}}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) # Exact division if requested -@inline newton_div(::Type{NoNewtonDiv}, a, b) = a / b +@inline newton_div(::Type{NormalDivision}, a, b) = a / b From e351c1616f9b5e9f9d6876918acb0d2304a3ee35 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Thu, 15 Jan 2026 06:14:31 -0500 Subject: [PATCH 10/25] test: add unit tests for newton_div Co-authored-by: Gregory L. Wagner --- test/runtests.jl | 1 + test/test_newton_div.jl | 74 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 test/test_newton_div.jl diff --git a/test/runtests.jl b/test/runtests.jl index a794a61f1b4..3be02399b2b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,6 +51,7 @@ CUDA.allowscalar() do include("test_stokes_drift.jl") include("test_utils.jl") include("test_schedules.jl") + include("test_newton_div.jl") end end diff --git a/test/test_newton_div.jl b/test/test_newton_div.jl new file mode 100644 index 00000000000..24b6b3c246b --- /dev/null +++ b/test/test_newton_div.jl @@ -0,0 +1,74 @@ +include("dependencies_for_runtests.jl") + +using CUDA + +# Generate some random points in a single binade [1;2) interval +function test_data_in_single_binade(::Type{FT}, size) where {FT} + prng = Random.Xoshiro(44) + return rand(prng, FT, size) .+ 1.0 +end + +@testset "CPU newton_div: $FT $WCT" for (FT, WCT) in Iterators.product((Float32, Float64), + (Oceananigans.Utils.NormalDivision, + Oceananigans.Utils.ConvertingDivision{Float32})) + test_input = test_data_in_single_binade(FT, 1024) + + ref = similar(test_input) + output = similar(test_input) + + ref .= FT(π) ./ test_input + output .= Oceananigans.Utils.newton_div.(WCT, FT(π), test_input) + + @test isapprox(ref, output) +end + +if CUDA.functional() + + @testset "CUDA newton_div: $FT" for FT in (Float32, Float64) + test_input = CuArray(test_data_in_single_binade(FT, 1024)) + + WCT = Oceananigans.Utils.BackendOptimizedDivision + + ref = similar(test_input) + output = similar(test_input) + + ref .= FT(π) ./ test_input + output .= Oceananigans.Utils.newton_div.(WCT, FT(π), test_input) + + @test isapprox(ref, output) + end +end + + +function append_weight_computation_type!(list, weno::WENO{<:Any, <:Any, WCT}) where {WCT} + push!(list, WCT) + append_weight_computation_type!(list, weno.buffer_scheme) +end +append_weight_computation_type!(::Any, ::Any) = nothing + +# Extract all weight computation types from WENO +# Assumes a non-weno buffer scheme will not have WENO buffer scheme +function get_weight_computation_from_weno_advection(weno::WENO) + weight_computation_types = DataType[] + append_weight_computation_type!(weight_computation_types, weno) + return weight_computation_types +end + +@testset "Verify WENO schemes construction" begin + + # WENO + weno5 = WENO(order=7; weight_computation=Oceananigans.Utils.NormalDivision) + weight_computation_types = get_weight_computation_from_weno_advection(weno5) + @test all(weight_computation_types .== Oceananigans.Utils.NormalDivision) + + # Vector Invariant WENO + vector_weno = WENOVectorInvariant(order=9, weight_computation=Oceananigans.Utils.BackendOptimizedDivision) + + for field_name in fieldnames(typeof(vector_weno)) + field = getfield(vector_weno, field_name) + if field isa WENO + weight_computation_types = get_weight_computation_from_weno_advection(field) + @test all(weight_computation_types .== Oceananigans.Utils.BackendOptimizedDivision) + end + end +end From 6a6527446f93a1cc9860ec658e71c33a3018d643 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Wed, 4 Mar 2026 04:45:55 -0500 Subject: [PATCH 11/25] refactor: BackendOptimizedDivision in ConvertingDivision on CPU Add a fallback to enable `BackendOptimizedDivision` to run on the CPU. The fallback should be overriden for each device backend in an approperiate extension module. --- src/Utils/newton_div.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index 9607f3ba7f7..0b820dd3e34 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -50,3 +50,6 @@ end # Exact division if requested @inline newton_div(::Type{NormalDivision}, a, b) = a / b + +# Backend-optimized on CPU means Float32 converting division +@inline newton_div(::Type{BackendOptimizedDivision}, a, b) = newton_div(ConvertingDivision{Float32}, a, b) From c0c0ed229646d00b05dfa7d29590a21eff0527d0 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Wed, 4 Mar 2026 10:05:55 -0500 Subject: [PATCH 12/25] refactor: use BackendOptimizedDivision by default --- src/Advection/vector_invariant_advection.jl | 8 +++--- src/Advection/weno_reconstruction.jl | 30 ++++++++++----------- src/Oceananigans.jl | 12 ++++++++- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index 59186fedadc..6da080920d9 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -193,11 +193,11 @@ julia> using Oceananigans julia> WENOVectorInvariant() WENOVectorInvariant{5, Float64}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) +├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) -├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 21629020843..ae3bc59a296 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -32,7 +32,7 @@ Arguments Keyword arguments ================= - `weight_computation`: The type of approximate division to used when computing WENO weights. - Default: `Oceananigans.Utils.ConvertingDivision{Float32}` + Default: `Oceananigans.defaults.weno_weight_computation` - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -51,8 +51,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) -├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3) +WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -62,10 +62,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3) +WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -75,19 +75,19 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5) +WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.ConvertingDivision{Float32}}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -102,7 +102,7 @@ WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) ``` """ function WENO(FT::DataType=Oceananigans.defaults.FloatType; - weight_computation::DataType=Oceananigans.Utils.ConvertingDivision{Float32}, + weight_computation::DataType=Oceananigans.defaults.weno_weight_computation, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl index 008f61c9df8..90c721f16c1 100644 --- a/src/Oceananigans.jl +++ b/src/Oceananigans.jl @@ -147,6 +147,7 @@ mutable struct Defaults gravitational_acceleration :: Float64 planet_radius :: Float64 planet_rotation_rate :: Float64 + weno_weight_computation end function Defaults(; @@ -159,10 +160,16 @@ function Defaults(; # [s⁻¹] Earth's angular speed; see https://en.wikipedia.org/wiki/Earth%27s_rotation#Angular_speed planet_rotation_rate = 7.292115e-5) + # Default optimisation for computing WENO weights + # Since the options live in Oceananigans.Utils, we put a placeholder here and + # set the default later in the module + weno_weight_computation = nothing + return Defaults(FloatType, gravitational_acceleration, planet_radius, - planet_rotation_rate) + planet_rotation_rate, + weno_weight_computation) end const defaults = Defaults() @@ -267,6 +274,9 @@ include("Models/Models.jl") # Abstractions for distributed and multi-region models include("MultiRegion/MultiRegion.jl") +### Set default WENO weight computation to backend-optimized division +defaults.weno_weight_computation = Utils.BackendOptimizedDivision + ##### ##### Needed so we can export names from sub-modules at the top-level ##### From 506fadff20c0ac88f33e2de752f2cf1c63a20a26 Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Wed, 4 Mar 2026 10:09:26 -0500 Subject: [PATCH 13/25] refactor: use normal division on the CPU The difference in latency between f32 and f64 is small. It is probably not enough to justify two conversions and FMAs. (not tested for performance, it is conjecture) --- src/Utils/newton_div.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index 0b820dd3e34..5b406d22c03 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -51,5 +51,7 @@ end # Exact division if requested @inline newton_div(::Type{NormalDivision}, a, b) = a / b -# Backend-optimized on CPU means Float32 converting division -@inline newton_div(::Type{BackendOptimizedDivision}, a, b) = newton_div(ConvertingDivision{Float32}, a, b) +# Backend-optimized on CPU uses fast division +# Since both f32 and f64 divisions are generally implemented in hardware, +# the latencies are comparable. +@inline newton_div(::Type{BackendOptimizedDivision}, a, b) = Base.FastMath.div_fast(a, b) From e025c51a132934dbec07988d0d27a8a47133cb2f Mon Sep 17 00:00:00 2001 From: "M.A. Kowalski" Date: Wed, 4 Mar 2026 10:19:20 -0500 Subject: [PATCH 14/25] refactor: do not use CUDA intrinsics under Reactant The BackendOptimizedDivision optimization for `weno_weights_computation` uses LLVM's NVPTX backend intrinsics which are not understood by Reactant. Thus we need to change the default when running under Reactant. --- ext/OceananigansReactantExt/OceananigansReactantExt.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ext/OceananigansReactantExt/OceananigansReactantExt.jl b/ext/OceananigansReactantExt/OceananigansReactantExt.jl index 4a7c75e8a6c..8f1cb11555b 100644 --- a/ext/OceananigansReactantExt/OceananigansReactantExt.jl +++ b/ext/OceananigansReactantExt/OceananigansReactantExt.jl @@ -335,6 +335,12 @@ end Base.getindex(array::OffsetVector{T, <:Reactant.AbstractConcreteArray{T, 1}}, ::Colon) where T = array +# Reactant uses CUDA version of the code to uplift program description to MLIR. +# Since default `weno_weight_computation` on CUDA uses LLVM's NVPTX intrinsics +# it causes Reactant to crash. +# We need to fall back to diffrent optimization when running with Reactant +Oceananigans.defaults.weno_weight_computation = Oceananigans.Utils.ConvertingDivision{Float32} + # These are additional modules that may need to be Reactantified in the future: # # include("Utils.jl") From d6588f8617ee3336fdac54a7ad6c327be442b737 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Thu, 19 Mar 2026 15:37:48 +0000 Subject: [PATCH 15/25] feat: add materialize_advection to defer configuration options To resolve problems with Reactant not knowing about NVPTX intrinsics that we are using in the backend optimised implementation, we need to defer a choice of the default weno_weight_computation option until it is known on what backend the problem will be run. To do that we can rely on the `materialize` pattern used already in the similar circumstances. --- src/Advection/Advection.jl | 1 + src/Advection/materialize_advection.jl | 50 ++++++++++++++++++++++++++ test/runtests.jl | 1 + test/test_materialize_advection.jl | 39 ++++++++++++++++++++ 4 files changed, 91 insertions(+) create mode 100644 src/Advection/materialize_advection.jl create mode 100644 test/test_materialize_advection.jl diff --git a/src/Advection/Advection.jl b/src/Advection/Advection.jl index a2384c76309..d12eab8a088 100644 --- a/src/Advection/Advection.jl +++ b/src/Advection/Advection.jl @@ -79,5 +79,6 @@ include("tracer_advection_operators.jl") include("bounds_preserving_tracer_advection_operators.jl") include("cell_advection_timescale.jl") include("adapt_advection_order.jl") +include("materialize_advection.jl") end # module diff --git a/src/Advection/materialize_advection.jl b/src/Advection/materialize_advection.jl new file mode 100644 index 00000000000..23f91afd1cb --- /dev/null +++ b/src/Advection/materialize_advection.jl @@ -0,0 +1,50 @@ +""" + materialize_advection(advection, grid) + +Return a fully materialized advection scheme appropriate for `grid`. +It exists to allow advection schemes to defer specialising settings until +additional information about the backend from grid is available. + +For example it allows to set per-backend defaults for WENO weight computation +setting. +""" +materialize_advection(advection, grid) = advection +materialize_advection(::Nothing, grid) = nothing +materialize_advection(advection::FluxFormAdvection, grid) = FluxFormAdvection( + materialize_advection(advection.x, grid), + materialize_advection(advection.y, grid), + materialize_advection(advection.z, grid), +) + +# VectorInvariant wraps multiple sub-schemes; recurse into each +materialize_advection(vi::VectorInvariant{N,FT,M}, grid) where {N,FT,M} = + VectorInvariant{N,FT,M}( + materialize_advection(vi.vorticity_scheme, grid), + vi.vorticity_stencil, + materialize_advection(vi.vertical_advection_scheme, grid), + materialize_advection(vi.kinetic_energy_gradient_scheme, grid), + materialize_advection(vi.divergence_scheme, grid), + vi.upwinding, + ) + + +materialize_advection(weno::WENO{N,FT,WCT}, grid) where {N,FT,WCT} = WENO{N,FT,WCT}( + weno.bounds, + materialize_advection(weno.buffer_scheme, grid), + materialize_advection(weno.advecting_velocity_scheme, grid), +) + +materialize_advection(weno::WENO{N,FT,Nothing}, grid) where {N,FT} = + WENO{N,FT,Oceananigans.defaults.weno_weight_computation}( + weno.bounds, + materialize_advection(weno.buffer_scheme, grid), + materialize_advection(weno.advecting_velocity_scheme, grid), + ) + +materialize_advection(scheme::UpwindBiased{N,FT}, grid) where {N,FT} = UpwindBiased{N,FT}( + materialize_advection(scheme.buffer_scheme, grid), + materialize_advection(scheme.advecting_velocity_scheme, grid), +) + +materialize_advection(scheme::Centered{N,FT}, grid) where {N,FT} = + Centered{N,FT}(materialize_advection(scheme.buffer_scheme, grid)) diff --git a/test/runtests.jl b/test/runtests.jl index 3be02399b2b..9a2c22b4b15 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -52,6 +52,7 @@ CUDA.allowscalar() do include("test_utils.jl") include("test_schedules.jl") include("test_newton_div.jl") + include("test_materialize_advection.jl") end end diff --git a/test/test_materialize_advection.jl b/test/test_materialize_advection.jl new file mode 100644 index 00000000000..5c7186437e7 --- /dev/null +++ b/test/test_materialize_advection.jl @@ -0,0 +1,39 @@ +include("dependencies_for_runtests.jl") + +using Oceananigans.Advection: materialize_advection +using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDivision + +@testset "materialize weno scheme chain with placeholders" begin + + # Construct an advection chain by hand with intermediate non-WENO buffer schemes + level_0 = Centered(order = 2) + level_1 = + WENO(Float64; order = 3, weight_computation = Nothing, buffer_scheme = level_0) + level_2 = Centered(Float64; order = 4, buffer_scheme = level_1) + level_3 = UpwindBiased(Float64; order = 5, buffer_scheme = level_2) + level_4 = + WENO(Float64; order = 5, weight_computation = Nothing, buffer_scheme = level_3) + level_5 = WENO( + Float64; + order = 7, + weight_computation = NormalDivision, + buffer_scheme = level_4, + ) + + # Materialize using materialize_advection; Nothing WCTs are replaced by the global default + # (Oceananigans.defaults.weno_weight_computation == BackendOptimizedDivision) + materialized = materialize_advection(level_5, nothing) + + # Check that all WENO schemes in the chain have the correct weight computation type + get_nth_buffer_scheme(scheme, n) = + n == 1 ? scheme : get_nth_buffer_scheme(scheme.buffer_scheme, n - 1) + get_weight_computation(::WENO{<:Any,<:Any,WCT}) where {WCT} = WCT + + + @test NormalDivision == get_nth_buffer_scheme(materialized, 1) |> get_weight_computation + @test BackendOptimizedDivision == + get_nth_buffer_scheme(materialized, 2) |> get_weight_computation + @test BackendOptimizedDivision == + get_nth_buffer_scheme(materialized, 5) |> get_weight_computation + +end From c4adca29d1f145efa660ad089ad282122a6df0ec Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Thu, 19 Mar 2026 17:34:35 +0000 Subject: [PATCH 16/25] feat: use advection materialisation in models --- .../hydrostatic_free_surface_model.jl | 6 +++++- src/Models/NonhydrostaticModels/nonhydrostatic_model.jl | 6 +++++- src/Models/ShallowWaterModels/shallow_water_model.jl | 5 +++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl index 40332d494f6..3d00f2a7b45 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: AbstractAdvectionScheme, Centered, VectorInvariant, adapt_advection_order +using Oceananigans.Advection: AbstractAdvectionScheme, Centered, VectorInvariant, adapt_advection_order, materialize_advection using Oceananigans.Architectures: AbstractArchitecture using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields using Oceananigans.BoundaryConditions: FieldBoundaryConditions, regularize_field_boundary_conditions @@ -200,6 +200,10 @@ function HydrostaticFreeSurfaceModel(grid; advection = merge(momentum_advection_tuple, tracer_advection_tuple) advection = NamedTuple(name => adapt_advection_order(scheme, grid) for (name, scheme) in pairs(advection)) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = NamedTuple(name => materialize_advection(scheme, grid) for (name, scheme) in pairs(advection)) + validate_buoyancy(buoyancy, tracernames(tracers)) buoyancy = materialize_buoyancy(buoyancy, grid) diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl index 87930eb47c3..8f9fb183c37 100644 --- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl +++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: Centered, adapt_advection_order +using Oceananigans.Advection: Centered, adapt_advection_order, materialize_advection using Oceananigans.Architectures: AbstractArchitecture using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields using Oceananigans.BoundaryConditions: MixedBoundaryCondition @@ -217,6 +217,10 @@ function NonhydrostaticModel(grid; # direction advection = adapt_advection_order(advection, grid) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = materialize_advection(advection, grid) + # Adjust halos when the advection scheme or turbulence closure requires it. # Note that halos are isotropic by default; however we respect user-input here # by adjusting each (x, y, z) halo individually. diff --git a/src/Models/ShallowWaterModels/shallow_water_model.jl b/src/Models/ShallowWaterModels/shallow_water_model.jl index fdab9e62a3e..950dc22221f 100644 --- a/src/Models/ShallowWaterModels/shallow_water_model.jl +++ b/src/Models/ShallowWaterModels/shallow_water_model.jl @@ -10,6 +10,7 @@ using Oceananigans.Forcings: model_forcing using Oceananigans.Grids: topology, Flat, architecture, RectilinearGrid, Center using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid using Oceananigans.Models: validate_model_halo, validate_tracer_advection +using Oceananigans.Advection: materialize_advection using Oceananigans.TimeSteppers: Clock, TimeStepper, update_state! using Oceananigans.TurbulenceClosures: with_tracers, build_closure_fields using Oceananigans.Utils: tupleit @@ -179,6 +180,10 @@ function ShallowWaterModel(grid; advection = merge((momentum=momentum_advection, mass=mass_advection), tracer_advection_tuple) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = NamedTuple(name => materialize_advection(scheme, grid) for (name, scheme) in pairs(advection)) + bathymetry_field = CenterField(grid) if !isnothing(bathymetry) set!(bathymetry_field, bathymetry) From 3c8d143f03661413034169a4aee19ae9283628bc Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Fri, 20 Mar 2026 11:22:17 +0000 Subject: [PATCH 17/25] refactor: get rid of the global weight_computation setting It is no longer necessary since the default can be assigned in the materialization of the advection schemes. It can also be dependent on a specific architecture the problem will run on. To change the default setting user just needs to override the function `default_weno_weight_computation(arch)` --- ext/OceananigansReactantExt/Models.jl | 8 +++++ .../OceananigansReactantExt.jl | 5 --- src/Advection/materialize_advection.jl | 4 ++- src/Advection/vector_invariant_advection.jl | 8 ++--- src/Advection/weno_reconstruction.jl | 35 +++++++++++-------- src/Oceananigans.jl | 12 +------ 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/ext/OceananigansReactantExt/Models.jl b/ext/OceananigansReactantExt/Models.jl index 998828ceb4c..0226d1e40e2 100644 --- a/ext/OceananigansReactantExt/Models.jl +++ b/ext/OceananigansReactantExt/Models.jl @@ -17,6 +17,8 @@ using ..Grids: ShardedGrid, ShardedDistributed import Oceananigans.Models: complete_communication_and_compute_buffer!, interior_tendency_kernel_parameters +import Oceananigans.Advection: default_weno_weight_computation + const ReactantHFSM{TS, E} = Union{ HydrostaticFreeSurfaceModel{TS, E, <:ReactantState}, @@ -60,4 +62,10 @@ maybe_prepare_first_time_step!(model::ReactantHFSM, callbacks) = nothing complete_communication_and_compute_buffer!(model, ::ShardedGrid, ::ShardedDistributed) = nothing interior_tendency_kernel_parameters(::ShardedDistributed, grid) = :xyz +# Reactant uses CUDA version of the code to uplift program description to MLIR. +# Since default `weno_weight_computation` on CUDA uses LLVM's NVPTX intrinsics +# it causes Reactant to crash. +# We need to fall back to different optimization when running with Reactant +default_weno_weight_computation(::ReactantState) = Oceananigans.Utils.ConvertingDivision{Float32} + end # module diff --git a/ext/OceananigansReactantExt/OceananigansReactantExt.jl b/ext/OceananigansReactantExt/OceananigansReactantExt.jl index 8f1cb11555b..601b137dca5 100644 --- a/ext/OceananigansReactantExt/OceananigansReactantExt.jl +++ b/ext/OceananigansReactantExt/OceananigansReactantExt.jl @@ -335,11 +335,6 @@ end Base.getindex(array::OffsetVector{T, <:Reactant.AbstractConcreteArray{T, 1}}, ::Colon) where T = array -# Reactant uses CUDA version of the code to uplift program description to MLIR. -# Since default `weno_weight_computation` on CUDA uses LLVM's NVPTX intrinsics -# it causes Reactant to crash. -# We need to fall back to diffrent optimization when running with Reactant -Oceananigans.defaults.weno_weight_computation = Oceananigans.Utils.ConvertingDivision{Float32} # These are additional modules that may need to be Reactantified in the future: # diff --git a/src/Advection/materialize_advection.jl b/src/Advection/materialize_advection.jl index 23f91afd1cb..530659ce6ba 100644 --- a/src/Advection/materialize_advection.jl +++ b/src/Advection/materialize_advection.jl @@ -1,3 +1,5 @@ +import Oceananigans: architecture + """ materialize_advection(advection, grid) @@ -35,7 +37,7 @@ materialize_advection(weno::WENO{N,FT,WCT}, grid) where {N,FT,WCT} = WENO{N,FT,W ) materialize_advection(weno::WENO{N,FT,Nothing}, grid) where {N,FT} = - WENO{N,FT,Oceananigans.defaults.weno_weight_computation}( + WENO{N,FT,default_weno_weight_computation(architecture(grid))}( weno.bounds, materialize_advection(weno.buffer_scheme, grid), materialize_advection(weno.advecting_velocity_scheme, grid), diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index 6da080920d9..696fb86c7c8 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -193,11 +193,11 @@ julia> using Oceananigans julia> WENOVectorInvariant() WENOVectorInvariant{5, Float64}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) +├── vorticity_scheme: WENO{5, Float64, Nothing}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── vertical_advection_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) -├── divergence_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Nothing}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Nothing}(order=5) +├── divergence_scheme: WENO{3, Float64, Nothing}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index ae3bc59a296..86ed4af2860 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -32,7 +32,8 @@ Arguments Keyword arguments ================= - `weight_computation`: The type of approximate division to used when computing WENO weights. - Default: `Oceananigans.defaults.weno_weight_computation` + Default: `Nothing` (deferred; a architecture-dependent default is assigned in + `materialize_advection`) - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. @@ -51,8 +52,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) -├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) +WENO{3, Float64, Nothing}(order=5) +├── buffer_scheme: WENO{2, Float64, Nothing}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -62,10 +63,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) +WENO{5, Float64, Nothing}(order=9) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Nothing}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Nothing}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -75,19 +76,19 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +WENO{5, Float64, Nothing}(order=9) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Nothing}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Nothing}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{3, Float64, Nothing}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Nothing}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -102,7 +103,7 @@ WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) ``` """ function WENO(FT::DataType=Oceananigans.defaults.FloatType; - weight_computation::DataType=Oceananigans.defaults.weno_weight_computation, + weight_computation::DataType=Nothing, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, @@ -163,3 +164,7 @@ on_architecture(to, scheme::WENO{N, FT, WCT}) where {N, FT, WCT} = WENO{N, FT, WCT}(on_architecture(to, scheme.bounds), on_architecture(to, scheme.buffer_scheme), on_architecture(to, scheme.advecting_velocity_scheme)) + +# Select the default WENO weight computation +# Specific backends may override +default_weno_weight_computation(arch) = Oceananigans.Utils.BackendOptimizedDivision diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl index 90c721f16c1..008f61c9df8 100644 --- a/src/Oceananigans.jl +++ b/src/Oceananigans.jl @@ -147,7 +147,6 @@ mutable struct Defaults gravitational_acceleration :: Float64 planet_radius :: Float64 planet_rotation_rate :: Float64 - weno_weight_computation end function Defaults(; @@ -160,16 +159,10 @@ function Defaults(; # [s⁻¹] Earth's angular speed; see https://en.wikipedia.org/wiki/Earth%27s_rotation#Angular_speed planet_rotation_rate = 7.292115e-5) - # Default optimisation for computing WENO weights - # Since the options live in Oceananigans.Utils, we put a placeholder here and - # set the default later in the module - weno_weight_computation = nothing - return Defaults(FloatType, gravitational_acceleration, planet_radius, - planet_rotation_rate, - weno_weight_computation) + planet_rotation_rate) end const defaults = Defaults() @@ -274,9 +267,6 @@ include("Models/Models.jl") # Abstractions for distributed and multi-region models include("MultiRegion/MultiRegion.jl") -### Set default WENO weight computation to backend-optimized division -defaults.weno_weight_computation = Utils.BackendOptimizedDivision - ##### ##### Needed so we can export names from sub-modules at the top-level ##### From f3a7456b0db69e19b6245180700481cfc51bce2f Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Fri, 20 Mar 2026 16:45:36 +0000 Subject: [PATCH 18/25] fix: failing reactant tests --- test/test_reactant_correctness.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_reactant_correctness.jl b/test/test_reactant_correctness.jl index 7c46ac5270f..5aa504c7da2 100644 --- a/test/test_reactant_correctness.jl +++ b/test/test_reactant_correctness.jl @@ -178,6 +178,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @testset "coriolis=nothing" begin @info "Testing compute_simple_Gu! on RectilinearGrid($topo) with coriolis=nothing..." for advection in advection_schemes + # We want to select differed setting based on reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -199,6 +201,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @info "Testing compute_simple_Gu! on RectilinearGrid($topo) with FPlane Coriolis..." coriolis = FPlane(f=1e-4) for advection in advection_schemes + # We want to select differed setting based on reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -248,6 +252,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @testset "coriolis=nothing" begin @info "Testing compute_simple_Gu! on LatitudeLongitudeGrid with coriolis=nothing..." for advection in advection_schemes + # We want to select differed setting based on reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -269,6 +275,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @info "Testing compute_simple_Gu! on LatitudeLongitudeGrid with HydrostaticSphericalCoriolis..." coriolis = HydrostaticSphericalCoriolis() for advection in advection_schemes + # We want to select differed setting based on reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) From 9fe17316717fc080ec29a51b962fcc692fe986a7 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Thu, 26 Mar 2026 13:46:53 +0000 Subject: [PATCH 19/25] fix: add missing materialize_advection overloads --- src/Advection/materialize_advection.jl | 14 +++++++++++++- src/MultiRegion/multi_region_models.jl | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/Advection/materialize_advection.jl b/src/Advection/materialize_advection.jl index 530659ce6ba..89814d64144 100644 --- a/src/Advection/materialize_advection.jl +++ b/src/Advection/materialize_advection.jl @@ -18,6 +18,18 @@ materialize_advection(advection::FluxFormAdvection, grid) = FluxFormAdvection( materialize_advection(advection.z, grid), ) +# Upwinding treatments hold a cross_scheme that may contain deferred WENO weight computation +materialize_advection(u::OnlySelfUpwinding, grid) = + OnlySelfUpwinding(materialize_advection(u.cross_scheme, grid), + u.δU_stencil, u.δV_stencil, u.δu²_stencil, u.δv²_stencil) + +materialize_advection(u::CrossAndSelfUpwinding, grid) = + CrossAndSelfUpwinding(materialize_advection(u.cross_scheme, grid), + u.divergence_stencil, u.δu²_stencil, u.δv²_stencil) + +materialize_advection(u::VelocityUpwinding, grid) = + VelocityUpwinding(materialize_advection(u.cross_scheme, grid)) + # VectorInvariant wraps multiple sub-schemes; recurse into each materialize_advection(vi::VectorInvariant{N,FT,M}, grid) where {N,FT,M} = VectorInvariant{N,FT,M}( @@ -26,7 +38,7 @@ materialize_advection(vi::VectorInvariant{N,FT,M}, grid) where {N,FT,M} = materialize_advection(vi.vertical_advection_scheme, grid), materialize_advection(vi.kinetic_energy_gradient_scheme, grid), materialize_advection(vi.divergence_scheme, grid), - vi.upwinding, + materialize_advection(vi.upwinding, grid), ) diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl index db84965ca5a..ddcbe7aca78 100644 --- a/src/MultiRegion/multi_region_models.jl +++ b/src/MultiRegion/multi_region_models.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: Advection, WENO, VectorInvariant, adapt_advection_order, cell_advection_timescale +using Oceananigans.Advection: Advection, WENO, VectorInvariant, adapt_advection_order, cell_advection_timescale, materialize_advection using Oceananigans.BuoyancyFormulations: BuoyancyFormulations, BuoyancyForce, NegativeZDirection, AbstractBuoyancyFormulation, validate_unit_vector using Oceananigans.TimeSteppers: TimeSteppers, QuasiAdamsBashforth2TimeStepper using Oceananigans.Models: Models, ExplicitFreeSurface, HydrostaticFreeSurfaceModel, ImplicitFreeSurface, PrescribedVelocityFields @@ -17,6 +17,11 @@ function Advection.adapt_advection_order(advection::MultiRegionObject, grid::Mul return new_advection end +function Advection.materialize_advection(advection::MultiRegionObject, grid::MultiRegionGrids) + @apply_regionally new_advection = materialize_advection(advection, grid) + return new_advection +end + # Utility to generate the inputs to complex `getregion`s function getregionalproperties(type, inner=true) names = fieldnames(type) From 3ab7ec9dd7f6e845d4ee34d592a262f3e03b3313 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Thu, 26 Mar 2026 13:47:42 +0000 Subject: [PATCH 20/25] test: fix tests broken by changes to the API --- test/test_immersed_advection.jl | 8 +++++--- test/test_materialize_advection.jl | 8 +++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/test_immersed_advection.jl b/test/test_immersed_advection.jl index f2a52f871ab..1d4f1c9693d 100644 --- a/test/test_immersed_advection.jl +++ b/test/test_immersed_advection.jl @@ -12,7 +12,8 @@ using Oceananigans.Advection: _biased_interpolate_yᵃᶠᵃ, FluxFormAdvection, LeftBias, - RightBias + RightBias, + materialize_advection linear_advection_schemes = [Centered, UpwindBiased] advection_schemes = [linear_advection_schemes... WENO] @@ -21,7 +22,7 @@ advection_schemes = [linear_advection_schemes... WENO] @inline advective_order(buffer, AdvectionType) = buffer * 2 - 1 function run_tracer_interpolation_test(c, ibg, scheme) - + scheme = materialize_advection(scheme, ibg) for j in 6:19, i in 6:19 if typeof(scheme) <: Centered @test @allowscalar _symmetric_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, c) ≈ 1.0 @@ -35,7 +36,7 @@ function run_tracer_interpolation_test(c, ibg, scheme) end function run_tracer_conservation_test(grid, scheme) - + scheme = materialize_advection(scheme, grid) model = HydrostaticFreeSurfaceModel(grid; tracers = :c, free_surface = ExplicitFreeSurface(), tracer_advection = scheme) @@ -65,6 +66,7 @@ function run_tracer_conservation_test(grid, scheme) end function run_momentum_interpolation_test(u, v, ibg, scheme) + scheme = materialize_advection(scheme, ibg) # ensure also immersed boundaries have a value of 1 interior(u, 6, :, 1) .= 1.0 diff --git a/test/test_materialize_advection.jl b/test/test_materialize_advection.jl index 5c7186437e7..53f5e84aed4 100644 --- a/test/test_materialize_advection.jl +++ b/test/test_materialize_advection.jl @@ -3,6 +3,12 @@ include("dependencies_for_runtests.jl") using Oceananigans.Advection: materialize_advection using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDivision + +# We need to Mock a grid since it provides an architecture for materialization. +struct MockGrid end +Oceananigans.Grids.architecture(::MockGrid) = CPU() + + @testset "materialize weno scheme chain with placeholders" begin # Construct an advection chain by hand with intermediate non-WENO buffer schemes @@ -22,7 +28,7 @@ using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDi # Materialize using materialize_advection; Nothing WCTs are replaced by the global default # (Oceananigans.defaults.weno_weight_computation == BackendOptimizedDivision) - materialized = materialize_advection(level_5, nothing) + materialized = materialize_advection(level_5, MockGrid()) # Check that all WENO schemes in the chain have the correct weight computation type get_nth_buffer_scheme(scheme, n) = From 4e76efa3ed2ad5cf42a212388a924801184ec733 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Thu, 26 Mar 2026 17:20:55 +0000 Subject: [PATCH 21/25] fix: add missing overload for Distributed grid --- src/DistributedComputations/DistributedComputations.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl index 87ab966fbb4..4ced5d6d026 100644 --- a/src/DistributedComputations/DistributedComputations.jl +++ b/src/DistributedComputations/DistributedComputations.jl @@ -59,4 +59,9 @@ function precondition!(p, preconditioner::DistributedFourierTridiagonalPoissonSo return p end +# Correctly pass architecture to determine the default weno_weight_computation +Oceananigans.Advection.default_weno_weight_computation(arch::Distributed) = + Oceananigans.Advection.default_weno_weight_computation(child_architecture(arch)) + + end # module From db30a4cc6e37683058cf5a6ea9bdebbf9805b520 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Fri, 27 Mar 2026 09:31:40 +0000 Subject: [PATCH 22/25] test: add missing materialization step to test_forcing Move the MockGrid to the include file. Multiple test files need it and we need to make sure it is defined once to avoid struct redefinition. --- test/dependencies_for_runtests.jl | 6 ++++++ test/test_forcings.jl | 1 + test/test_materialize_advection.jl | 6 +----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl index d88bf33dbb4..efac056b623 100644 --- a/test/dependencies_for_runtests.jl +++ b/test/dependencies_for_runtests.jl @@ -90,3 +90,9 @@ already_included[] = true float_types = (Float32, Float64) archs = test_architectures() + +# We need to Mock a grid since it provides an architecture for advection materialization +struct MockGrid{A <: AbstractArchitecture} + arch::A +end +Oceananigans.Grids.architecture(a::MockGrid) = a.arch diff --git a/test/test_forcings.jl b/test/test_forcings.jl index 79f72fedf14..459353d45ca 100644 --- a/test/test_forcings.jl +++ b/test/test_forcings.jl @@ -519,6 +519,7 @@ end @testset "Momentum flux zero at immersed peripheral nodes" begin @info " Testing momentum flux is zero at immersed peripheral nodes..." for scheme in (Centered(order=2), UpwindBiased(order=3), WENO(order=5)) + scheme = Oceananigans.Advection.materialize_advection(scheme, MockGrid(arch)) @test test_momentum_flux_zero_at_peripheral_nodes(scheme) end end diff --git a/test/test_materialize_advection.jl b/test/test_materialize_advection.jl index 53f5e84aed4..a47bde1fc05 100644 --- a/test/test_materialize_advection.jl +++ b/test/test_materialize_advection.jl @@ -4,10 +4,6 @@ using Oceananigans.Advection: materialize_advection using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDivision -# We need to Mock a grid since it provides an architecture for materialization. -struct MockGrid end -Oceananigans.Grids.architecture(::MockGrid) = CPU() - @testset "materialize weno scheme chain with placeholders" begin @@ -28,7 +24,7 @@ Oceananigans.Grids.architecture(::MockGrid) = CPU() # Materialize using materialize_advection; Nothing WCTs are replaced by the global default # (Oceananigans.defaults.weno_weight_computation == BackendOptimizedDivision) - materialized = materialize_advection(level_5, MockGrid()) + materialized = materialize_advection(level_5, MockGrid(CPU())) # Check that all WENO schemes in the chain have the correct weight computation type get_nth_buffer_scheme(scheme, n) = From fed538a9b8c47a9e6049b750ec8fa3dc5efc9aa1 Mon Sep 17 00:00:00 2001 From: Mikolaj Adam Kowalski <32641577+Mikolaj-A-Kowalski@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:24:10 +0000 Subject: [PATCH 23/25] fix: extra end-of-file newlines Co-authored-by: Simone Silvestri --- test/test_materialize_advection.jl | 2 -- test/test_newton_div.jl | 1 - 2 files changed, 3 deletions(-) diff --git a/test/test_materialize_advection.jl b/test/test_materialize_advection.jl index a47bde1fc05..c82293a1103 100644 --- a/test/test_materialize_advection.jl +++ b/test/test_materialize_advection.jl @@ -3,8 +3,6 @@ include("dependencies_for_runtests.jl") using Oceananigans.Advection: materialize_advection using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDivision - - @testset "materialize weno scheme chain with placeholders" begin # Construct an advection chain by hand with intermediate non-WENO buffer schemes diff --git a/test/test_newton_div.jl b/test/test_newton_div.jl index 24b6b3c246b..bf9d4697366 100644 --- a/test/test_newton_div.jl +++ b/test/test_newton_div.jl @@ -39,7 +39,6 @@ if CUDA.functional() end end - function append_weight_computation_type!(list, weno::WENO{<:Any, <:Any, WCT}) where {WCT} push!(list, WCT) append_weight_computation_type!(list, weno.buffer_scheme) From 4f295ea3ac0d1f950882f5461094afc167980b92 Mon Sep 17 00:00:00 2001 From: "M. A. Kowalski" Date: Wed, 1 Apr 2026 12:19:51 +0100 Subject: [PATCH 24/25] test: fix newton_div test There was a type instability and test input was getting promoted to Float64. As a result the Float32 was never verified and a typo in the intrinsic name was not caught ealier. --- ext/OceananigansCUDAExt.jl | 2 +- test/test_newton_div.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index d659e63413c..f1e66c6cb25 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -155,7 +155,7 @@ function fast_inv_cuda(a::Float32) # This instruction just computes reciprocal flushing subnormals to 0.0 # Hence for subnormal inputs it returns Inf # For large number whose reciprocal is subnormal it underflows to 0.0 - inv_a = ccall("llvm.nvvm.rcp.approx.ftz.f32", llvmcall, Float32, (Float32,), a) + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), a) return inv_a end diff --git a/test/test_newton_div.jl b/test/test_newton_div.jl index bf9d4697366..2d17eacfc1c 100644 --- a/test/test_newton_div.jl +++ b/test/test_newton_div.jl @@ -5,7 +5,7 @@ using CUDA # Generate some random points in a single binade [1;2) interval function test_data_in_single_binade(::Type{FT}, size) where {FT} prng = Random.Xoshiro(44) - return rand(prng, FT, size) .+ 1.0 + return rand(prng, FT, size) .+ 1 end @testset "CPU newton_div: $FT $WCT" for (FT, WCT) in Iterators.product((Float32, Float64), From c6560705c8388eeadc99bfdf76da5ed6027acc42 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 15 Apr 2026 11:08:29 +0200 Subject: [PATCH 25/25] update minor version (numerical difference) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6b52569e3e8..64702bc12fa 100755 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Oceananigans" uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09" -version = "0.106.5" +version = "0.106.6" authors = ["Climate Modeling Alliance and contributors"] [deps]