Use specialised implementation of newton_div on CUDA (#5140)
* feat: use fast division on Nvidia GPUs in newton_div context
Adds a specialised implementation of the approximate `newton_div` for
the CUDA backend. It avoids the slow path of `rcp` and `div`
and provides a few per-cent speed-up in advection kernels.
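The idea behind an approximate `newton_div` can be sketched in plain Julia. This is an illustrative, hypothetical sketch of the underlying Newton-Raphson reciprocal refinement, not the actual CUDA implementation, which would start from a hardware reciprocal intrinsic rather than an exact division:

```julia
# Hypothetical sketch of Newton-Raphson reciprocal refinement.
# The CUDA version would seed `r` with an approximate-reciprocal
# intrinsic; here an exact reciprocal stands in for that seed.
function newton_div_sketch(x, y; iterations=2)
    r = one(y) / y            # initial reciprocal estimate
    for _ in 1:iterations
        r = r * (2 - y * r)   # Newton step: drives r toward 1/y
    end
    return x * r              # multiply by the numerator instead of dividing
end
```

Each Newton step roughly doubles the number of correct bits in the reciprocal, which is why one or two iterations on top of a fast hardware estimate are enough.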
* fix: do not specify the type of numerator
Since we just multiply the numerator by the reciprocal of the
denominator, we don't need to know the exact representation of the
numerator. Specifying it can lead to unexpected dispatch to the
fallback method if the numerator is, e.g., the literal π (of type
Irrational).
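The dispatch issue can be illustrated with a hypothetical sketch (the names below are made up for illustration): constraining only the denominator type keeps the fast path reachable when the numerator is an `Irrational` such as π.

```julia
# Hypothetical sketch: specialise on the denominator type only,
# so an Irrational numerator like π still hits the fast path.
fast_div(x, y::Float32) = x * (one(Float32) / y)  # reciprocal-multiply path
fast_div(x, y) = x / y                            # generic fallback

# π promotes to Float32 inside the multiply; no fallback is taken.
fast_div(π, 2.0f0)
```

Had the fast method been written as `fast_div(x::Float32, y::Float32)`, the call `fast_div(π, 2.0f0)` would silently dispatch to the fallback.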
* refactor: select type of newton_div in a WENO scheme by type parameter
* fix: newton_div type propagation into buffer schemes
* test: update doctests
* feat: add CUDA fast division for f32
* refactor: remove lower-precision WENO type parameter
This is a breaking change and requires a minor version bump.
It was made because the second floating-point precision type parameter
is no longer required: it has been replaced with a type-based specifier
for the division type in the WENO reconstruction scheme.
* refactor: use `weight_computation` to refer to division type in WENO
* refactor: make `newton_div` type names less verbose
* test: add unit tests for newton_div
Co-authored-by: Gregory L. Wagner <wagner.greg@gmail.com>
* refactor: fall back from BackendOptimizedDivision to ConvertingDivision on the CPU
Adds a fallback so that `BackendOptimizedDivision` can run on the CPU.
The fallback should be overridden for each device backend in an
appropriate extension module.
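The fallback pattern might look like the following hypothetical sketch (struct and function names here are illustrative, not the package's actual API): the CPU method simply delegates to the converting path, and a backend extension module can later add a specialised method for its own device.

```julia
# Hypothetical sketch of the CPU fallback pattern.
struct BackendOptimizedDivisionSketch end
struct ConvertingDivisionSketch end

# CPU fallback: the "backend optimized" tag delegates to the plain path.
divide(::BackendOptimizedDivisionSketch, x, y) =
    divide(ConvertingDivisionSketch(), x, y)
divide(::ConvertingDivisionSketch, x, y) = x / y

# A CUDA extension module could then override the fast path, e.g.:
# divide(::BackendOptimizedDivisionSketch, x, y::Float64) = x * fast_rcp(y)
```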
* refactor: use BackendOptimizedDivision by default
* refactor: use normal division on the CPU
The latency difference between f32 and f64 division is small, and
probably not enough to justify two conversions and FMAs. (Not
benchmarked for performance; this is conjecture.)
* refactor: do not use CUDA intrinsics under Reactant
The `BackendOptimizedDivision` optimization for `weno_weights_computation`
uses LLVM NVPTX backend intrinsics, which Reactant does not understand,
so the default must be changed when running under Reactant.
* feat: add materialize_advection to defer configuration options
To resolve problems with Reactant not knowing about the NVPTX
intrinsics used in the backend-optimised implementation, we defer the
choice of the default weno_weight_computation option until the backend
the problem will run on is known.
To do that we rely on the `materialize` pattern already used in
similar circumstances.
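A minimal sketch of the deferral, with hypothetical names standing in for the package's types: a backend-dependent option is left as `nothing` at construction time and resolved only when the scheme is materialized on a known architecture.

```julia
# Hypothetical sketch of the materialize pattern for deferred defaults.
struct SchemeSketch{W}
    weight_computation::W
end
SchemeSketch(; weight_computation=nothing) = SchemeSketch(weight_computation)

# The default depends on the architecture the problem runs on.
default_weight_computation(arch) = :converting  # safe generic default

function materialize_sketch(scheme::SchemeSketch, arch)
    wc = something(scheme.weight_computation,
                   default_weight_computation(arch))
    return SchemeSketch(wc)
end
```

A user-supplied value survives materialization unchanged; only an unset (`nothing`) option picks up the architecture-dependent default.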
* feat: use advection materialisation in models
* refactor: get rid of the global weight_computation setting
It is no longer necessary, since the default can be assigned during
materialization of the advection schemes and can depend on the
specific architecture the problem runs on.
To change the default setting, a user just overrides the function
`default_weno_weight_computation(arch)`.
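Per-architecture defaults via method overriding might look like this hypothetical sketch (the architecture types below are illustrative stand-ins, not the package's own):

```julia
# Hypothetical sketch: per-architecture defaults replace a global setting.
abstract type AbstractArchSketch end
struct CPUArchSketch <: AbstractArchSketch end
struct GPUArchSketch <: AbstractArchSketch end

default_weno_weight_computation(::AbstractArchSketch) = :division
default_weno_weight_computation(::GPUArchSketch) = :backend_optimized
```

Adding a method for a concrete architecture type is all that is needed to change the default there, with no mutable global state involved.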
* fix: failing reactant tests
* fix: add missing materialize_advection overloads
* test: fix tests broken by changes to the API
* fix: add missing overload for Distributed grid
* test: add missing materialization step to test_forcing
Move the MockGrid to the include file: multiple test files need it, and
it must be defined exactly once to avoid struct redefinition.
* fix: extra end-of-file newlines
Co-authored-by: Simone Silvestri <silvestri.simone0@gmail.com>
* test: fix newton_div test
There was a type instability and the test input was being promoted to
Float64. As a result the Float32 path was never verified, and a typo in
the intrinsic name was not caught earlier.
* update minor version (numerical difference)
---------
Co-authored-by: Gregory L. Wagner <wagner.greg@gmail.com>
Co-authored-by: Simone Silvestri <silvestri.simone0@gmail.com>