diff --git a/Project.toml b/Project.toml index 6b52569e3e8..64702bc12fa 100755 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Oceananigans" uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09" -version = "0.106.5" +version = "0.106.6" authors = ["Climate Modeling Alliance and contributors"] [deps] diff --git a/ext/OceananigansCUDAExt.jl b/ext/OceananigansCUDAExt.jl index fd705879c04..f1e66c6cb25 100644 --- a/ext/OceananigansCUDAExt.jl +++ b/ext/OceananigansCUDAExt.jl @@ -132,4 +132,31 @@ end @inline UT.sync_device!(::CUDAGPU) = CUDA.synchronize() @inline UT.sync_device!(::CUDABackend) = CUDA.synchronize() +# Use faster versions of `newton_div` on Nvidia GPUs +CUDA.@device_override UT.newton_div(::Type{UT.BackendOptimizedDivision}, a, b) = a * fast_inv_cuda(b) + +function fast_inv_cuda(a::Float64) + # Get the approximate reciprocal + # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + # This instruction chops off last 32bits of mantissa and computes inverse + # while treating all subnormal numbers as 0.0 + # If reciprocal would be subnormal, underflows to 0.0 + # 32 least significant bits of the result are filled with 0s + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a) + + # Approximate the missing 32bits of mantissa with a single cubic iteration + e = fma(inv_a, -a, 1.0) + e = fma(e, e, e) + inv_a = fma(e, inv_a, inv_a) + return inv_a +end + +function fast_inv_cuda(a::Float32) + # This instruction just computes reciprocal flushing subnormals to 0.0 + # Hence for subnormal inputs it returns Inf + # For large number whose reciprocal is subnormal it underflows to 0.0 + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), a) + return inv_a +end + end # module OceananigansCUDAExt diff --git a/ext/OceananigansReactantExt/Models.jl b/ext/OceananigansReactantExt/Models.jl index 998828ceb4c..0226d1e40e2 100644 --- a/ext/OceananigansReactantExt/Models.jl +++ 
b/ext/OceananigansReactantExt/Models.jl @@ -17,6 +17,8 @@ using ..Grids: ShardedGrid, ShardedDistributed import Oceananigans.Models: complete_communication_and_compute_buffer!, interior_tendency_kernel_parameters +import Oceananigans.Advection: default_weno_weight_computation + const ReactantHFSM{TS, E} = Union{ HydrostaticFreeSurfaceModel{TS, E, <:ReactantState}, @@ -60,4 +62,10 @@ maybe_prepare_first_time_step!(model::ReactantHFSM, callbacks) = nothing complete_communication_and_compute_buffer!(model, ::ShardedGrid, ::ShardedDistributed) = nothing interior_tendency_kernel_parameters(::ShardedDistributed, grid) = :xyz +# Reactant uses CUDA version of the code to uplift program description to MLIR. +# Since default `weno_weight_computation` on CUDA uses LLVM's NVPTX intrinsics +# it causes Reactant to crash. +# We need to fall back to different optimization when running with Reactant +default_weno_weight_computation(::ReactantState) = Oceananigans.Utils.ConvertingDivision{Float32} + end # module diff --git a/ext/OceananigansReactantExt/OceananigansReactantExt.jl b/ext/OceananigansReactantExt/OceananigansReactantExt.jl index 4a7c75e8a6c..601b137dca5 100644 --- a/ext/OceananigansReactantExt/OceananigansReactantExt.jl +++ b/ext/OceananigansReactantExt/OceananigansReactantExt.jl @@ -335,6 +335,7 @@ end Base.getindex(array::OffsetVector{T, <:Reactant.AbstractConcreteArray{T, 1}}, ::Colon) where T = array + # These are additional modules that may need to be Reactantified in the future: # # include("Utils.jl") diff --git a/src/Advection/Advection.jl b/src/Advection/Advection.jl index 8065f433ad5..0fcb9c71841 100644 --- a/src/Advection/Advection.jl +++ b/src/Advection/Advection.jl @@ -84,5 +84,6 @@ include("tracer_advection_operators.jl") include("bounds_preserving_tracer_advection_operators.jl") include("cell_advection_timescale.jl") include("adapt_advection_order.jl") +include("materialize_advection.jl") end # module diff --git a/src/Advection/materialize_advection.jl 
b/src/Advection/materialize_advection.jl new file mode 100644 index 00000000000..89814d64144 --- /dev/null +++ b/src/Advection/materialize_advection.jl @@ -0,0 +1,64 @@ +import Oceananigans: architecture + +""" + materialize_advection(advection, grid) + +Return a fully materialized advection scheme appropriate for `grid`. +It exists to allow advection schemes to defer specialising settings until +additional information about the backend from grid is available. + +For example it allows to set per-backend defaults for WENO weight computation +setting. +""" +materialize_advection(advection, grid) = advection +materialize_advection(::Nothing, grid) = nothing +materialize_advection(advection::FluxFormAdvection, grid) = FluxFormAdvection( + materialize_advection(advection.x, grid), + materialize_advection(advection.y, grid), + materialize_advection(advection.z, grid), +) + +# Upwinding treatments hold a cross_scheme that may contain deferred WENO weight computation +materialize_advection(u::OnlySelfUpwinding, grid) = + OnlySelfUpwinding(materialize_advection(u.cross_scheme, grid), + u.δU_stencil, u.δV_stencil, u.δu²_stencil, u.δv²_stencil) + +materialize_advection(u::CrossAndSelfUpwinding, grid) = + CrossAndSelfUpwinding(materialize_advection(u.cross_scheme, grid), + u.divergence_stencil, u.δu²_stencil, u.δv²_stencil) + +materialize_advection(u::VelocityUpwinding, grid) = + VelocityUpwinding(materialize_advection(u.cross_scheme, grid)) + +# VectorInvariant wraps multiple sub-schemes; recurse into each +materialize_advection(vi::VectorInvariant{N,FT,M}, grid) where {N,FT,M} = + VectorInvariant{N,FT,M}( + materialize_advection(vi.vorticity_scheme, grid), + vi.vorticity_stencil, + materialize_advection(vi.vertical_advection_scheme, grid), + materialize_advection(vi.kinetic_energy_gradient_scheme, grid), + materialize_advection(vi.divergence_scheme, grid), + materialize_advection(vi.upwinding, grid), + ) + + +materialize_advection(weno::WENO{N,FT,WCT}, grid) where {N,FT,WCT} 
= WENO{N,FT,WCT}( + weno.bounds, + materialize_advection(weno.buffer_scheme, grid), + materialize_advection(weno.advecting_velocity_scheme, grid), +) + +materialize_advection(weno::WENO{N,FT,Nothing}, grid) where {N,FT} = + WENO{N,FT,default_weno_weight_computation(architecture(grid))}( + weno.bounds, + materialize_advection(weno.buffer_scheme, grid), + materialize_advection(weno.advecting_velocity_scheme, grid), + ) + +materialize_advection(scheme::UpwindBiased{N,FT}, grid) where {N,FT} = UpwindBiased{N,FT}( + materialize_advection(scheme.buffer_scheme, grid), + materialize_advection(scheme.advecting_velocity_scheme, grid), +) + +materialize_advection(scheme::Centered{N,FT}, grid) where {N,FT} = + Centered{N,FT}(materialize_advection(scheme.buffer_scheme, grid)) diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl index 0741aa4701d..cbc0a1c6e5c 100644 --- a/src/Advection/vector_invariant_advection.jl +++ b/src/Advection/vector_invariant_advection.jl @@ -137,8 +137,7 @@ function Base.summary(a::WENOVectorInvariant{N}) where N vertical_order = weno_order(a.vertical_advection_scheme) order = weno_order(a.vorticity_scheme) FT = eltype(a.vorticity_scheme) - FT2 = eltype2(a.vorticity_scheme) - return string("WENOVectorInvariant{$N, $FT, $FT2}(vorticity_order=$vorticity_order, vertical_order=$vertical_order)") + return string("WENOVectorInvariant{$N, $FT}(vorticity_order=$vorticity_order, vertical_order=$vertical_order)") end function Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} @@ -193,12 +192,12 @@ Example julia> using Oceananigans julia> WENOVectorInvariant() -WENOVectorInvariant{5, Float64, Float32}(vorticity_order=9, vertical_order=5) -├── vorticity_scheme: WENO{5, Float64, Float32}(order=9) +WENOVectorInvariant{5, Float64}(vorticity_order=9, vertical_order=5) +├── vorticity_scheme: WENO{5, Float64, Nothing}(order=9) ├── vorticity_stencil: Oceananigans.Advection.VelocityStencil -├── 
vertical_advection_scheme: WENO{3, Float64, Float32}(order=5) -├── kinetic_energy_gradient_scheme: WENO{3, Float64, Float32}(order=5) -├── divergence_scheme: WENO{3, Float64, Float32}(order=5) +├── vertical_advection_scheme: WENO{3, Float64, Nothing}(order=5) +├── kinetic_energy_gradient_scheme: WENO{3, Float64, Nothing}(order=5) +├── divergence_scheme: WENO{3, Float64, Nothing}(order=5) └── upwinding: OnlySelfUpwinding ``` """ diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl index dc4be2ddd71..bf38ac3fad3 100644 --- a/src/Advection/weno_interpolants.jl +++ b/src/Advection/weno_interpolants.jl @@ -291,7 +291,7 @@ end @inline function metaprogrammed_zweno_alpha_loop(buffer) elem = Vector(undef, buffer) for stencil = 1:buffer - elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(FT2, τ, β[$stencil] + ϵ))^2)) + elem[stencil] = :(C★(scheme, Val($(stencil-1))) * (1 + (newton_div(WCT, τ, β[$stencil] + ϵ))^2)) end return :($(elem...),) @@ -301,7 +301,7 @@ for buffer in advection_buffers[2:end] @eval begin @inline beta_sum(scheme::WENO{$buffer, FT}, β₁, β₂) where FT = @inbounds $(metaprogrammed_beta_sum(buffer)) @inline beta_loop(scheme::WENO{$buffer, FT}, ψ) where FT = @inbounds $(metaprogrammed_beta_loop(buffer)) - @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, FT2}, β, τ) where {FT, FT2} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) + @inline zweno_alpha_loop(scheme::WENO{$buffer, FT, WCT}, β, τ) where {FT, WCT} = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer)) end end diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl index 65a90346923..86ed4af2860 100644 --- a/src/Advection/weno_reconstruction.jl +++ b/src/Advection/weno_reconstruction.jl @@ -4,20 +4,20 @@ import Oceananigans ##### Weighted Essentially Non-Oscillatory (WENO) advection scheme ##### -struct WENO{N, FT, FT2, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} +struct WENO{N, FT, WCT, PP, CA, 
SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT} bounds :: PP buffer_scheme :: CA advecting_velocity_scheme :: SI - function WENO{N, FT, FT2}(bounds::PP, buffer_scheme::CA, - advecting_velocity_scheme :: SI) where {N, FT, FT2, PP, CA, SI} - return new{N, FT, FT2, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) + function WENO{N, FT, WCT}(bounds::PP, buffer_scheme::CA, + advecting_velocity_scheme :: SI) where {N, FT, WCT, PP, CA, SI} + return new{N, FT, WCT, PP, CA, SI}(bounds, buffer_scheme, advecting_velocity_scheme) end end """ - WENO([FT=Float64, FT2=Float32;] + WENO([FT=Float64;] order = 5, bounds = nothing, minimum_buffer_upwind_order = 3) @@ -28,11 +28,12 @@ Arguments ========= - `FT`: The floating point type used in the scheme. Default: `Oceananigans.defaults.FloatType` -- `FT2`: The floating point type used in some performance-critical parts of the scheme. Default: `Float32` Keyword arguments ================= - +- `weight_computation`: The type of approximate division to use when computing WENO weights. + Default: `Nothing` (deferred; an architecture-dependent default is assigned in + `materialize_advection`) - `order`: The order of the WENO advection scheme. Default: 5 - `bounds` (experimental): Whether to use bounds-preserving WENO, which produces a reconstruction that attempts to restrict a quantity to lie between a `bounds` tuple. 
@@ -51,8 +52,8 @@ To build the default 5th-order scheme: julia> using Oceananigans julia> WENO() -WENO{3, Float64, Float32}(order=5) -├── buffer_scheme: WENO{2, Float64, Float32}(order=3) +WENO{3, Float64, Nothing}(order=5) +├── buffer_scheme: WENO{2, Float64, Nothing}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=4) ``` @@ -62,10 +63,10 @@ yet minimally-dissipative advection scheme): ```jldoctest weno julia> WENO(order=9) -WENO{5, Float64, Float32}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5) -│ └── buffer_scheme: WENO{2, Float64, Float32}(order=3) +WENO{5, Float64, Nothing}(order=9) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Nothing}(order=5) +│ └── buffer_scheme: WENO{2, Float64, Nothing}(order=3) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` @@ -75,24 +76,34 @@ which uses `Centered(order=2)` as the innermost buffer scheme: ```jldoctest weno julia> WENO(order=9, minimum_buffer_upwind_order=5) -WENO{5, Float64, Float32}(order=9) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5) +WENO{5, Float64, Nothing}(order=9) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7) +│ └── buffer_scheme: WENO{3, Float64, Nothing}(order=5) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` ```jldoctest weno julia> WENO(order=9, bounds=(0, 1)) -WENO{5, Float64, Float32}(order=9, bounds=(0.0, 1.0)) -├── buffer_scheme: WENO{4, Float64, Float32}(order=7, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{3, Float64, Float32}(order=5, bounds=(0.0, 1.0)) -│ └── buffer_scheme: WENO{2, Float64, Float32}(order=3, bounds=(0.0, 1.0)) +WENO{5, Float64, Nothing}(order=9, bounds=(0.0, 1.0)) +├── buffer_scheme: WENO{4, Float64, Nothing}(order=7, bounds=(0.0, 1.0)) +│ └── 
buffer_scheme: WENO{3, Float64, Nothing}(order=5, bounds=(0.0, 1.0)) +│ └── buffer_scheme: WENO{2, Float64, Nothing}(order=3, bounds=(0.0, 1.0)) │ └── buffer_scheme: Centered(order=2) └── advecting_velocity_scheme: Centered(order=8) ``` + +To build a WENO scheme that uses approximate division on a GPU to execute faster: +```jldoctest weno +julia> WENO(;weight_computation=Oceananigans.Utils.BackendOptimizedDivision) +WENO{3, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=5) +├── buffer_scheme: WENO{2, Float64, Oceananigans.Utils.BackendOptimizedDivision}(order=3) +│ └── buffer_scheme: Centered(order=2) +└── advecting_velocity_scheme: Centered(order=4) +``` """ -function WENO(FT::DataType=Oceananigans.defaults.FloatType, FT2::DataType=Float32; +function WENO(FT::DataType=Oceananigans.defaults.FloatType; + weight_computation::DataType=Nothing, order = 5, buffer_scheme = DecreasingOrderAdvectionScheme(), bounds = nothing, @@ -116,20 +127,19 @@ function WENO(FT::DataType=Oceananigans.defaults.FloatType, FT2::DataType=Float3 # At minimum order, switch to Centered scheme buffer_scheme = Centered(FT; order=2) else - buffer_scheme = WENO(FT, FT2; order=order-2, bounds, minimum_buffer_upwind_order) + buffer_scheme = WENO(FT; order=order-2, bounds, minimum_buffer_upwind_order, weight_computation) end end N = Int((order + 1) ÷ 2) - return WENO{N, FT, FT2}(bounds, buffer_scheme, advecting_velocity_scheme) + return WENO{N, FT, weight_computation}(bounds, buffer_scheme, advecting_velocity_scheme) end end weno_order(::WENO{N}) where N = 2N-1 Base.eltype(::WENO{N, FT}) where {N, FT} = FT -eltype2(::WENO{N, FT, FT2}) where {N, FT, FT2} = FT2 -Base.summary(a::WENO{N, FT, FT2, Nothing}) where {N, FT, FT2} = string("WENO{$N, $FT, $FT2}(order=", 2N-1, ")") -Base.summary(a::WENO{N, FT, FT2, PP}) where {N, FT, FT2, PP} = string("WENO{$N, $FT, $FT2}(order=", 2N-1, ", bounds=", string(a.bounds), ")") +Base.summary(a::WENO{N, FT, WCT, Nothing}) where {N, FT, WCT} = 
string("WENO{$N, $FT, $WCT}(order=", 2N-1, ")") +Base.summary(a::WENO{N, FT, WCT, PP}) where {N, FT, WCT, PP} = string("WENO{$N, $FT, $WCT}(order=", 2N-1, ", bounds=", string(a.bounds), ")") function Base.show(io::IO, a::WENO) print(io, summary(a), '\n') @@ -145,12 +155,16 @@ function Base.show(io::IO, a::WENO) print(io, "└── advecting_velocity_scheme: ", summary(a.advecting_velocity_scheme)) end -Adapt.adapt_structure(to, scheme::WENO{N, FT, FT2}) where {N, FT, FT2} = - WENO{N, FT, FT2}(Adapt.adapt(to, scheme.bounds), +Adapt.adapt_structure(to, scheme::WENO{N, FT, WCT}) where {N, FT, WCT} = + WENO{N, FT, WCT}(Adapt.adapt(to, scheme.bounds), Adapt.adapt(to, scheme.buffer_scheme), Adapt.adapt(to, scheme.advecting_velocity_scheme)) -on_architecture(to, scheme::WENO{N, FT, FT2}) where {N, FT, FT2} = - WENO{N, FT, FT2}(on_architecture(to, scheme.bounds), +on_architecture(to, scheme::WENO{N, FT, WCT}) where {N, FT, WCT} = + WENO{N, FT, WCT}(on_architecture(to, scheme.bounds), on_architecture(to, scheme.buffer_scheme), on_architecture(to, scheme.advecting_velocity_scheme)) + +# Select the default WENO weight computation +# Specific backends may override +default_weno_weight_computation(arch) = Oceananigans.Utils.BackendOptimizedDivision diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl index 87ab966fbb4..4ced5d6d026 100644 --- a/src/DistributedComputations/DistributedComputations.jl +++ b/src/DistributedComputations/DistributedComputations.jl @@ -59,4 +59,9 @@ function precondition!(p, preconditioner::DistributedFourierTridiagonalPoissonSo return p end +# Correctly pass architecture to determine the default weno_weight_computation +Oceananigans.Advection.default_weno_weight_computation(arch::Distributed) = + Oceananigans.Advection.default_weno_weight_computation(child_architecture(arch)) + + end # module diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl 
b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl index 9aa8bd8e9c5..6817e3da3f8 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: AbstractAdvectionScheme, Centered, VectorInvariant, adapt_advection_order +using Oceananigans.Advection: AbstractAdvectionScheme, Centered, VectorInvariant, adapt_advection_order, materialize_advection using Oceananigans.Architectures: AbstractArchitecture using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields using Oceananigans.BoundaryConditions: FieldBoundaryConditions, regularize_field_boundary_conditions @@ -200,6 +200,10 @@ function HydrostaticFreeSurfaceModel(grid; advection = merge(momentum_advection_tuple, tracer_advection_tuple) advection = NamedTuple(name => adapt_advection_order(scheme, grid) for (name, scheme) in pairs(advection)) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = NamedTuple(name => materialize_advection(scheme, grid) for (name, scheme) in pairs(advection)) + validate_buoyancy(buoyancy, tracernames(tracers)) buoyancy = materialize_buoyancy(buoyancy, grid) diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl index 87930eb47c3..8f9fb183c37 100644 --- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl +++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: Centered, adapt_advection_order +using Oceananigans.Advection: Centered, adapt_advection_order, materialize_advection using Oceananigans.Architectures: AbstractArchitecture using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields using 
Oceananigans.BoundaryConditions: MixedBoundaryCondition @@ -217,6 +217,10 @@ function NonhydrostaticModel(grid; # direction advection = adapt_advection_order(advection, grid) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = materialize_advection(advection, grid) + # Adjust halos when the advection scheme or turbulence closure requires it. # Note that halos are isotropic by default; however we respect user-input here # by adjusting each (x, y, z) halo individually. diff --git a/src/Models/ShallowWaterModels/shallow_water_model.jl b/src/Models/ShallowWaterModels/shallow_water_model.jl index fdab9e62a3e..950dc22221f 100644 --- a/src/Models/ShallowWaterModels/shallow_water_model.jl +++ b/src/Models/ShallowWaterModels/shallow_water_model.jl @@ -10,6 +10,7 @@ using Oceananigans.Forcings: model_forcing using Oceananigans.Grids: topology, Flat, architecture, RectilinearGrid, Center using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid using Oceananigans.Models: validate_model_halo, validate_tracer_advection +using Oceananigans.Advection: materialize_advection using Oceananigans.TimeSteppers: Clock, TimeStepper, update_state! 
using Oceananigans.TurbulenceClosures: with_tracers, build_closure_fields using Oceananigans.Utils: tupleit @@ -179,6 +180,10 @@ function ShallowWaterModel(grid; advection = merge((momentum=momentum_advection, mass=mass_advection), tracer_advection_tuple) + # Fill any settings in advection scheme that might have been deferred until + # the grid and backend is known + advection = NamedTuple(name => materialize_advection(scheme, grid) for (name, scheme) in pairs(advection)) + bathymetry_field = CenterField(grid) if !isnothing(bathymetry) set!(bathymetry_field, bathymetry) diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl index db84965ca5a..ddcbe7aca78 100644 --- a/src/MultiRegion/multi_region_models.jl +++ b/src/MultiRegion/multi_region_models.jl @@ -1,4 +1,4 @@ -using Oceananigans.Advection: Advection, WENO, VectorInvariant, adapt_advection_order, cell_advection_timescale +using Oceananigans.Advection: Advection, WENO, VectorInvariant, adapt_advection_order, cell_advection_timescale, materialize_advection using Oceananigans.BuoyancyFormulations: BuoyancyFormulations, BuoyancyForce, NegativeZDirection, AbstractBuoyancyFormulation, validate_unit_vector using Oceananigans.TimeSteppers: TimeSteppers, QuasiAdamsBashforth2TimeStepper using Oceananigans.Models: Models, ExplicitFreeSurface, HydrostaticFreeSurfaceModel, ImplicitFreeSurface, PrescribedVelocityFields @@ -17,6 +17,11 @@ function Advection.adapt_advection_order(advection::MultiRegionObject, grid::Mul return new_advection end +function Advection.materialize_advection(advection::MultiRegionObject, grid::MultiRegionGrids) + @apply_regionally new_advection = materialize_advection(advection, grid) + return new_advection +end + # Utility to generate the inputs to complex `getregion`s function getregionalproperties(type, inner=true) names = fieldnames(type) diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl index 89495894fc6..561382b03ff 100644 --- a/src/Utils/Utils.jl +++ 
b/src/Utils/Utils.jl @@ -11,7 +11,7 @@ export seconds_to_nanosecond, period_to_seconds, time_difference_seconds, add_ti export TimeInterval, IterationInterval, WallTimeInterval, SpecifiedTimes, AndSchedule, OrSchedule, ConsecutiveIterations export apply_regionally!, construct_regionally, @apply_regionally, MultiRegionObject export isregional, getregion, _getregion, regions, sync_device! -export newton_div +export newton_div, NormalDivision, ConvertingDivision, BackendOptimizedDivision export TabulatedFunction export interpolator, _interpolate export ϕ₁, ϕ₂, ϕ₃, ϕ₄, ϕ₅, ϕ₆, ϕ₇, ϕ₈ diff --git a/src/Utils/newton_div.jl b/src/Utils/newton_div.jl index 86da3e1d04c..5b406d22c03 100644 --- a/src/Utils/newton_div.jl +++ b/src/Utils/newton_div.jl @@ -1,11 +1,37 @@ """ - newton_div(inv_FT, a, b::FT) +Abstract supertype grouping configuration options for `newton_div`. +Use one of the concrete subtypes as the first argument to `newton_div` +to select the implementation. +""" +abstract type NewtonDivConfig end + +""" +Configuration selecting regular, full-precision division. +""" +struct NormalDivision <: NewtonDivConfig end -Compute an approximation of `a / b` that uses `inv_FT` type to compute -`1/b`, and then performs a single Newton iteration to add a few more bits of precision -afterwards. """ -@inline function newton_div(inv_FT, a, b::FT) where FT +Configuration selecting approximate division via one Newton iteration. +The reciprocal `1/b` is first evaluated in a lower-precision type `FT` +to obtain a fast initial guess, then refined with a single Newton step +in the original precision. +""" +struct ConvertingDivision{FT} <: NewtonDivConfig end + +""" +Configuration selecting a backend-optimized implementation of approximate division. +The actual algorithm may differ across CPU and different GPU backends. 
+""" +struct BackendOptimizedDivision <: NewtonDivConfig end + +""" + newton_div(::Type{T}, a, b) + +Compute an approximate division `a/b` using a method specified by selector type `T`. +""" +function newton_div end + +@inline function newton_div(::Type{ConvertingDivision{inv_FT}}, a, b::FT) where {inv_FT, FT} # Low precision division: b_low = convert(inv_FT, b) inv_b = Base.FastMath.inv_fast(b_low) @@ -19,5 +45,13 @@ afterwards. return x end -# Fallback for no precision lowering -@inline newton_div(::Type{FT}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) +# Case of matching precisions +@inline newton_div(::Type{ConvertingDivision{FT}}, a, b::FT) where FT = a * Base.FastMath.inv_fast(b) + +# Exact division if requested +@inline newton_div(::Type{NormalDivision}, a, b) = a / b + +# Backend-optimized on CPU uses fast division +# Since both f32 and f64 divisions are generally implemented in hardware, +# the latencies are comparable. +@inline newton_div(::Type{BackendOptimizedDivision}, a, b) = Base.FastMath.div_fast(a, b) diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl index d88bf33dbb4..efac056b623 100644 --- a/test/dependencies_for_runtests.jl +++ b/test/dependencies_for_runtests.jl @@ -90,3 +90,9 @@ already_included[] = true float_types = (Float32, Float64) archs = test_architectures() + +# We need to Mock a grid since it provides an architecture for advection materialization +struct MockGrid{A <: AbstractArchitecture} + arch::A +end +Oceananigans.Grids.architecture(a::MockGrid) = a.arch diff --git a/test/runtests.jl b/test/runtests.jl index 144961f8e25..86d7a2093d1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,6 +51,8 @@ CUDA.allowscalar() do include("test_stokes_drift.jl") include("test_utils.jl") include("test_schedules.jl") + include("test_newton_div.jl") + include("test_materialize_advection.jl") end end diff --git a/test/test_forcings.jl b/test/test_forcings.jl index 79f72fedf14..459353d45ca 100644 --- 
a/test/test_forcings.jl +++ b/test/test_forcings.jl @@ -519,6 +519,7 @@ end @testset "Momentum flux zero at immersed peripheral nodes" begin @info " Testing momentum flux is zero at immersed peripheral nodes..." for scheme in (Centered(order=2), UpwindBiased(order=3), WENO(order=5)) + scheme = Oceananigans.Advection.materialize_advection(scheme, MockGrid(arch)) @test test_momentum_flux_zero_at_peripheral_nodes(scheme) end end diff --git a/test/test_immersed_advection.jl b/test/test_immersed_advection.jl index f2a52f871ab..1d4f1c9693d 100644 --- a/test/test_immersed_advection.jl +++ b/test/test_immersed_advection.jl @@ -12,7 +12,8 @@ using Oceananigans.Advection: _biased_interpolate_yᵃᶠᵃ, FluxFormAdvection, LeftBias, - RightBias + RightBias, + materialize_advection linear_advection_schemes = [Centered, UpwindBiased] advection_schemes = [linear_advection_schemes... WENO] @@ -21,7 +22,7 @@ advection_schemes = [linear_advection_schemes... WENO] @inline advective_order(buffer, AdvectionType) = buffer * 2 - 1 function run_tracer_interpolation_test(c, ibg, scheme) - + scheme = materialize_advection(scheme, ibg) for j in 6:19, i in 6:19 if typeof(scheme) <: Centered @test @allowscalar _symmetric_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, c) ≈ 1.0 @@ -35,7 +36,7 @@ function run_tracer_interpolation_test(c, ibg, scheme) end function run_tracer_conservation_test(grid, scheme) - + scheme = materialize_advection(scheme, grid) model = HydrostaticFreeSurfaceModel(grid; tracers = :c, free_surface = ExplicitFreeSurface(), tracer_advection = scheme) @@ -65,6 +66,7 @@ function run_tracer_conservation_test(grid, scheme) end function run_momentum_interpolation_test(u, v, ibg, scheme) + scheme = materialize_advection(scheme, ibg) # ensure also immersed boundaries have a value of 1 interior(u, 6, :, 1) .= 1.0 diff --git a/test/test_materialize_advection.jl b/test/test_materialize_advection.jl new file mode 100644 index 00000000000..c82293a1103 --- /dev/null +++ 
b/test/test_materialize_advection.jl @@ -0,0 +1,39 @@ +include("dependencies_for_runtests.jl") + +using Oceananigans.Advection: materialize_advection +using Oceananigans.Utils: NormalDivision, ConvertingDivision, BackendOptimizedDivision + +@testset "materialize weno scheme chain with placeholders" begin + + # Construct an advection chain by hand with intermediate non-WENO buffer schemes + level_0 = Centered(order = 2) + level_1 = + WENO(Float64; order = 3, weight_computation = Nothing, buffer_scheme = level_0) + level_2 = Centered(Float64; order = 4, buffer_scheme = level_1) + level_3 = UpwindBiased(Float64; order = 5, buffer_scheme = level_2) + level_4 = + WENO(Float64; order = 5, weight_computation = Nothing, buffer_scheme = level_3) + level_5 = WENO( + Float64; + order = 7, + weight_computation = NormalDivision, + buffer_scheme = level_4, + ) + + # Materialize using materialize_advection; Nothing WCTs are replaced by the global default + # (Oceananigans.defaults.weno_weight_computation == BackendOptimizedDivision) + materialized = materialize_advection(level_5, MockGrid(CPU())) + + # Check that all WENO schemes in the chain have the correct weight computation type + get_nth_buffer_scheme(scheme, n) = + n == 1 ? 
scheme : get_nth_buffer_scheme(scheme.buffer_scheme, n - 1) + get_weight_computation(::WENO{<:Any,<:Any,WCT}) where {WCT} = WCT + + + @test NormalDivision == get_nth_buffer_scheme(materialized, 1) |> get_weight_computation + @test BackendOptimizedDivision == + get_nth_buffer_scheme(materialized, 2) |> get_weight_computation + @test BackendOptimizedDivision == + get_nth_buffer_scheme(materialized, 5) |> get_weight_computation + +end diff --git a/test/test_newton_div.jl b/test/test_newton_div.jl new file mode 100644 index 00000000000..2d17eacfc1c --- /dev/null +++ b/test/test_newton_div.jl @@ -0,0 +1,73 @@ +include("dependencies_for_runtests.jl") + +using CUDA + +# Generate some random points in a single binade [1;2) interval +function test_data_in_single_binade(::Type{FT}, size) where {FT} + prng = Random.Xoshiro(44) + return rand(prng, FT, size) .+ 1 +end + +@testset "CPU newton_div: $FT $WCT" for (FT, WCT) in Iterators.product((Float32, Float64), + (Oceananigans.Utils.NormalDivision, + Oceananigans.Utils.ConvertingDivision{Float32})) + test_input = test_data_in_single_binade(FT, 1024) + + ref = similar(test_input) + output = similar(test_input) + + ref .= FT(π) ./ test_input + output .= Oceananigans.Utils.newton_div.(WCT, FT(π), test_input) + + @test isapprox(ref, output) +end + +if CUDA.functional() + + @testset "CUDA newton_div: $FT" for FT in (Float32, Float64) + test_input = CuArray(test_data_in_single_binade(FT, 1024)) + + WCT = Oceananigans.Utils.BackendOptimizedDivision + + ref = similar(test_input) + output = similar(test_input) + + ref .= FT(π) ./ test_input + output .= Oceananigans.Utils.newton_div.(WCT, FT(π), test_input) + + @test isapprox(ref, output) + end +end + +function append_weight_computation_type!(list, weno::WENO{<:Any, <:Any, WCT}) where {WCT} + push!(list, WCT) + append_weight_computation_type!(list, weno.buffer_scheme) +end +append_weight_computation_type!(::Any, ::Any) = nothing + +# Extract all weight computation types from WENO +# 
Assumes a non-weno buffer scheme will not have WENO buffer scheme +function get_weight_computation_from_weno_advection(weno::WENO) + weight_computation_types = DataType[] + append_weight_computation_type!(weight_computation_types, weno) + return weight_computation_types +end + +@testset "Verify WENO schemes construction" begin + + # WENO + weno5 = WENO(order=7; weight_computation=Oceananigans.Utils.NormalDivision) + weight_computation_types = get_weight_computation_from_weno_advection(weno5) + @test all(weight_computation_types .== Oceananigans.Utils.NormalDivision) + + # Vector Invariant WENO + vector_weno = WENOVectorInvariant(order=9, weight_computation=Oceananigans.Utils.BackendOptimizedDivision) + + for field_name in fieldnames(typeof(vector_weno)) + field = getfield(vector_weno, field_name) + if field isa WENO + weight_computation_types = get_weight_computation_from_weno_advection(field) + @test all(weight_computation_types .== Oceananigans.Utils.BackendOptimizedDivision) + end + end +end diff --git a/test/test_reactant_correctness.jl b/test/test_reactant_correctness.jl index 7c46ac5270f..5aa504c7da2 100644 --- a/test/test_reactant_correctness.jl +++ b/test/test_reactant_correctness.jl @@ -178,6 +178,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @testset "coriolis=nothing" begin @info "Testing compute_simple_Gu! on RectilinearGrid($topo) with coriolis=nothing..." for advection in advection_schemes + # We want to select deferred settings based on Reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -199,6 +201,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @info "Testing compute_simple_Gu! on RectilinearGrid($topo) with FPlane Coriolis..." 
coriolis = FPlane(f=1e-4) for advection in advection_schemes + # We want to select deferred settings based on Reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -248,6 +252,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @testset "coriolis=nothing" begin @info "Testing compute_simple_Gu! on LatitudeLongitudeGrid with coriolis=nothing..." for advection in advection_schemes + # We want to select deferred settings based on Reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0) @@ -269,6 +275,8 @@ all_combos(xs...) = vec(collect(Iterators.product(xs...))) @info "Testing compute_simple_Gu! on LatitudeLongitudeGrid with HydrostaticSphericalCoriolis..." coriolis = HydrostaticSphericalCoriolis() for advection in advection_schemes + # We want to select deferred settings based on Reactant + advection = Oceananigans.Advection.materialize_advection(advection, reactant_grid) @testset "advection=$(adv_name(advection))" begin fill!(vanilla_Gu, 0) fill!(reactant_Gu, 0)