diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index 710abb210d..d7bde024b9 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -395,9 +395,26 @@ end @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest)) @device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) +@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y) @device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x) -@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x +@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x) + +@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x) +@device_override function FastMath.inv_fast(x::Float64) + # Get the approximate reciprocal + # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + # This instruction chops off last 32bits of mantissa and computes inverse + # while treating all subnormal numbers as 0.0 + # If reciprocal would be subnormal, underflows to 0.0 + # 32 least significant bits of the result are filled with 0s + inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x) + + # Approximate the missing 32bits of mantissa with a single cubic iteration + e = fma(inv_x, -x, 1.0) + e = fma(e, e, e) + inv_x = fma(e, inv_x, inv_x) +end ## distributions diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 5c7737f578..b86b0ce414 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -37,28 +37,6 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) end end -let (jlf, f) = (:div_arcp, :div) - for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) - ir = """ - %x = f$f fast $llvmT %0, %1 - ret $llvmT %x - """ - @eval begin - # the @pure is necessary so that we can constant propagate. - @inline Base.@pure function $jlf(a::$T, b::$T) - Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) - end - end - end - @eval function $jlf(args...) - Base.$jlf(args...) - end -end -rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call - -# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -# rcp(x) = div_fast(one(x), x) - # note the order of the fields below is also assumed in the code. const _nstate = 5 const _ρ, _U, _V, _W, _E = 1:_nstate @@ -130,8 +108,8 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) # GPU performance trick # Allow optimizations to use the reciprocal of an argument rather than perform division. # IEEE floating-point division is implemented as a function call - ρinv = rcp(ρ) - ρ2inv = rcp(2ρ) + ρinv = inv(ρ) + ρ2inv = inv(2ρ) # ρ2inv = 0.5f0 * pinv P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)