Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion CUDACore/src/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,9 +395,26 @@ end
@device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))

@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)

@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x
@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)

@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
@device_override function FastMath.inv_fast(x::Float64)
# Get the approximate reciprocal
# https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
# This instruction chops off last 32bits of mantissa and computes inverse
# while treating all subnormal numbers as 0.0
# If reciprocal would be subnormal, underflows to 0.0
# 32 least significant bits of the result are filled with 0s
inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)

# Approximate the missing 32bits of mantissa with a single cubic iteration
e = fma(inv_x, -x, 1.0)
e = fma(e, e, e)
inv_x = fma(e, inv_x, inv_x)
end

## distributions

Expand Down
26 changes: 2 additions & 24 deletions perf/volumerhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,28 +37,6 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
end
end

let (jlf, f) = (:div_arcp, :div)
for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
ir = """
%x = f$f fast $llvmT %0, %1
ret $llvmT %x
"""
@eval begin
# the @pure is necessary so that we can constant propagate.
@inline Base.@pure function $jlf(a::$T, b::$T)
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
end
end
end
@eval function $jlf(args...)
Base.$jlf(args...)
end
end
rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call

# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
# rcp(x) = div_fast(one(x), x)

# note the order of the fields below is also assumed in the code.
const _nstate = 5
const _ρ, _U, _V, _W, _E = 1:_nstate
Expand Down Expand Up @@ -130,8 +108,8 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
# GPU performance trick
# Allow optimizations to use the reciprocal of an argument rather than perform division.
# IEEE floating-point division is implemented as a function call
ρinv = rcp(ρ)
ρ2inv = rcp(2ρ)
ρinv = inv(ρ)
ρ2inv = inv(2ρ)
# ρ2inv = 0.5f0 * pinv

P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)
Expand Down