Implement div_fast using fast rcp

vchuravy · Mikolaj-A-Kowalski · efaulhaber · vchuravy · commit a1adfb4a394a · 2026-04-13T15:59:13.000+02:00
Co-authored-by: M. A. Kowalski &lt;mak60@cam.ac.uk&gt;
Co-authored-by: Erik Faulhaber &lt;erik.faulhaber@web.de&gt;
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -395,9 +395,24 @@ end
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
 @device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
+@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)  # approximates x / y by multiplying with the fast reciprocal of y
 
 @device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
-@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x
+@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Cfloat, (Cfloat,), x)
+@device_override function FastMath.inv_fast(x::Float64)
+    # Seed with the hardware approximate reciprocal, see
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
+    # The instruction ignores the low 32 bits of the input mantissa, treats every
+    # subnormal input as 0.0, flushes a would-be subnormal reciprocal to 0.0, and
+    # returns a result whose 32 least significant bits are all zero.
+    seed = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Cdouble, (Cdouble,), x)
+
+    # Recover the missing low mantissa bits with one cubic refinement step:
+    # with r = 1 - seed * x, the result is seed * (1 + r + r^2).
+    residual = fma(seed, -x, 1.0)
+    correction = fma(residual, residual, residual)
+    return fma(correction, seed, seed)
+end
 
 ## distributions