JuliaGPU · maleadt · Apr 14, 2026 · Apr 2, 2026 · Apr 13, 2026
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -395,9 +395,26 @@ end
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
 @device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
+@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
 
 @device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
-@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x
+@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
+
+@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
+@device_override function FastMath.inv_fast(x::Float64)
+    # Get the approximate reciprocal
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
+    # This instruction chops off last 32bits of mantissa and computes inverse
+    # while treating all subnormal numbers as 0.0
+    # If reciprocal would be subnormal, underflows to 0.0
+    # 32 least significant bits of the result are filled with 0s
+    inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)
+
+    # Approximate the missing 32bits of mantissa with a single cubic iteration
+    e = fma(inv_x, -x, 1.0)
+    e = fma(e, e, e)
+    inv_x = fma(e, inv_x, inv_x)
+end
 
 ## distributions
 

diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
@@ -37,28 +37,6 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
     end
 end
 
-let (jlf, f) = (:div_arcp, :div)
-    for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
-        ir = """
-            %x = f$f fast $llvmT %0, %1
-            ret $llvmT %x
-        """
-        @eval begin
-            # the @pure is necessary so that we can constant propagate.
-            @inline Base.@pure function $jlf(a::$T, b::$T)
-                Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
-            end
-        end
-    end
-    @eval function $jlf(args...)
-        Base.$jlf(args...)
-    end
-end
-rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call
-
-# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-# rcp(x) = div_fast(one(x), x)
-
 # note the order of the fields below is also assumed in the code.
 const _nstate = 5
 const _ρ, _U, _V, _W, _E = 1:_nstate
@@ -130,8 +108,8 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
             # GPU performance trick
             # Allow optimizations to use the reciprocal of an argument rather than perform division.
             # IEEE floating-point division is implemented as a function call
-            ρinv = rcp(ρ)
-            ρ2inv = rcp(2ρ)
+            ρinv = inv(ρ)
+            ρ2inv = inv(2ρ)
             # ρ2inv = 0.5f0 * pinv
 
             P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)