Skip to content

Commit 04a39d7

Browse files
authored
Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor
2 parents 5d6044b + c27d64b commit 04a39d7

5 files changed

Lines changed: 187 additions & 5 deletions

File tree

CUDACore/src/device/intrinsics/math.jl

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
@public fma, rsqrt, saturate, byte_perm, assume
44

5-
using Base: FastMath
5+
using Base: FastMath, @assume_effects
66

77

88
## helpers
@@ -248,17 +248,42 @@ end
248248
@device_override Base.:(^)(x::Float64, y::Float64) = ccall("extern __nv_pow", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
249249
@device_override Base.:(^)(x::Float32, y::Float32) = ccall("extern __nv_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
250250
@device_override FastMath.pow_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
251+
# pow_fast: Base methods call llvm.powi which NVPTX cannot lower (#3065)
252+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float64, y::Integer)
253+
y == -1 && return inv(x)
254+
y == 0 && return one(x)
255+
y == 1 && return x
256+
y == 2 && return x*x
257+
y == 3 && return x*x*x
258+
x ^ y # no fast variant for Float64; uses __nv_powi
259+
end
260+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float32, y::Integer)
261+
y == -1 && return inv(x)
262+
y == 0 && return one(x)
263+
y == 1 && return x
264+
y == 2 && return x*x
265+
y == 3 && return x*x*x
266+
FastMath.pow_fast(x, Float32(y)) # uses __nv_fast_powf
267+
end
268+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float16, y::Integer)
269+
y == -1 && return inv(x)
270+
y == 0 && return one(x)
271+
y == 1 && return x
272+
y == 2 && return x*x
273+
y == 3 && return x*x*x
274+
Float16(FastMath.pow_fast(Float32(x), Float32(y)))
275+
end
251276
@device_override Base.:(^)(x::Float64, y::Int32) = ccall("extern __nv_powi", llvmcall, Cdouble, (Cdouble, Int32), x, y)
252277
@device_override Base.:(^)(x::Float32, y::Int32) = ccall("extern __nv_powif", llvmcall, Cfloat, (Cfloat, Int32), x, y)
253-
@device_override @inline function Base.:(^)(x::Float32, y::Int64)
278+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float32, y::Int64)
254279
y == -1 && return inv(x)
255280
y == 0 && return one(x)
256281
y == 1 && return x
257282
y == 2 && return x*x
258283
y == 3 && return x*x*x
259284
x ^ Float32(y)
260285
end
261-
@device_override @inline function Base.:(^)(x::Float64, y::Int64)
286+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float64, y::Int64)
262287
y == -1 && return inv(x)
263288
y == 0 && return one(x)
264289
y == 1 && return x
@@ -435,10 +460,14 @@ end
435460
@device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
436461
@device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
437462

438-
@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("extern __nv_fma", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
439-
@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("extern __nv_fmaf", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
463+
@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
464+
@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
440465
@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
441466

467+
@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
468+
@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
469+
@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
470+
442471
@device_function sad(x::Int32, y::Int32, z::Int32) = ccall("extern __nv_sad", llvmcall, Int32, (Int32, Int32, Int32), x, y, z)
443472
@device_function sad(x::UInt32, y::UInt32, z::UInt32) = convert(UInt32, ccall("extern __nv_usad", llvmcall, Int32, (Int32, Int32, Int32), x, y, z))
444473

CUDACore/src/device/quirks.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,7 @@ for op in (:(<), :(<=), :cmp)
103103
@device_override Base.$op(q::Rational, x::AbstractFloat) = $op(float(q), x)
104104
end
105105
end
106+
107+
# reshape.jl
108+
@device_override Base._throw_dmrs(n, str, dims) =
109+
@gputhrow "DimensionMismatch" "Dimensions mismatch when reshaping. New dimensions must be consistent with array size"

test/core/codegen.jl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,31 @@ end
2222
@test !occursin("@__nv_fmaf", ir)
2323
end
2424

25+
@testset "fma uses LLVM intrinsic" begin
26+
function fma_kernel(ptr)
27+
unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
28+
return
29+
end
30+
31+
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
32+
ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
33+
@test occursin("llvm.fma.$suffix", ir)
34+
@test !occursin("__nv_fma", ir)
35+
end
36+
end
37+
38+
@testset "muladd uses LLVM intrinsic" begin
39+
function muladd_kernel(ptr)
40+
unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
41+
return
42+
end
43+
44+
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
45+
ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
46+
@test occursin("llvm.fmuladd.$suffix", ir)
47+
end
48+
end
49+
2550
@testset "assume" begin
2651
foo(i) = cld(42, i)
2752
ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
@@ -180,6 +205,29 @@ end
180205
@test occursin("sqrt.approx.ftz", asm)
181206
end
182207

208+
@testset "fma/muladd emit fma.rn" begin
209+
# fma and muladd should both lower to fma.rn in PTX
210+
function fma_kernel(a, b, c)
211+
@inbounds a[] = fma(b[], c[], a[])
212+
return
213+
end
214+
function muladd_kernel(a, b, c)
215+
@inbounds a[] = muladd(b[], c[], a[])
216+
return
217+
end
218+
219+
for T in (Float16, Float32, Float64)
220+
asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
221+
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
222+
@test occursin("fma.rn", asm)
223+
@test !occursin("__nv_fma", asm)
224+
225+
asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
226+
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
227+
@test occursin("fma.rn", asm)
228+
end
229+
end
230+
183231
end
184232

185233
############################################################################################

test/core/device/array.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,24 @@ end
146146
@test array == Array(array_dev)
147147
end
148148

149+
@testset "reshape of view" begin
150+
function kernel(out, data, n)
151+
i = threadIdx().x
152+
if i <= n * n
153+
mat = reshape(@view(data[1:n*n]), (n, n))
154+
out[i] = mat[i]
155+
end
156+
return
157+
end
158+
159+
n = 4
160+
data = CuArray(Float32.(1:n*n))
161+
out = CUDA.zeros(Float32, n * n)
162+
163+
@cuda threads=n*n kernel(out, data, n)
164+
@test Array(out) == Float32.(1:n*n)
165+
end
166+
149167
@testset "non-Int index to unsafe_load" begin
150168
function kernel(a)
151169
a[UInt64(1)] = 1

test/core/device/intrinsics/math.jl

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ using SpecialFunctions
8585
end
8686
end
8787

88+
@testset "muladd" begin
89+
for T in (Float16, Float32, Float64)
90+
@test testf((x,y,z)->muladd.(x,y,z), rand(T, 1), rand(T, 1), rand(T, 1))
91+
@test testf((x,y,z)->muladd.(x,y,z), rand(T, 1), -rand(T, 1), -rand(T, 1))
92+
end
93+
end
94+
8895
# something from SpecialFunctions.jl
8996
@testset "erf" begin
9097
@test testf(a->SpecialFunctions.erf.(a), Float32[1.0])
@@ -112,6 +119,20 @@ using SpecialFunctions
112119
# JuliaGPU/CUDA.jl#2886: LLVM below v18 emits non-existing min.NaN.f64/max.NaN.f64
113120
f(a, b) = @fastmath max(a, b)
114121
@test Array(map(f, CuArray([1.0, 2.0]), CuArray([4.0, 3.0]))) == [4.0, 3.0]
122+
123+
# JuliaGPU/CUDA.jl#3065: pow_fast with integer exponent used unsupported llvm.powi
124+
function fastpow_kernel(A, y)
125+
i = threadIdx().x
126+
@inbounds @fastmath A[i] = A[i]^y
127+
return nothing
128+
end
129+
for T in (Float32, Float64)
130+
A = CUDA.ones(T, 4)
131+
@cuda threads=4 fastpow_kernel(A, Int32(3))
132+
@test Array(A) == ones(T, 4)
133+
@cuda threads=4 fastpow_kernel(A, 3)
134+
@test Array(A) == ones(T, 4)
135+
end
115136
end
116137

117138
@testset "byte_perm" begin
@@ -153,6 +174,68 @@ using SpecialFunctions
153174
@assert !contains(asm, "__nv") # from libdevice
154175
end
155176

177+
@testset "inv" begin
178+
# Base.inv should use accurate rcp instructions (rcp.rn)
179+
for T in (Float32, Float64)
180+
@test testf(x -> inv.(x), rand(T, 10) .+ T(0.1))
181+
@test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0])
182+
end
183+
184+
function kernel_inv_f32(a)
185+
@inbounds a[] = inv(a[])
186+
return
187+
end
188+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
189+
@test contains(asm, "rcp.rn.f32")
190+
191+
function kernel_inv_f64(a)
192+
@inbounds a[] = inv(a[])
193+
return
194+
end
195+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
196+
@test contains(asm, "rcp.rn.f64")
197+
end
198+
199+
@testset "inv_fast" begin
200+
# inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa)
201+
function kernel_inv_fast_f32(a)
202+
@inbounds a[] = @fastmath inv(a[])
203+
return
204+
end
205+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
206+
@test contains(asm, "rcp.approx.ftz.f32")
207+
208+
fast_inv(x) = @fastmath inv(x)
209+
xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
210+
@test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4
211+
212+
# inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson
213+
function kernel_inv_fast_f64(a)
214+
@inbounds a[] = @fastmath inv(a[])
215+
return
216+
end
217+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
218+
@test contains(asm, "rcp.approx.ftz.f64")
219+
220+
xs64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
221+
@test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10
222+
end
223+
224+
@testset "div_fast Float64" begin
225+
# FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y)
226+
function kernel_div_fast_f64(a, b, c)
227+
@inbounds c[] = @fastmath a[] / b[]
228+
return
229+
end
230+
asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}}))
231+
@test contains(asm, "rcp.approx.ftz.f64")
232+
233+
fast_div(x, y) = @fastmath x / y
234+
xs = rand(Float64, 10) .+ 0.1
235+
ys = rand(Float64, 10) .+ 0.1
236+
@test Array(map(fast_div, CuArray(xs), CuArray(ys))) ≈ xs ./ ys rtol = 1.0e-10
237+
end
238+
156239
@testset "JuliaGPU/CUDA.jl#2111: min/max should return NaN" begin
157240
for T in [Float32, Float64]
158241
AT = CuArray{T}

0 commit comments

Comments
 (0)