Skip to content

Commit 04a39d7

Browse files
authored
Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor
2 parents 5d6044b + c27d64b commit 04a39d7

5 files changed

Lines changed: 187 additions & 5 deletions

File tree

CUDACore/src/device/intrinsics/math.jl

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
@public fma, rsqrt, saturate, byte_perm, assume
44

5-
using Base: FastMath
5+
using Base: FastMath, @assume_effects
66

77

88
## helpers
@@ -248,17 +248,42 @@ end
248248
@device_override Base.:(^)(x::Float64, y::Float64) = ccall("extern __nv_pow", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
249249
@device_override Base.:(^)(x::Float32, y::Float32) = ccall("extern __nv_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
250250
@device_override FastMath.pow_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
251+
# pow_fast: Base methods call llvm.powi which NVPTX cannot lower (#3065)
252+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float64, y::Integer)
253+
y == -1 && return inv(x)
254+
y == 0 && return one(x)
255+
y == 1 && return x
256+
y == 2 && return x*x
257+
y == 3 && return x*x*x
258+
x ^ y # no fast variant for Float64; uses __nv_powi
259+
end
260+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float32, y::Integer)
261+
y == -1 && return inv(x)
262+
y == 0 && return one(x)
263+
y == 1 && return x
264+
y == 2 && return x*x
265+
y == 3 && return x*x*x
266+
FastMath.pow_fast(x, Float32(y)) # uses __nv_fast_powf
267+
end
268+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float16, y::Integer)
269+
y == -1 && return inv(x)
270+
y == 0 && return one(x)
271+
y == 1 && return x
272+
y == 2 && return x*x
273+
y == 3 && return x*x*x
274+
Float16(FastMath.pow_fast(Float32(x), Float32(y)))
275+
end
251276
@device_override Base.:(^)(x::Float64, y::Int32) = ccall("extern __nv_powi", llvmcall, Cdouble, (Cdouble, Int32), x, y)
252277
@device_override Base.:(^)(x::Float32, y::Int32) = ccall("extern __nv_powif", llvmcall, Cfloat, (Cfloat, Int32), x, y)
253-
@device_override @inline function Base.:(^)(x::Float32, y::Int64)
278+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float32, y::Int64)
254279
y == -1 && return inv(x)
255280
y == 0 && return one(x)
256281
y == 1 && return x
257282
y == 2 && return x*x
258283
y == 3 && return x*x*x
259284
x ^ Float32(y)
260285
end
261-
@device_override @inline function Base.:(^)(x::Float64, y::Int64)
286+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float64, y::Int64)
262287
y == -1 && return inv(x)
263288
y == 0 && return one(x)
264289
y == 1 && return x
@@ -435,10 +460,14 @@ end
435460
@device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
436461
@device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
437462

438-
@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("extern __nv_fma", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
439-
@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("extern __nv_fmaf", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
463+
@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
464+
@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
440465
@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
441466

467+
@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
468+
@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
469+
@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
470+
442471
@device_function sad(x::Int32, y::Int32, z::Int32) = ccall("extern __nv_sad", llvmcall, Int32, (Int32, Int32, Int32), x, y, z)
443472
@device_function sad(x::UInt32, y::UInt32, z::UInt32) = convert(UInt32, ccall("extern __nv_usad", llvmcall, Int32, (Int32, Int32, Int32), x, y, z))
444473

CUDACore/src/device/quirks.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,7 @@ for op in (:(<), :(<=), :cmp)
103103
@device_override Base.$op(q::Rational, x::AbstractFloat) = $op(float(q), x)
104104
end
105105
end
106+
107+
# reshape.jl
108+
@device_override Base._throw_dmrs(n, str, dims) =
109+
@gputhrow "DimensionMismatch" "Dimensions mismatch when reshaping. New dimensions must be consistent with array size"

test/core/codegen.jl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,31 @@ end
2222
@test !occursin("@__nv_fmaf", ir)
2323
end
2424

25+
@testset "fma uses LLVM intrinsic" begin
26+
function fma_kernel(ptr)
27+
unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
28+
return
29+
end
30+
31+
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
32+
ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
33+
@test occursin("llvm.fma.$suffix", ir)
34+
@test !occursin("__nv_fma", ir)
35+
end
36+
end
37+
38+
@testset "muladd uses LLVM intrinsic" begin
39+
function muladd_kernel(ptr)
40+
unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
41+
return
42+
end
43+
44+
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
45+
ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
46+
@test occursin("llvm.fmuladd.$suffix", ir)
47+
end
48+
end
49+
2550
@testset "assume" begin
2651
foo(i) = cld(42, i)
2752
ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
@@ -180,6 +205,29 @@ end
180205
@test occursin("sqrt.approx.ftz", asm)
181206
end
182207

208+
@testset "fma/muladd emit fma.rn" begin
209+
# fma and muladd should both lower to fma.rn in PTX
210+
function fma_kernel(a, b, c)
211+
@inbounds a[] = fma(b[], c[], a[])
212+
return
213+
end
214+
function muladd_kernel(a, b, c)
215+
@inbounds a[] = muladd(b[], c[], a[])
216+
return
217+
end
218+
219+
for T in (Float16, Float32, Float64)
220+
asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
221+
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
222+
@test occursin("fma.rn", asm)
223+
@test !occursin("__nv_fma", asm)
224+
225+
asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
226+
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
227+
@test occursin("fma.rn", asm)
228+
end
229+
end
230+
183231
end
184232

185233
############################################################################################

test/core/device/array.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,24 @@ end
146146
@test array == Array(array_dev)
147147
end
148148

149+
@testset "reshape of view" begin
150+
function kernel(out, data, n)
151+
i = threadIdx().x
152+
if i <= n * n
153+
mat = reshape(@view(data[1:n*n]), (n, n))
154+
out[i] = mat[i]
155+
end
156+
return
157+
end
158+
159+
n = 4
160+
data = CuArray(Float32.(1:n*n))
161+
out = CUDA.zeros(Float32, n * n)
162+
163+
@cuda threads=n*n kernel(out, data, n)
164+
@test Array(out) == Float32.(1:n*n)
165+
end
166+
149167
@testset "non-Int index to unsafe_load" begin
150168
function kernel(a)
151169
a[UInt64(1)] = 1

test/core/device/intrinsics/math.jl

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ using SpecialFunctions
8585
end
8686
end
8787

88+
@testset "muladd" begin
89+
for T in (Float16, Float32, Float64)
90+
@test testf((x,y,z)->muladd.(x,y,z), rand(T, 1), rand(T, 1), rand(T, 1))
91+
@test testf((x,y,z)->muladd.(x,y,z), rand(T, 1), -rand(T, 1), -rand(T, 1))
92+
end
93+
end
94+
8895
# something from SpecialFunctions.jl
8996
@testset "erf" begin
9097
@test testf(a->SpecialFunctions.erf.(a), Float32[1.0])
@@ -112,6 +119,20 @@ using SpecialFunctions
112119
# JuliaGPU/CUDA.jl#2886: LLVM below v18 emits non-existing min.NaN.f64/max.NaN.f64
113120
f(a, b) = @fastmath max(a, b)
114121
@test Array(map(f, CuArray([1.0, 2.0]), CuArray([4.0, 3.0]))) == [4.0, 3.0]
122+
123+
# JuliaGPU/CUDA.jl#3065: pow_fast with integer exponent used unsupported llvm.powi
124+
function fastpow_kernel(A, y)
125+
i = threadIdx().x
126+
@inbounds @fastmath A[i] = A[i]^y
127+
return nothing
128+
end
129+
for T in (Float32, Float64)
130+
A = CUDA.ones(T, 4)
131+
@cuda threads=4 fastpow_kernel(A, Int32(3))
132+
@test Array(A) == ones(T, 4)
133+
@cuda threads=4 fastpow_kernel(A, 3)
134+
@test Array(A) == ones(T, 4)
135+
end
115136
end
116137

117138
@testset "byte_perm" begin
@@ -153,6 +174,68 @@ using SpecialFunctions
153174
@assert !contains(asm, "__nv") # from libdevice
154175
end
155176

177+
@testset "inv" begin
178+
# Base.inv should use accurate rcp instructions (rcp.rn)
179+
for T in (Float32, Float64)
180+
@test testf(x -> inv.(x), rand(T, 10) .+ T(0.1))
181+
@test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0])
182+
end
183+
184+
function kernel_inv_f32(a)
185+
@inbounds a[] = inv(a[])
186+
return
187+
end
188+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
189+
@test contains(asm, "rcp.rn.f32")
190+
191+
function kernel_inv_f64(a)
192+
@inbounds a[] = inv(a[])
193+
return
194+
end
195+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
196+
@test contains(asm, "rcp.rn.f64")
197+
end
198+
199+
@testset "inv_fast" begin
200+
# inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa)
201+
function kernel_inv_fast_f32(a)
202+
@inbounds a[] = @fastmath inv(a[])
203+
return
204+
end
205+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
206+
@test contains(asm, "rcp.approx.ftz.f32")
207+
208+
fast_inv(x) = @fastmath inv(x)
209+
xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
210+
@test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4
211+
212+
# inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson
213+
function kernel_inv_fast_f64(a)
214+
@inbounds a[] = @fastmath inv(a[])
215+
return
216+
end
217+
asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
218+
@test contains(asm, "rcp.approx.ftz.f64")
219+
220+
xs64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
221+
@test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10
222+
end
223+
224+
@testset "div_fast Float64" begin
225+
# FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y)
226+
function kernel_div_fast_f64(a, b, c)
227+
@inbounds c[] = @fastmath a[] / b[]
228+
return
229+
end
230+
asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}}))
231+
@test contains(asm, "rcp.approx.ftz.f64")
232+
233+
fast_div(x, y) = @fastmath x / y
234+
xs = rand(Float64, 10) .+ 0.1
235+
ys = rand(Float64, 10) .+ 0.1
236+
@test Array(map(fast_div, CuArray(xs), CuArray(ys))) ≈ xs ./ ys rtol = 1.0e-10
237+
end
238+
156239
@testset "JuliaGPU/CUDA.jl#2111: min/max should return NaN" begin
157240
for T in [Float32, Float64]
158241
AT = CuArray{T}

0 commit comments

Comments
 (0)