Skip to content

Commit 14712fa

Browse files
Merge branch 'master' into add_reshape_view_dispatch
2 parents 774cffc + 5065018 commit 14712fa

4 files changed

Lines changed: 237 additions & 25 deletions

File tree

CUDACore/src/device/intrinsics/math.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,26 @@ end
395395
@device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
396396

397397
# Fast Float32 division lowers to the dedicated libdevice fast-divide intrinsic.
@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
398+
# Added: fast Float64 division synthesized from the fast reciprocal defined below
# (no single fast-divide libdevice call is used here).
@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
398399

399400
# Round-to-nearest reciprocals via libdevice (__nv_frcp_rn / __nv_drcp_rn).
@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
400-
# Removed by this commit: the generic @fastmath fallback, superseded by the
# dedicated Float32/Float64 methods added below.
@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x
401+
@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
402+

403+
# Approximate flush-to-zero Float32 reciprocal (NVVM rcp.approx.ftz.f).
@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
404+
@device_override function FastMath.inv_fast(x::Float64)
    # Cheap hardware estimate of 1/x:
    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
    # The instruction ignores the low 32 bits of the mantissa (the result's 32
    # least significant bits come back as zeros), treats subnormal inputs as
    # 0.0, and underflows subnormal reciprocals to 0.0.
    r = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)

    # Recover the missing mantissa bits with a single cubic refinement step:
    #   err = 1 - x*r;  r' = r + (err + err^2)*r
    err = fma(r, -x, 1.0)
    err = fma(err, err, err)
    return fma(err, r, r)
end
401418

402419
## distributions
403420

lib/cusparse/src/array.jl

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,94 @@ function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, i0::Integer, i1::Integer,
521521
CuSparseMatrixCSR(A.rowPtr[:,idxs...], A.colVal[:,idxs...], nonzeros(A)[:,idxs...], size(A)[1:2])[i0, i1]
522522
end
523523

524+
# slice matrix by masking rows and columns
function Base.getindex(A::CuSparseMatrixCSR{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
    @boundscheck checkbounds(A, Imask, Jmask)

    m, n = size(A)
    # Inclusive prefix sums turn the masks into old-index → new-index maps:
    # rowmap[i] is the output row of source row i (meaningful only where Imask[i]).
    rowmap = cumsum(Ti.(Imask))
    colmap = cumsum(Ti.(Jmask))
    # Guard the scalar reads: an empty CuVector has no `end` element.
    new_m = m > 0 ? Int(CUDACore.@allowscalar rowmap[end]) : 0
    new_n = n > 0 ? Int(CUDACore.@allowscalar colmap[end]) : 0

    # Pass 1: count the surviving entries of every kept row.
    counts = CUDACore.zeros(Ti, new_m)
    if new_m > 0 && new_n > 0
        nthreads = min(256, m)
        nblocks = cld(m, nthreads)
        @cuda threads = nthreads blocks = nblocks _csr_count_kernel!(counts, A.rowPtr, A.colVal, Imask, Jmask, rowmap)
    end

    # Assemble the new row pointer as [1, 1 .+ cumsum(counts)...].
    new_rowPtr = vcat(CuVector{Ti}([one(Ti)]), cumsum(counts) .+ one(Ti))
    new_nnz = Int(CUDACore.@allowscalar new_rowPtr[end]) - 1

    # Pass 2: write the kept column indices and values.
    new_colVal = CuVector{Ti}(undef, new_nnz)
    new_nzVal = CuVector{Tv}(undef, new_nnz)
    if new_nnz > 0
        nthreads = min(256, m)
        nblocks = cld(m, nthreads)
        @cuda threads = nthreads blocks = nblocks _csr_fill_kernel!(
            new_colVal, new_nzVal, new_rowPtr, A.rowPtr, A.colVal, A.nzVal,
            Imask, Jmask, rowmap, colmap
        )
    end

    return CuSparseMatrixCSR{Tv, Ti}(new_rowPtr, new_colVal, new_nzVal, (new_m, new_n))
end
561+
562+
# CSR: one thread per original row — count entries where column is selected
function _csr_count_kernel!(counts, rowPtr, colVal, Imask, Jmask, rowmap)
    row = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    if row <= length(Imask)
        @inbounds if Imask[row]
            kept = zero(eltype(counts))
            # Walk this row's stored entries; keep those whose column survives the mask.
            for k in rowPtr[row]:(rowPtr[row + 1] - one(eltype(rowPtr)))
                if Jmask[colVal[k]]
                    kept += one(eltype(counts))
                end
            end
            counts[rowmap[row]] = kept
        end
    end
    return nothing
end
579+
580+
# CSR: one thread per original row — fill entries with remapped column indices
function _csr_fill_kernel!(
        new_colVal, new_nzVal, new_rowPtr, rowPtr, colVal, nzVal,
        Imask, Jmask, rowmap, colmap
    )
    row = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    if row <= length(Imask)
        @inbounds if Imask[row]
            # First free slot for this row in the output arrays.
            pos = new_rowPtr[rowmap[row]]
            for k in rowPtr[row]:(rowPtr[row + 1] - one(eltype(rowPtr)))
                c = colVal[k]
                if Jmask[c]
                    new_colVal[pos] = colmap[c]
                    new_nzVal[pos] = nzVal[k]
                    pos += one(eltype(new_rowPtr))
                end
            end
        end
    end
    return nothing
end
601+
602+
# CSC: reinterpret as transposed CSR, index with swapped masks, reinterpret back.
# A CSC (colPtr, rowVal, nzVal, (m,n)) is the same layout as CSR (rowPtr, colVal, nzVal, (n,m)).
function Base.getindex(A::CuSparseMatrixCSC{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
    @boundscheck checkbounds(A, Imask, Jmask)
    # Zero-copy reinterpretation: the CSC buffers double as CSR buffers of Aᵀ.
    transposed = CuSparseMatrixCSR{Tv, Ti}(A.colPtr, A.rowVal, A.nzVal, reverse(size(A)))
    sliced = transposed[Jmask, Imask]
    # Reinterpret the sliced CSR result back into CSC layout.
    return CuSparseMatrixCSC{Tv, Ti}(sliced.rowPtr, sliced.colVal, sliced.nzVal, reverse(size(sliced)))
end
610+
611+
524612
## interop with sparse CPU arrays
525613

526614
# cpu to gpu

lib/cusparse/test/interfaces.jl

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,4 +329,133 @@ nB = 2
329329
@test ref_cuda_sparse.colPtr == cuda_spdiagm.colPtr
330330
end
331331
end
332+
333+
# NOTE(review): the approximate-equality operator `≈` was dropped from several
# `@test` lines in the captured source (e.g. `@test S_cpu collect(S_csc)`),
# which is not valid Julia; it is restored below.
@testset "getindex with boolean masks" begin
    # `elty`, `m`, `n` come from the enclosing test scope — TODO confirm.
    A = sprand(elty, m, n, 0.4)
    rowmask = rand(Bool, m)
    colmask = rand(Bool, n)
    S_cpu = A[rowmask, colmask]

    rowmask_d = CuVector(rowmask)
    colmask_d = CuVector(colmask)

    # test slicing of CSC format
    A_csc = CuSparseMatrixCSC(A)
    S_csc = A_csc[rowmask_d, colmask_d]
    @test S_csc isa CuSparseMatrixCSC
    @test S_cpu ≈ collect(S_csc)

    # test slicing of CSR format
    # Conversion between CSC and CSR is broken in many ways on CUDA 12.0,
    # therefore we construct the CSR matrix manually from the transposed CSC.
    Aᵀ_csc = CuSparseMatrixCSC(Transpose(A))
    A_csr = CuSparseMatrixCSR{eltype(A), Int32}(
        copy(Aᵀ_csc.colPtr), # rowPtr is the same as colPtr of the transposed CSC
        copy(Aᵀ_csc.rowVal), # colVal is the same as rowVal of the transposed CSC
        copy(Aᵀ_csc.nzVal),  # nzVal is unchanged by transposition
        size(A)
    )
    # collect calls CSR→CSC conversion again which is broken, so we test on scalar level
    CUDA.@allowscalar for i in eachindex(A, A_csr)
        @test A[i] ≈ A_csr[i]
    end
    S_csr = A_csr[rowmask_d, colmask_d]
    @test S_csr isa CuSparseMatrixCSR
    CUDA.@allowscalar for i in eachindex(S_cpu, S_csr)
        @test S_cpu[i] ≈ S_csr[i]
    end

    # wrong mask size: throws BoundsError for both too-long and too-short, matching the behaviour of dense Array.
    @test_throws BoundsError A_csc[CuVector(trues(m + 1)), colmask_d]
    @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n + 1))]
    @test_throws BoundsError A_csc[CuVector(trues(m - 1)), colmask_d]
    @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n - 1))]
    @test_throws BoundsError A_csr[CuVector(trues(m + 1)), colmask_d]
    @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n + 1))]
    @test_throws BoundsError A_csr[CuVector(trues(m - 1)), colmask_d]
    @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n - 1))]

    # empty mask (all zeros): cumsum gives all-zero rowmap/colmap, new_m or new_n = 0,
    # both kernels are guarded by `new_m > 0 && new_n > 0`, so nothing executes.
    # new_rowPtr collapses to [1] (or all-ones), nnz = 0. Same as CPU SparseArrays.
    S_empty_rows_csc = A_csc[CuVector(falses(m)), CuVector(trues(n))]
    @test S_empty_rows_csc isa CuSparseMatrixCSC
    @test size(S_empty_rows_csc) == (0, n)
    @test nnz(S_empty_rows_csc) == 0

    S_empty_cols_csc = A_csc[CuVector(trues(m)), CuVector(falses(n))]
    @test S_empty_cols_csc isa CuSparseMatrixCSC
    @test size(S_empty_cols_csc) == (m, 0)
    @test nnz(S_empty_cols_csc) == 0

    S_empty_rows_csr = A_csr[CuVector(falses(m)), CuVector(trues(n))]
    @test S_empty_rows_csr isa CuSparseMatrixCSR
    @test size(S_empty_rows_csr) == (0, n)
    @test nnz(S_empty_rows_csr) == 0

    S_empty_cols_csr = A_csr[CuVector(trues(m)), CuVector(falses(n))]
    @test S_empty_cols_csr isa CuSparseMatrixCSR
    @test size(S_empty_cols_csr) == (m, 0)
    @test nnz(S_empty_cols_csr) == 0

    S_empty_both_csc = A_csc[CuVector(falses(m)), CuVector(falses(n))]
    @test S_empty_both_csc isa CuSparseMatrixCSC
    @test size(S_empty_both_csc) == (0, 0)
    @test nnz(S_empty_both_csc) == 0

    S_empty_both_csr = A_csr[CuVector(falses(m)), CuVector(falses(n))]
    @test S_empty_both_csr isa CuSparseMatrixCSR
    @test size(S_empty_both_csr) == (0, 0)
    @test nnz(S_empty_both_csr) == 0

    # all-ones mask: rowmap = 1:m, colmap = 1:n, both kernels run unfiltered.
    # Result should equal the full matrix. Same as CPU SparseArrays.
    S_all_csc = A_csc[CuVector(trues(m)), CuVector(trues(n))]
    @test S_all_csc isa CuSparseMatrixCSC
    @test collect(S_all_csc) ≈ Matrix(A)

    S_all_csr = A_csr[CuVector(trues(m)), CuVector(trues(n))]
    @test S_all_csr isa CuSparseMatrixCSR
    CUDA.@allowscalar for i in eachindex(A, S_all_csr)
        @test A[i] ≈ S_all_csr[i]
    end

    # zero-dimension matrix: accessing rowmap[end] / colmap[end] on an empty CuVector
    # would crash without the `m > 0` / `n > 0` guard in the implementation.
    A_zero_rows_csr = CuSparseMatrixCSR(spzeros(elty, 0, n))
    S_zr = A_zero_rows_csr[CuVector{Bool}([]), CuVector(trues(n))]
    @test S_zr isa CuSparseMatrixCSR
    @test size(S_zr) == (0, n)
    @test nnz(S_zr) == 0

    A_zero_cols_csr = CuSparseMatrixCSR(spzeros(elty, m, 0))
    S_zc = A_zero_cols_csr[CuVector(trues(m)), CuVector{Bool}([])]
    @test S_zc isa CuSparseMatrixCSR
    @test size(S_zc) == (m, 0)
    @test nnz(S_zc) == 0

    A_zero_both_csr = CuSparseMatrixCSR(spzeros(elty, 0, 0))
    S_zb = A_zero_both_csr[CuVector{Bool}([]), CuVector{Bool}([])]
    @test S_zb isa CuSparseMatrixCSR
    @test size(S_zb) == (0, 0)
    @test nnz(S_zb) == 0

    A_zero_rows_csc = CuSparseMatrixCSC(spzeros(elty, 0, n))
    S_zr_csc = A_zero_rows_csc[CuVector{Bool}([]), CuVector(trues(n))]
    @test S_zr_csc isa CuSparseMatrixCSC
    @test size(S_zr_csc) == (0, n)
    @test nnz(S_zr_csc) == 0

    A_zero_cols_csc = CuSparseMatrixCSC(spzeros(elty, m, 0))
    S_zc_csc = A_zero_cols_csc[CuVector(trues(m)), CuVector{Bool}([])]
    @test S_zc_csc isa CuSparseMatrixCSC
    @test size(S_zc_csc) == (m, 0)
    @test nnz(S_zc_csc) == 0

    A_zero_both_csc = CuSparseMatrixCSC(spzeros(elty, 0, 0))
    S_zb_csc = A_zero_both_csc[CuVector{Bool}([]), CuVector{Bool}([])]
    @test S_zb_csc isa CuSparseMatrixCSC
    @test size(S_zb_csc) == (0, 0)
    @test nnz(S_zb_csc) == 0
end
332461
end

perf/volumerhs.jl

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -37,28 +37,6 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
3737
end
3838
end
3939

40-
let (jlf, f) = (:div_arcp, :div)
41-
for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
42-
ir = """
43-
%x = f$f fast $llvmT %0, %1
44-
ret $llvmT %x
45-
"""
46-
@eval begin
47-
# the @pure is necessary so that we can constant propagate.
48-
@inline Base.@pure function $jlf(a::$T, b::$T)
49-
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
50-
end
51-
end
52-
end
53-
@eval function $jlf(args...)
54-
Base.$jlf(args...)
55-
end
56-
end
57-
rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call
58-
59-
# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
60-
# rcp(x) = div_fast(one(x), x)
61-
6240
# note the order of the fields below is also assumed in the code.
6341
const _nstate = 5
6442
const _ρ, _U, _V, _W, _E = 1:_nstate
@@ -130,8 +108,8 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
130108
# GPU performance trick
131109
# Allow optimizations to use the reciprocal of an argument rather than perform division.
132110
# IEEE floating-point division is implemented as a function call
133-
ρinv = rcp(ρ)
134-
ρ2inv = rcp(2ρ)
111+
ρinv = inv(ρ)
112+
ρ2inv = inv(2ρ)
135113
# ρ2inv = 0.5f0 * pinv
136114

137115
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)

0 commit comments

Comments
 (0)