Skip to content

Commit 14712fa

Browse files
Merge branch 'master' into add_reshape_view_dispatch
2 parents 774cffc + 5065018 commit 14712fa

4 files changed

Lines changed: 237 additions & 25 deletions

File tree

CUDACore/src/device/intrinsics/math.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,26 @@ end
395395
@device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
396396

397397
# Fast Float32 division lowers to the dedicated libdevice fast-divide intrinsic.
@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
398+
# Added: fast Float64 division synthesized from the fast reciprocal defined below
# (no single fast-divide libdevice call is used here).
@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
398399

399400
# Round-to-nearest reciprocals via libdevice (__nv_frcp_rn / __nv_drcp_rn).
@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
400-
# Removed by this commit: the generic @fastmath fallback, superseded by the
# dedicated Float32/Float64 methods added below.
@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = @fastmath one(x) / x
401+
@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
402+

403+
# Approximate flush-to-zero Float32 reciprocal (NVVM rcp.approx.ftz.f).
@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
404+
@device_override function FastMath.inv_fast(x::Float64)
    # Cheap hardware estimate of 1/x:
    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
    # The instruction ignores the low 32 bits of the mantissa (the result's 32
    # least significant bits come back as zeros), treats subnormal inputs as
    # 0.0, and underflows subnormal reciprocals to 0.0.
    r = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)

    # Recover the missing mantissa bits with a single cubic refinement step:
    #   err = 1 - x*r;  r' = r + (err + err^2)*r
    err = fma(r, -x, 1.0)
    err = fma(err, err, err)
    return fma(err, r, r)
end
401418

402419
## distributions
403420

lib/cusparse/src/array.jl

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,94 @@ function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, i0::Integer, i1::Integer,
521521
CuSparseMatrixCSR(A.rowPtr[:,idxs...], A.colVal[:,idxs...], nonzeros(A)[:,idxs...], size(A)[1:2])[i0, i1]
522522
end
523523

524+
# slice matrix by masking rows and columns
function Base.getindex(A::CuSparseMatrixCSR{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
    @boundscheck checkbounds(A, Imask, Jmask)

    m, n = size(A)
    # Inclusive prefix sums turn the masks into old-index → new-index maps:
    # rowmap[i] is the output row of source row i (meaningful only where Imask[i]).
    rowmap = cumsum(Ti.(Imask))
    colmap = cumsum(Ti.(Jmask))
    # Guard the scalar reads: an empty CuVector has no `end` element.
    new_m = m > 0 ? Int(CUDACore.@allowscalar rowmap[end]) : 0
    new_n = n > 0 ? Int(CUDACore.@allowscalar colmap[end]) : 0

    # Pass 1: count the surviving entries of every kept row.
    counts = CUDACore.zeros(Ti, new_m)
    if new_m > 0 && new_n > 0
        nthreads = min(256, m)
        nblocks = cld(m, nthreads)
        @cuda threads = nthreads blocks = nblocks _csr_count_kernel!(counts, A.rowPtr, A.colVal, Imask, Jmask, rowmap)
    end

    # Assemble the new row pointer as [1, 1 .+ cumsum(counts)...].
    new_rowPtr = vcat(CuVector{Ti}([one(Ti)]), cumsum(counts) .+ one(Ti))
    new_nnz = Int(CUDACore.@allowscalar new_rowPtr[end]) - 1

    # Pass 2: write the kept column indices and values.
    new_colVal = CuVector{Ti}(undef, new_nnz)
    new_nzVal = CuVector{Tv}(undef, new_nnz)
    if new_nnz > 0
        nthreads = min(256, m)
        nblocks = cld(m, nthreads)
        @cuda threads = nthreads blocks = nblocks _csr_fill_kernel!(
            new_colVal, new_nzVal, new_rowPtr, A.rowPtr, A.colVal, A.nzVal,
            Imask, Jmask, rowmap, colmap
        )
    end

    return CuSparseMatrixCSR{Tv, Ti}(new_rowPtr, new_colVal, new_nzVal, (new_m, new_n))
end
561+
562+
# CSR: one thread per original row — count entries where column is selected
function _csr_count_kernel!(counts, rowPtr, colVal, Imask, Jmask, rowmap)
    row = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    if row <= length(Imask)
        @inbounds if Imask[row]
            kept = zero(eltype(counts))
            # Walk this row's stored entries; keep those whose column survives the mask.
            for k in rowPtr[row]:(rowPtr[row + 1] - one(eltype(rowPtr)))
                if Jmask[colVal[k]]
                    kept += one(eltype(counts))
                end
            end
            counts[rowmap[row]] = kept
        end
    end
    return nothing
end
579+
580+
# CSR: one thread per original row — fill entries with remapped column indices
function _csr_fill_kernel!(
        new_colVal, new_nzVal, new_rowPtr, rowPtr, colVal, nzVal,
        Imask, Jmask, rowmap, colmap
    )
    row = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    if row <= length(Imask)
        @inbounds if Imask[row]
            # First free slot for this row in the output arrays.
            pos = new_rowPtr[rowmap[row]]
            for k in rowPtr[row]:(rowPtr[row + 1] - one(eltype(rowPtr)))
                c = colVal[k]
                if Jmask[c]
                    new_colVal[pos] = colmap[c]
                    new_nzVal[pos] = nzVal[k]
                    pos += one(eltype(new_rowPtr))
                end
            end
        end
    end
    return nothing
end
601+
602+
# CSC: reinterpret as transposed CSR, index with swapped masks, reinterpret back.
# A CSC (colPtr, rowVal, nzVal, (m,n)) is the same layout as CSR (rowPtr, colVal, nzVal, (n,m)).
function Base.getindex(A::CuSparseMatrixCSC{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
    @boundscheck checkbounds(A, Imask, Jmask)
    # Zero-copy reinterpretation: the CSC buffers double as CSR buffers of Aᵀ.
    transposed = CuSparseMatrixCSR{Tv, Ti}(A.colPtr, A.rowVal, A.nzVal, reverse(size(A)))
    sliced = transposed[Jmask, Imask]
    # Reinterpret the sliced CSR result back into CSC layout.
    return CuSparseMatrixCSC{Tv, Ti}(sliced.rowPtr, sliced.colVal, sliced.nzVal, reverse(size(sliced)))
end
610+
611+
524612
## interop with sparse CPU arrays
525613

526614
# cpu to gpu

lib/cusparse/test/interfaces.jl

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,4 +329,133 @@ nB = 2
329329
@test ref_cuda_sparse.colPtr == cuda_spdiagm.colPtr
330330
end
331331
end
332+
333+
# NOTE(review): the approximate-equality operator `≈` was dropped from several
# `@test` lines in the captured source (e.g. `@test S_cpu collect(S_csc)`),
# which is not valid Julia; it is restored below.
@testset "getindex with boolean masks" begin
    # `elty`, `m`, `n` come from the enclosing test scope — TODO confirm.
    A = sprand(elty, m, n, 0.4)
    rowmask = rand(Bool, m)
    colmask = rand(Bool, n)
    S_cpu = A[rowmask, colmask]

    rowmask_d = CuVector(rowmask)
    colmask_d = CuVector(colmask)

    # test slicing of CSC format
    A_csc = CuSparseMatrixCSC(A)
    S_csc = A_csc[rowmask_d, colmask_d]
    @test S_csc isa CuSparseMatrixCSC
    @test S_cpu ≈ collect(S_csc)

    # test slicing of CSR format
    # Conversion between CSC and CSR is broken in many ways on CUDA 12.0,
    # therefore we construct the CSR matrix manually from the transposed CSC.
    Aᵀ_csc = CuSparseMatrixCSC(Transpose(A))
    A_csr = CuSparseMatrixCSR{eltype(A), Int32}(
        copy(Aᵀ_csc.colPtr), # rowPtr is the same as colPtr of the transposed CSC
        copy(Aᵀ_csc.rowVal), # colVal is the same as rowVal of the transposed CSC
        copy(Aᵀ_csc.nzVal),  # nzVal is unchanged by transposition
        size(A)
    )
    # collect calls CSR→CSC conversion again which is broken, so we test on scalar level
    CUDA.@allowscalar for i in eachindex(A, A_csr)
        @test A[i] ≈ A_csr[i]
    end
    S_csr = A_csr[rowmask_d, colmask_d]
    @test S_csr isa CuSparseMatrixCSR
    CUDA.@allowscalar for i in eachindex(S_cpu, S_csr)
        @test S_cpu[i] ≈ S_csr[i]
    end

    # wrong mask size: throws BoundsError for both too-long and too-short, matching the behaviour of dense Array.
    @test_throws BoundsError A_csc[CuVector(trues(m + 1)), colmask_d]
    @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n + 1))]
    @test_throws BoundsError A_csc[CuVector(trues(m - 1)), colmask_d]
    @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n - 1))]
    @test_throws BoundsError A_csr[CuVector(trues(m + 1)), colmask_d]
    @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n + 1))]
    @test_throws BoundsError A_csr[CuVector(trues(m - 1)), colmask_d]
    @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n - 1))]

    # empty mask (all zeros): cumsum gives all-zero rowmap/colmap, new_m or new_n = 0,
    # both kernels are guarded by `new_m > 0 && new_n > 0`, so nothing executes.
    # new_rowPtr collapses to [1] (or all-ones), nnz = 0. Same as CPU SparseArrays.
    S_empty_rows_csc = A_csc[CuVector(falses(m)), CuVector(trues(n))]
    @test S_empty_rows_csc isa CuSparseMatrixCSC
    @test size(S_empty_rows_csc) == (0, n)
    @test nnz(S_empty_rows_csc) == 0

    S_empty_cols_csc = A_csc[CuVector(trues(m)), CuVector(falses(n))]
    @test S_empty_cols_csc isa CuSparseMatrixCSC
    @test size(S_empty_cols_csc) == (m, 0)
    @test nnz(S_empty_cols_csc) == 0

    S_empty_rows_csr = A_csr[CuVector(falses(m)), CuVector(trues(n))]
    @test S_empty_rows_csr isa CuSparseMatrixCSR
    @test size(S_empty_rows_csr) == (0, n)
    @test nnz(S_empty_rows_csr) == 0

    S_empty_cols_csr = A_csr[CuVector(trues(m)), CuVector(falses(n))]
    @test S_empty_cols_csr isa CuSparseMatrixCSR
    @test size(S_empty_cols_csr) == (m, 0)
    @test nnz(S_empty_cols_csr) == 0

    S_empty_both_csc = A_csc[CuVector(falses(m)), CuVector(falses(n))]
    @test S_empty_both_csc isa CuSparseMatrixCSC
    @test size(S_empty_both_csc) == (0, 0)
    @test nnz(S_empty_both_csc) == 0

    S_empty_both_csr = A_csr[CuVector(falses(m)), CuVector(falses(n))]
    @test S_empty_both_csr isa CuSparseMatrixCSR
    @test size(S_empty_both_csr) == (0, 0)
    @test nnz(S_empty_both_csr) == 0

    # all-ones mask: rowmap = 1:m, colmap = 1:n, both kernels run unfiltered.
    # Result should equal the full matrix. Same as CPU SparseArrays.
    S_all_csc = A_csc[CuVector(trues(m)), CuVector(trues(n))]
    @test S_all_csc isa CuSparseMatrixCSC
    @test collect(S_all_csc) ≈ Matrix(A)

    S_all_csr = A_csr[CuVector(trues(m)), CuVector(trues(n))]
    @test S_all_csr isa CuSparseMatrixCSR
    CUDA.@allowscalar for i in eachindex(A, S_all_csr)
        @test A[i] ≈ S_all_csr[i]
    end

    # zero-dimension matrix: accessing rowmap[end] / colmap[end] on an empty CuVector
    # would crash without the `m > 0` / `n > 0` guard in the implementation.
    A_zero_rows_csr = CuSparseMatrixCSR(spzeros(elty, 0, n))
    S_zr = A_zero_rows_csr[CuVector{Bool}([]), CuVector(trues(n))]
    @test S_zr isa CuSparseMatrixCSR
    @test size(S_zr) == (0, n)
    @test nnz(S_zr) == 0

    A_zero_cols_csr = CuSparseMatrixCSR(spzeros(elty, m, 0))
    S_zc = A_zero_cols_csr[CuVector(trues(m)), CuVector{Bool}([])]
    @test S_zc isa CuSparseMatrixCSR
    @test size(S_zc) == (m, 0)
    @test nnz(S_zc) == 0

    A_zero_both_csr = CuSparseMatrixCSR(spzeros(elty, 0, 0))
    S_zb = A_zero_both_csr[CuVector{Bool}([]), CuVector{Bool}([])]
    @test S_zb isa CuSparseMatrixCSR
    @test size(S_zb) == (0, 0)
    @test nnz(S_zb) == 0

    A_zero_rows_csc = CuSparseMatrixCSC(spzeros(elty, 0, n))
    S_zr_csc = A_zero_rows_csc[CuVector{Bool}([]), CuVector(trues(n))]
    @test S_zr_csc isa CuSparseMatrixCSC
    @test size(S_zr_csc) == (0, n)
    @test nnz(S_zr_csc) == 0

    A_zero_cols_csc = CuSparseMatrixCSC(spzeros(elty, m, 0))
    S_zc_csc = A_zero_cols_csc[CuVector(trues(m)), CuVector{Bool}([])]
    @test S_zc_csc isa CuSparseMatrixCSC
    @test size(S_zc_csc) == (m, 0)
    @test nnz(S_zc_csc) == 0

    A_zero_both_csc = CuSparseMatrixCSC(spzeros(elty, 0, 0))
    S_zb_csc = A_zero_both_csc[CuVector{Bool}([]), CuVector{Bool}([])]
    @test S_zb_csc isa CuSparseMatrixCSC
    @test size(S_zb_csc) == (0, 0)
    @test nnz(S_zb_csc) == 0
end
332461
end

perf/volumerhs.jl

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -37,28 +37,6 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
3737
end
3838
end
3939

40-
let (jlf, f) = (:div_arcp, :div)
41-
for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
42-
ir = """
43-
%x = f$f fast $llvmT %0, %1
44-
ret $llvmT %x
45-
"""
46-
@eval begin
47-
# the @pure is necessary so that we can constant propagate.
48-
@inline Base.@pure function $jlf(a::$T, b::$T)
49-
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
50-
end
51-
end
52-
end
53-
@eval function $jlf(args...)
54-
Base.$jlf(args...)
55-
end
56-
end
57-
rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call
58-
59-
# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
60-
# rcp(x) = div_fast(one(x), x)
61-
6240
# note the order of the fields below is also assumed in the code.
6341
const _nstate = 5
6442
const _ρ, _U, _V, _W, _E = 1:_nstate
@@ -130,8 +108,8 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
130108
# GPU performance trick
131109
# Allow optimizations to use the reciprocal of an argument rather than perform division.
132110
# IEEE floating-point division is implemented as a function call
133-
ρinv = rcp(ρ)
134-
ρ2inv = rcp(2ρ)
111+
ρinv = inv(ρ)
112+
ρ2inv = inv(2ρ)
135113
# ρ2inv = 0.5f0 * pinv
136114

137115
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)

0 commit comments

Comments
 (0)