Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions lib/cusparse/src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,94 @@ function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, i0::Integer, i1::Integer,
CuSparseMatrixCSR(A.rowPtr[:,idxs...], A.colVal[:,idxs...], nonzeros(A)[:,idxs...], size(A)[1:2])[i0, i1]
end

# slice matrix by masking rows and columns

function Base.getindex(A::CuSparseMatrixCSR{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
@boundscheck checkbounds(A, Imask, Jmask)

m, n = size(A)
rowmap = cumsum(Ti.(Imask))
colmap = cumsum(Ti.(Jmask))
new_m = m > 0 ? Int(CUDACore.@allowscalar rowmap[end]) : 0
new_n = n > 0 ? Int(CUDACore.@allowscalar colmap[end]) : 0

# pass 1: count kept entries per new row
counts = CUDACore.zeros(Ti, new_m)
if new_m > 0 && new_n > 0
threads = min(256, m)
blocks = cld(m, threads)
@cuda threads = threads blocks = blocks _csr_count_kernel!(counts, A.rowPtr, A.colVal, Imask, Jmask, rowmap)
end

# build new rowPtr from counts: [1, 1+cumsum(counts)...]
new_rowPtr = vcat(CuVector{Ti}([one(Ti)]), cumsum(counts) .+ one(Ti))
new_nnz = Int(CUDACore.@allowscalar new_rowPtr[end]) - 1

# pass 2: fill entries
new_colVal = CuVector{Ti}(undef, new_nnz)
new_nzVal = CuVector{Tv}(undef, new_nnz)
if new_nnz > 0
threads = min(256, m)
blocks = cld(m, threads)
@cuda threads = threads blocks = blocks _csr_fill_kernel!(
new_colVal, new_nzVal, new_rowPtr, A.rowPtr, A.colVal, A.nzVal,
Imask, Jmask, rowmap, colmap
)
end

return CuSparseMatrixCSR{Tv, Ti}(new_rowPtr, new_colVal, new_nzVal, (new_m, new_n))
end

# CSR: one thread per original row — count entries where column is selected
function _csr_count_kernel!(counts, rowPtr, colVal, Imask, Jmask, rowmap)
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
i > length(Imask) && return nothing
@inbounds begin
Imask[i] || return nothing
new_i = rowmap[i]
c = zero(eltype(counts))
for j in rowPtr[i]:(rowPtr[i + 1] - one(eltype(rowPtr)))
if Jmask[colVal[j]]
c += one(eltype(counts))
end
end
counts[new_i] = c
end
return nothing
end

# CSR: one thread per original row — fill entries with remapped column indices
function _csr_fill_kernel!(
new_colVal, new_nzVal, new_rowPtr, rowPtr, colVal, nzVal,
Imask, Jmask, rowmap, colmap
)
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
i > length(Imask) && return nothing
@inbounds begin
Imask[i] || return nothing
offset = new_rowPtr[rowmap[i]]
for j in rowPtr[i]:(rowPtr[i + 1] - one(eltype(rowPtr)))
col = colVal[j]
if Jmask[col]
new_colVal[offset] = colmap[col]
new_nzVal[offset] = nzVal[j]
offset += one(eltype(new_rowPtr))
end
end
end
return nothing
end

# CSC: reinterpret as transposed CSR, index with swapped masks, reinterpret back.
# A CSC (colPtr, rowVal, nzVal, (m,n)) is the same layout as CSR (rowPtr, colVal, nzVal, (n,m)).
function Base.getindex(A::CuSparseMatrixCSC{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
@boundscheck checkbounds(A, Imask, Jmask)
A_as_csr = CuSparseMatrixCSR{Tv, Ti}(A.colPtr, A.rowVal, A.nzVal, reverse(size(A)))
result_csr = A_as_csr[Jmask, Imask]
return CuSparseMatrixCSC{Tv, Ti}(result_csr.rowPtr, result_csr.colVal, result_csr.nzVal, reverse(size(result_csr)))
end


## interop with sparse CPU arrays

# cpu to gpu
Expand Down
129 changes: 129 additions & 0 deletions lib/cusparse/test/interfaces.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,4 +329,133 @@ nB = 2
@test ref_cuda_sparse.colPtr == cuda_spdiagm.colPtr
end
end

@testset "getindex with boolean masks" begin
A = sprand(elty, m, n, 0.4)
rowmask = rand(Bool, m)
colmask = rand(Bool, n)
S_cpu = A[rowmask, colmask]

rowmask_d = CuVector(rowmask)
colmask_d = CuVector(colmask)

# test slicing of CSC format
A_csc = CuSparseMatrixCSC(A)
S_csc = A_csc[rowmask_d, colmask_d]
@test S_csc isa CuSparseMatrixCSC
@test S_cpu ≈ collect(S_csc)

# test slicing of CSR format
# Conversion between CSC and CSR is broken in many ways on CUDA 12.0,
# therefore we construct the CSR matrix manually from the transposed CSC.
Aᵀ_csc = CuSparseMatrixCSC(Transpose(A))
A_csr = CuSparseMatrixCSR{eltype(A), Int32}(
copy(Aᵀ_csc.colPtr), # rowPtr is the same as colPtr of the transposed CSC
copy(Aᵀ_csc.rowVal), # colVal is the same as rowVal of the transposed CSC
copy(Aᵀ_csc.nzVal), # nzVal is unchanged by transposition
size(A)
)
# collect calls CSR→CSC conversion again which is broken, so we test on scalar level
CUDA.@allowscalar for i in eachindex(A, A_csr)
@test A[i] ≈ A_csr[i]
end
S_csr = A_csr[rowmask_d, colmask_d]
@test S_csr isa CuSparseMatrixCSR
CUDA.@allowscalar for i in eachindex(S_cpu, S_csr)
@test S_cpu[i] ≈ S_csr[i]
end

# wrong mask size: throws BoundsError for both too-long and too-short, matching the behaviour of dense Array.
@test_throws BoundsError A_csc[CuVector(trues(m + 1)), colmask_d]
@test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n + 1))]
@test_throws BoundsError A_csc[CuVector(trues(m - 1)), colmask_d]
@test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n - 1))]
@test_throws BoundsError A_csr[CuVector(trues(m + 1)), colmask_d]
@test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n + 1))]
@test_throws BoundsError A_csr[CuVector(trues(m - 1)), colmask_d]
@test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n - 1))]

# empty mask (all zeros): cumsum gives all-zero rowmap/colmap, new_m or new_n = 0,
# both kernels are guarded by `new_m > 0 && new_n > 0`, so nothing executes.
# new_rowPtr collapses to [1] (or all-ones), nnz = 0. Same as CPU SparseArrays.
S_empty_rows_csc = A_csc[CuVector(falses(m)), CuVector(trues(n))]
@test S_empty_rows_csc isa CuSparseMatrixCSC
@test size(S_empty_rows_csc) == (0, n)
@test nnz(S_empty_rows_csc) == 0

S_empty_cols_csc = A_csc[CuVector(trues(m)), CuVector(falses(n))]
@test S_empty_cols_csc isa CuSparseMatrixCSC
@test size(S_empty_cols_csc) == (m, 0)
@test nnz(S_empty_cols_csc) == 0

S_empty_rows_csr = A_csr[CuVector(falses(m)), CuVector(trues(n))]
@test S_empty_rows_csr isa CuSparseMatrixCSR
@test size(S_empty_rows_csr) == (0, n)
@test nnz(S_empty_rows_csr) == 0

S_empty_cols_csr = A_csr[CuVector(trues(m)), CuVector(falses(n))]
@test S_empty_cols_csr isa CuSparseMatrixCSR
@test size(S_empty_cols_csr) == (m, 0)
@test nnz(S_empty_cols_csr) == 0

S_empty_both_csc = A_csc[CuVector(falses(m)), CuVector(falses(n))]
@test S_empty_both_csc isa CuSparseMatrixCSC
@test size(S_empty_both_csc) == (0, 0)
@test nnz(S_empty_both_csc) == 0

S_empty_both_csr = A_csr[CuVector(falses(m)), CuVector(falses(n))]
@test S_empty_both_csr isa CuSparseMatrixCSR
@test size(S_empty_both_csr) == (0, 0)
@test nnz(S_empty_both_csr) == 0

# all-ones mask: rowmap = 1:m, colmap = 1:n, both kernels run unfiltered.
# Result should equal the full matrix. Same as CPU SparseArrays.
S_all_csc = A_csc[CuVector(trues(m)), CuVector(trues(n))]
@test S_all_csc isa CuSparseMatrixCSC
@test collect(S_all_csc) ≈ Matrix(A)

S_all_csr = A_csr[CuVector(trues(m)), CuVector(trues(n))]
@test S_all_csr isa CuSparseMatrixCSR
CUDA.@allowscalar for i in eachindex(A, S_all_csr)
@test A[i] ≈ S_all_csr[i]
end

# zero-dimension matrix: accessing rowmap[end] / colmap[end] on an empty CuVector
# would crash without the `m > 0` / `n > 0` guard in the implementation.
A_zero_rows_csr = CuSparseMatrixCSR(spzeros(elty, 0, n))
S_zr = A_zero_rows_csr[CuVector{Bool}([]), CuVector(trues(n))]
@test S_zr isa CuSparseMatrixCSR
@test size(S_zr) == (0, n)
@test nnz(S_zr) == 0

A_zero_cols_csr = CuSparseMatrixCSR(spzeros(elty, m, 0))
S_zc = A_zero_cols_csr[CuVector(trues(m)), CuVector{Bool}([])]
@test S_zc isa CuSparseMatrixCSR
@test size(S_zc) == (m, 0)
@test nnz(S_zc) == 0

A_zero_both_csr = CuSparseMatrixCSR(spzeros(elty, 0, 0))
S_zb = A_zero_both_csr[CuVector{Bool}([]), CuVector{Bool}([])]
@test S_zb isa CuSparseMatrixCSR
@test size(S_zb) == (0, 0)
@test nnz(S_zb) == 0

A_zero_rows_csc = CuSparseMatrixCSC(spzeros(elty, 0, n))
S_zr_csc = A_zero_rows_csc[CuVector{Bool}([]), CuVector(trues(n))]
@test S_zr_csc isa CuSparseMatrixCSC
@test size(S_zr_csc) == (0, n)
@test nnz(S_zr_csc) == 0

A_zero_cols_csc = CuSparseMatrixCSC(spzeros(elty, m, 0))
S_zc_csc = A_zero_cols_csc[CuVector(trues(m)), CuVector{Bool}([])]
@test S_zc_csc isa CuSparseMatrixCSC
@test size(S_zc_csc) == (m, 0)
@test nnz(S_zc_csc) == 0

A_zero_both_csc = CuSparseMatrixCSC(spzeros(elty, 0, 0))
S_zb_csc = A_zero_both_csc[CuVector{Bool}([]), CuVector{Bool}([])]
@test S_zb_csc isa CuSparseMatrixCSC
@test size(S_zb_csc) == (0, 0)
@test nnz(S_zb_csc) == 0
end
end