JuliaGPU · maleadt · Apr 14, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/lib/cusparse/src/array.jl b/lib/cusparse/src/array.jl
@@ -521,6 +521,94 @@ function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, i0::Integer, i1::Integer,
     CuSparseMatrixCSR(A.rowPtr[:,idxs...], A.colVal[:,idxs...], nonzeros(A)[:,idxs...], size(A)[1:2])[i0, i1]
 end
 
+# slice matrix by masking rows and columns
+
+function Base.getindex(A::CuSparseMatrixCSR{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
+    @boundscheck checkbounds(A, Imask, Jmask)
+
+    m, n = size(A)
+    rowmap = cumsum(Ti.(Imask))
+    colmap = cumsum(Ti.(Jmask))
+    new_m = m > 0 ? Int(CUDACore.@allowscalar rowmap[end]) : 0
+    new_n = n > 0 ? Int(CUDACore.@allowscalar colmap[end]) : 0
+
+    # pass 1: count kept entries per new row
+    counts = CUDACore.zeros(Ti, new_m)
+    if new_m > 0 && new_n > 0
+        threads = min(256, m)
+        blocks = cld(m, threads)
+        @cuda threads = threads blocks = blocks _csr_count_kernel!(counts, A.rowPtr, A.colVal, Imask, Jmask, rowmap)
+    end
+
+    # build new rowPtr from counts: [1, 1+cumsum(counts)...]
+    new_rowPtr = vcat(CuVector{Ti}([one(Ti)]), cumsum(counts) .+ one(Ti))
+    new_nnz = Int(CUDACore.@allowscalar new_rowPtr[end]) - 1
+
+    # pass 2: fill entries
+    new_colVal = CuVector{Ti}(undef, new_nnz)
+    new_nzVal = CuVector{Tv}(undef, new_nnz)
+    if new_nnz > 0
+        threads = min(256, m)
+        blocks = cld(m, threads)
+        @cuda threads = threads blocks = blocks _csr_fill_kernel!(
+            new_colVal, new_nzVal, new_rowPtr, A.rowPtr, A.colVal, A.nzVal,
+            Imask, Jmask, rowmap, colmap
+        )
+    end
+
+    return CuSparseMatrixCSR{Tv, Ti}(new_rowPtr, new_colVal, new_nzVal, (new_m, new_n))
+end
+
+# CSR: one thread per original row — count entries where column is selected
+function _csr_count_kernel!(counts, rowPtr, colVal, Imask, Jmask, rowmap)
+    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i > length(Imask) && return nothing
+    @inbounds begin
+        Imask[i] || return nothing
+        new_i = rowmap[i]
+        c = zero(eltype(counts))
+        for j in rowPtr[i]:(rowPtr[i + 1] - one(eltype(rowPtr)))
+            if Jmask[colVal[j]]
+                c += one(eltype(counts))
+            end
+        end
+        counts[new_i] = c
+    end
+    return nothing
+end
+
+# CSR: one thread per original row — fill entries with remapped column indices
+function _csr_fill_kernel!(
+        new_colVal, new_nzVal, new_rowPtr, rowPtr, colVal, nzVal,
+        Imask, Jmask, rowmap, colmap
+    )
+    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i > length(Imask) && return nothing
+    @inbounds begin
+        Imask[i] || return nothing
+        offset = new_rowPtr[rowmap[i]]
+        for j in rowPtr[i]:(rowPtr[i + 1] - one(eltype(rowPtr)))
+            col = colVal[j]
+            if Jmask[col]
+                new_colVal[offset] = colmap[col]
+                new_nzVal[offset] = nzVal[j]
+                offset += one(eltype(new_rowPtr))
+            end
+        end
+    end
+    return nothing
+end
+
+# CSC: reinterpret as transposed CSR, index with swapped masks, reinterpret back.
+# A CSC (colPtr, rowVal, nzVal, (m,n)) is the same layout as CSR (rowPtr, colVal, nzVal, (n,m)).
+function Base.getindex(A::CuSparseMatrixCSC{Tv, Ti}, Imask::CuVector{Bool}, Jmask::CuVector{Bool}) where {Tv, Ti}
+    @boundscheck checkbounds(A, Imask, Jmask)
+    A_as_csr = CuSparseMatrixCSR{Tv, Ti}(A.colPtr, A.rowVal, A.nzVal, reverse(size(A)))
+    result_csr = A_as_csr[Jmask, Imask]
+    return CuSparseMatrixCSC{Tv, Ti}(result_csr.rowPtr, result_csr.colVal, result_csr.nzVal, reverse(size(result_csr)))
+end
+
+
 ## interop with sparse CPU arrays
 
 # cpu to gpu

diff --git a/lib/cusparse/test/interfaces.jl b/lib/cusparse/test/interfaces.jl
@@ -329,4 +329,133 @@ nB = 2
             @test ref_cuda_sparse.colPtr == cuda_spdiagm.colPtr
         end
     end
+
+    @testset "getindex with boolean masks" begin
+        A = sprand(elty, m, n, 0.4)
+        rowmask = rand(Bool, m)
+        colmask = rand(Bool, n)
+        S_cpu = A[rowmask, colmask]
+
+        rowmask_d = CuVector(rowmask)
+        colmask_d = CuVector(colmask)
+
+        # test slicing of CSC format
+        A_csc = CuSparseMatrixCSC(A)
+        S_csc = A_csc[rowmask_d, colmask_d]
+        @test S_csc isa CuSparseMatrixCSC
+        @test S_cpu ≈ collect(S_csc)
+
+        # test slicing of CSR format
+        # Conversion between CSC and CSR is broken in many ways on CUDA 12.0,
+        # therefore we construct the CSR matrix manually from the transposed CSC.
+        Aᵀ_csc = CuSparseMatrixCSC(Transpose(A))
+        A_csr = CuSparseMatrixCSR{eltype(A), Int32}(
+            copy(Aᵀ_csc.colPtr), # rowPtr is the same as colPtr of the transposed CSC
+            copy(Aᵀ_csc.rowVal), # colVal is the same as rowVal of the transposed CSC
+            copy(Aᵀ_csc.nzVal),  # nzVal is unchanged by transposition
+            size(A)
+        )
+        # collect calls CSR→CSC conversion again which is broken, so we test on scalar level
+        CUDA.@allowscalar for i in eachindex(A, A_csr)
+            @test A[i] ≈ A_csr[i]
+        end
+        S_csr = A_csr[rowmask_d, colmask_d]
+        @test S_csr isa CuSparseMatrixCSR
+        CUDA.@allowscalar for i in eachindex(S_cpu, S_csr)
+            @test S_cpu[i] ≈ S_csr[i]
+        end
+
+        # wrong mask size: throws BoundsError for both too-long and too-short, matching the behaviour of dense Array.
+        @test_throws BoundsError A_csc[CuVector(trues(m + 1)), colmask_d]
+        @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n + 1))]
+        @test_throws BoundsError A_csc[CuVector(trues(m - 1)), colmask_d]
+        @test_throws BoundsError A_csc[rowmask_d, CuVector(trues(n - 1))]
+        @test_throws BoundsError A_csr[CuVector(trues(m + 1)), colmask_d]
+        @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n + 1))]
+        @test_throws BoundsError A_csr[CuVector(trues(m - 1)), colmask_d]
+        @test_throws BoundsError A_csr[rowmask_d, CuVector(trues(n - 1))]
+
+        # empty mask (all zeros): cumsum gives all-zero rowmap/colmap, new_m or new_n = 0,
+        # both kernels are guarded by `new_m > 0 && new_n > 0`, so nothing executes.
+        # new_rowPtr collapses to [1] (or all-ones), nnz = 0. Same as CPU SparseArrays.
+        S_empty_rows_csc = A_csc[CuVector(falses(m)), CuVector(trues(n))]
+        @test S_empty_rows_csc isa CuSparseMatrixCSC
+        @test size(S_empty_rows_csc) == (0, n)
+        @test nnz(S_empty_rows_csc) == 0
+
+        S_empty_cols_csc = A_csc[CuVector(trues(m)), CuVector(falses(n))]
+        @test S_empty_cols_csc isa CuSparseMatrixCSC
+        @test size(S_empty_cols_csc) == (m, 0)
+        @test nnz(S_empty_cols_csc) == 0
+
+        S_empty_rows_csr = A_csr[CuVector(falses(m)), CuVector(trues(n))]
+        @test S_empty_rows_csr isa CuSparseMatrixCSR
+        @test size(S_empty_rows_csr) == (0, n)
+        @test nnz(S_empty_rows_csr) == 0
+
+        S_empty_cols_csr = A_csr[CuVector(trues(m)), CuVector(falses(n))]
+        @test S_empty_cols_csr isa CuSparseMatrixCSR
+        @test size(S_empty_cols_csr) == (m, 0)
+        @test nnz(S_empty_cols_csr) == 0
+
+        S_empty_both_csc = A_csc[CuVector(falses(m)), CuVector(falses(n))]
+        @test S_empty_both_csc isa CuSparseMatrixCSC
+        @test size(S_empty_both_csc) == (0, 0)
+        @test nnz(S_empty_both_csc) == 0
+
+        S_empty_both_csr = A_csr[CuVector(falses(m)), CuVector(falses(n))]
+        @test S_empty_both_csr isa CuSparseMatrixCSR
+        @test size(S_empty_both_csr) == (0, 0)
+        @test nnz(S_empty_both_csr) == 0
+
+        # all-ones mask: rowmap = 1:m, colmap = 1:n, both kernels run unfiltered.
+        # Result should equal the full matrix. Same as CPU SparseArrays.
+        S_all_csc = A_csc[CuVector(trues(m)), CuVector(trues(n))]
+        @test S_all_csc isa CuSparseMatrixCSC
+        @test collect(S_all_csc) ≈ Matrix(A)
+
+        S_all_csr = A_csr[CuVector(trues(m)), CuVector(trues(n))]
+        @test S_all_csr isa CuSparseMatrixCSR
+        CUDA.@allowscalar for i in eachindex(A, S_all_csr)
+            @test A[i] ≈ S_all_csr[i]
+        end
+
+        # zero-dimension matrix: accessing rowmap[end] / colmap[end] on an empty CuVector
+        # would crash without the `m > 0` / `n > 0` guard in the implementation.
+        A_zero_rows_csr = CuSparseMatrixCSR(spzeros(elty, 0, n))
+        S_zr = A_zero_rows_csr[CuVector{Bool}([]), CuVector(trues(n))]
+        @test S_zr isa CuSparseMatrixCSR
+        @test size(S_zr) == (0, n)
+        @test nnz(S_zr) == 0
+
+        A_zero_cols_csr = CuSparseMatrixCSR(spzeros(elty, m, 0))
+        S_zc = A_zero_cols_csr[CuVector(trues(m)), CuVector{Bool}([])]
+        @test S_zc isa CuSparseMatrixCSR
+        @test size(S_zc) == (m, 0)
+        @test nnz(S_zc) == 0
+
+        A_zero_both_csr = CuSparseMatrixCSR(spzeros(elty, 0, 0))
+        S_zb = A_zero_both_csr[CuVector{Bool}([]), CuVector{Bool}([])]
+        @test S_zb isa CuSparseMatrixCSR
+        @test size(S_zb) == (0, 0)
+        @test nnz(S_zb) == 0
+
+        A_zero_rows_csc = CuSparseMatrixCSC(spzeros(elty, 0, n))
+        S_zr_csc = A_zero_rows_csc[CuVector{Bool}([]), CuVector(trues(n))]
+        @test S_zr_csc isa CuSparseMatrixCSC
+        @test size(S_zr_csc) == (0, n)
+        @test nnz(S_zr_csc) == 0
+
+        A_zero_cols_csc = CuSparseMatrixCSC(spzeros(elty, m, 0))
+        S_zc_csc = A_zero_cols_csc[CuVector(trues(m)), CuVector{Bool}([])]
+        @test S_zc_csc isa CuSparseMatrixCSC
+        @test size(S_zc_csc) == (m, 0)
+        @test nnz(S_zc_csc) == 0
+
+        A_zero_both_csc = CuSparseMatrixCSC(spzeros(elty, 0, 0))
+        S_zb_csc = A_zero_both_csc[CuVector{Bool}([]), CuVector{Bool}([])]
+        @test S_zb_csc isa CuSparseMatrixCSC
+        @test size(S_zb_csc) == (0, 0)
+        @test nnz(S_zb_csc) == 0
+    end
 end