Skip to content

Commit af6961a

Browse files
authored
Merge pull request JuliaGPU#3101 from JuliaGPU/tb/coo
CUSPARSE: fix COO scalar getindex crossing row boundaries (JuliaGPU#3100)
2 parents 877721d + c33c5af commit af6961a

3 files changed

Lines changed: 186 additions & 67 deletions

File tree

lib/cusparse/src/array.jl

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -462,39 +462,57 @@ Base.getindex(A::CuSparseMatrixCSR, ::Colon, j::Integer) = CuSparseVector(sparse
462462

463463
function Base.getindex(A::CuSparseVector{Tv, Ti}, i::Integer) where {Tv, Ti}
464464
@boundscheck checkbounds(A, i)
465-
ii = searchsortedfirst(A.iPtr, convert(Ti, i))
466-
(ii > nnz(A) || A.iPtr[ii] != i) && return zero(Tv)
467-
A.nzVal[ii]
465+
result = zero(Tv)
466+
for k in 1:nnz(A)
467+
A.iPtr[k] == i && (result = sum_duplicate(result, A.nzVal[k]))
468+
end
469+
return result
468470
end
469471

472+
# Scalar getindex methods linear-scan the minor axis rather than binary-searching
473+
# and sum across matching entries. cuSPARSE formats don't guarantee sorted indices
474+
# within a major-axis slice (e.g. SpGEMM output may leave CSR columns unsorted
475+
# within a row, and COO is only guaranteed row-sorted), nor uniqueness — duplicate
476+
# (i, j) entries are permitted and their values sum, matching the convention of
477+
# Julia's `sparse()` constructor and SciPy/CuPy. For Bool we OR instead of sum,
478+
# also matching `sparse()`, since Bool + Bool doesn't stay Bool.
479+
sum_duplicate(a, b) = a + b
480+
sum_duplicate(a::Bool, b::Bool) = a | b
481+
470482
function Base.getindex(A::CuSparseMatrixCSC{T}, i0::Integer, i1::Integer) where T
471483
@boundscheck checkbounds(A, i0, i1)
472484
r1 = Int(A.colPtr[i1])
473485
r2 = Int(A.colPtr[i1+1]-1)
474-
(r1 > r2) && return zero(T)
475-
r1 = searchsortedfirst(rowvals(A), i0, r1, r2, Base.Order.Forward)
476-
(r1 > r2 || rowvals(A)[r1] != i0) && return zero(T)
477-
nonzeros(A)[r1]
486+
result = zero(T)
487+
for k in r1:r2
488+
rowvals(A)[k] == i0 && (result = sum_duplicate(result, nonzeros(A)[k]))
489+
end
490+
return result
478491
end
479492

480493
function Base.getindex(A::CuSparseMatrixCSR{T}, i0::Integer, i1::Integer) where T
481494
@boundscheck checkbounds(A, i0, i1)
482495
c1 = Int(A.rowPtr[i0])
483496
c2 = Int(A.rowPtr[i0+1]-1)
484-
(c1 > c2) && return zero(T)
485-
c1 = searchsortedfirst(A.colVal, i1, c1, c2, Base.Order.Forward)
486-
(c1 > c2 || A.colVal[c1] != i1) && return zero(T)
487-
nonzeros(A)[c1]
497+
result = zero(T)
498+
for k in c1:c2
499+
A.colVal[k] == i1 && (result = sum_duplicate(result, nonzeros(A)[k]))
500+
end
501+
return result
488502
end
489503

490504
function Base.getindex(A::CuSparseMatrixCOO{T}, i0::Integer, i1::Integer) where T
491505
@boundscheck checkbounds(A, i0, i1)
506+
# cuSPARSE only guarantees COO is sorted by row, so binary-search the row
507+
# range but linear-scan for the column.
492508
r1 = searchsortedfirst(A.rowInd, i0, Base.Order.Forward)
493509
(r1 > length(A.rowInd) || A.rowInd[r1] > i0) && return zero(T)
494-
r2 = min(searchsortedfirst(A.rowInd, i0+1, Base.Order.Forward), length(A.rowInd))
495-
c1 = searchsortedfirst(A.colInd, i1, r1, r2, Base.Order.Forward)
496-
(c1 > r2 || c1 == length(A.colInd) + 1 || A.colInd[c1] > i1) && return zero(T)
497-
nonzeros(A)[c1]
510+
r2 = searchsortedlast(A.rowInd, i0, Base.Order.Forward)
511+
result = zero(T)
512+
for k in r1:r2
513+
A.colInd[k] == i1 && (result = sum_duplicate(result, nonzeros(A)[k]))
514+
end
515+
return result
498516
end
499517

500518
function Base.getindex(A::CuSparseMatrixBSR{T}, i0::Integer, i1::Integer) where T
@@ -504,10 +522,11 @@ function Base.getindex(A::CuSparseMatrixBSR{T}, i0::Integer, i1::Integer) where
504522
block_idx = (i0_idx - 1) * A.blockDim + i1_idx - 1
505523
c1 = Int(A.rowPtr[i0_block])
506524
c2 = Int(A.rowPtr[i0_block+1]-1)
507-
(c1 > c2) && return zero(T)
508-
c1 = searchsortedfirst(A.colVal, i1_block, c1, c2, Base.Order.Forward)
509-
(c1 > c2 || A.colVal[c1] != i1_block) && return zero(T)
510-
nonzeros(A)[c1+block_idx]
525+
result = zero(T)
526+
for k in c1:c2
527+
A.colVal[k] == i1_block && (result = sum_duplicate(result, nonzeros(A)[k+block_idx]))
528+
end
529+
return result
511530
end
512531

513532
# matrix slices

lib/cusparse/src/conversions.jl

Lines changed: 43 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,41 @@ for SparseMatrixType in (:CuSparseMatrixCSC, :CuSparseMatrixCSR, :CuSparseMatrix
327327
end
328328
end
329329

330+
# Wrapper around `cusparseCsr2cscEx2` that works around a cuSPARSE 12.0 bug:
331+
# invoking the routine with one-based indexing silently corrupts `cscVal` for
332+
# matrices with certain shapes (e.g. even `m`). On affected versions we shift
333+
# the index arrays to zero-based around the call; zero-based indexing returns
334+
# correct results.
335+
function _csr2cscEx2!(m, n, nnz_, csrVal::CuVector{T}, csrRowPtr, csrColInd,
336+
cscVal::CuVector{T}, cscColPtr, cscRowInd,
337+
action, index, algo) where T
338+
buggy = index == 'O' && v"12.0" <= version() < v"12.1"
339+
if buggy
340+
csrRowPtr = csrRowPtr .- one(Cint)
341+
csrColInd = csrColInd .- one(Cint)
342+
effidx = 'Z'
343+
else
344+
effidx = index
345+
end
346+
function bufferSize()
347+
out = Ref{Csize_t}(1)
348+
cusparseCsr2cscEx2_bufferSize(handle(), m, n, nnz_, csrVal,
349+
csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
350+
T, action, effidx, algo, out)
351+
return out[]
352+
end
353+
with_workspace(bufferSize) do buffer
354+
cusparseCsr2cscEx2(handle(), m, n, nnz_, csrVal,
355+
csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
356+
T, action, effidx, algo, buffer)
357+
end
358+
if buggy
359+
cscColPtr .+= one(Cint)
360+
cscRowInd .+= one(Cint)
361+
end
362+
return
363+
end
364+
330365
# by flipping rows and columns, we can use that to get CSC to CSR too
331366
for elty in (:Float32, :Float64, :ComplexF32, :ComplexF64)
332367
@eval begin
@@ -335,18 +370,8 @@ for elty in (:Float32, :Float64, :ComplexF32, :ComplexF64)
335370
colPtr = CUDACore.zeros(Cint, n+1)
336371
rowVal = CUDACore.zeros(Cint, nnz(csr))
337372
nzVal = CUDACore.zeros($elty, nnz(csr))
338-
function bufferSize()
339-
out = Ref{Csize_t}(1)
340-
cusparseCsr2cscEx2_bufferSize(handle(), m, n, nnz(csr), nonzeros(csr),
341-
csr.rowPtr, csr.colVal, nzVal, colPtr, rowVal,
342-
$elty, action, index, algo, out)
343-
return out[]
344-
end
345-
with_workspace(bufferSize) do buffer
346-
cusparseCsr2cscEx2(handle(), m, n, nnz(csr), nonzeros(csr),
347-
csr.rowPtr, csr.colVal, nzVal, colPtr, rowVal,
348-
$elty, action, index, algo, buffer)
349-
end
373+
_csr2cscEx2!(m, n, nnz(csr), nonzeros(csr), csr.rowPtr, csr.colVal,
374+
nzVal, colPtr, rowVal, action, index, algo)
350375
CuSparseMatrixCSC(colPtr,rowVal,nzVal,size(csr))
351376
end
352377
CuSparseMatrixCSC{$elty}(csr::CuSparseMatrixCSR{$elty, Ti}; index::SparseChar='O', action::cusparseAction_t=CUSPARSE_ACTION_NUMERIC, algo::cusparseCsr2CscAlg_t=CUSPARSE_CSR2CSC_ALG1) where {Ti} =
@@ -358,18 +383,8 @@ for elty in (:Float32, :Float64, :ComplexF32, :ComplexF64)
358383
rowPtr = CUDACore.zeros(Cint,m+1)
359384
colVal = CUDACore.zeros(Cint,nnz(csc))
360385
nzVal = CUDACore.zeros($elty,nnz(csc))
361-
function bufferSize()
362-
out = Ref{Csize_t}(1)
363-
cusparseCsr2cscEx2_bufferSize(handle(), n, m, nnz(csc), nonzeros(csc),
364-
csc.colPtr, rowvals(csc), nzVal, rowPtr, colVal,
365-
$elty, action, index, algo, out)
366-
return out[]
367-
end
368-
with_workspace(bufferSize) do buffer
369-
cusparseCsr2cscEx2(handle(), n, m, nnz(csc), nonzeros(csc),
370-
csc.colPtr, rowvals(csc), nzVal, rowPtr, colVal,
371-
$elty, action, index, algo, buffer)
372-
end
386+
_csr2cscEx2!(n, m, nnz(csc), nonzeros(csc), csc.colPtr, rowvals(csc),
387+
nzVal, rowPtr, colVal, action, index, algo)
373388
CuSparseMatrixCSR(rowPtr,colVal,nzVal,size(csc))
374389
end
375390
CuSparseMatrixCSR(csc::CuSparseMatrixCSC{$elty, Ti}; index::SparseChar='O', action::cusparseAction_t=CUSPARSE_ACTION_NUMERIC, algo::cusparseCsr2CscAlg_t=CUSPARSE_CSR2CSC_ALG1) where {Ti} =
@@ -390,18 +405,8 @@ for (elty, welty) in ((:Float16, :Float32),
390405
rowVal = CUDACore.zeros(Cint, nnz(csr))
391406
nzVal = CUDACore.zeros($elty, nnz(csr))
392407
if $elty == Float16 #broken for ComplexF16?
393-
function bufferSize()
394-
out = Ref{Csize_t}(1)
395-
cusparseCsr2cscEx2_bufferSize(handle(), m, n, nnz(csr), nonzeros(csr),
396-
csr.rowPtr, csr.colVal, nzVal, colPtr, rowVal,
397-
$elty, action, index, algo, out)
398-
return out[]
399-
end
400-
with_workspace(bufferSize) do buffer
401-
cusparseCsr2cscEx2(handle(), m, n, nnz(csr), nonzeros(csr),
402-
csr.rowPtr, csr.colVal, nzVal, colPtr, rowVal,
403-
$elty, action, index, algo, buffer)
404-
end
408+
_csr2cscEx2!(m, n, nnz(csr), nonzeros(csr), csr.rowPtr, csr.colVal,
409+
nzVal, colPtr, rowVal, action, index, algo)
405410
return CuSparseMatrixCSC(colPtr,rowVal,nzVal,size(csr))
406411
else
407412
wide_csr = CuSparseMatrixCSR(csr.rowPtr, csr.colVal, convert(CuVector{$welty}, nonzeros(csr)), size(csr))
@@ -419,18 +424,8 @@ for (elty, welty) in ((:Float16, :Float32),
419424
colVal = CUDACore.zeros(Cint,nnz(csc))
420425
nzVal = CUDACore.zeros($elty,nnz(csc))
421426
if $elty == Float16 #broken for ComplexF16?
422-
function bufferSize()
423-
out = Ref{Csize_t}(1)
424-
cusparseCsr2cscEx2_bufferSize(handle(), n, m, nnz(csc), nonzeros(csc),
425-
csc.colPtr, rowvals(csc), nzVal, rowPtr, colVal,
426-
$elty, action, index, algo, out)
427-
return out[]
428-
end
429-
with_workspace(bufferSize) do buffer
430-
cusparseCsr2cscEx2(handle(), n, m, nnz(csc), nonzeros(csc),
431-
csc.colPtr, rowvals(csc), nzVal, rowPtr, colVal,
432-
$elty, action, index, algo, buffer)
433-
end
427+
_csr2cscEx2!(n, m, nnz(csc), nonzeros(csc), csc.colPtr, rowvals(csc),
428+
nzVal, rowPtr, colVal, action, index, algo)
434429
return CuSparseMatrixCSR(rowPtr,colVal,nzVal,size(csc))
435430
else
436431
wide_csc = CuSparseMatrixCSC(csc.colPtr, csc.rowVal, convert(CuVector{$welty}, nonzeros(csc)), size(csc))

lib/cusparse/test/array.jl

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,111 @@
127127
@test Array(d_x[i, :]) == x[i, :]
128128
end
129129
end
130+
# regression test for #3100: scalar getindex at every (i, j)
131+
let dense = sparse(reshape(1:16, 4, 4)),
132+
d_dense = CuSparseMatrixCOO(dense)
133+
CUDACore.@allowscalar begin
134+
for j in axes(dense, 2), i in axes(dense, 1)
135+
@test d_dense[i, j] == dense[i, j]
136+
end
137+
end
138+
end
139+
# sparse case with empty rows and missing entries
140+
let s = sparse([1, 1, 3, 4], [1, 3, 2, 4], [10, 20, 30, 40], 4, 4),
141+
d_s = CuSparseMatrixCOO(s)
142+
CUDACore.@allowscalar begin
143+
for j in axes(s, 2), i in axes(s, 1)
144+
@test d_s[i, j] == s[i, j]
145+
end
146+
end
147+
end
148+
# COO sorted by row but not by column within each row — cuSPARSE's
149+
# documented invariant is row-sorted only, so getindex must handle this.
150+
let dense = sparse(reshape(1:16, 4, 4)),
151+
d_scrambled = CuSparseMatrixCOO(
152+
CuArray(Int32[1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4]),
153+
CuArray(Int32[4,2,1,3, 2,4,3,1, 3,1,4,2, 1,4,2,3]),
154+
CuArray([13,5,1,9, 6,14,10,2, 11,3,15,7, 4,16,8,12]),
155+
(4, 4))
156+
CUDACore.@allowscalar begin
157+
for j in axes(dense, 2), i in axes(dense, 1)
158+
@test d_scrambled[i, j] == dense[i, j]
159+
end
160+
end
161+
end
162+
# CSR with unsorted column indices within each row — can arise from
163+
# SpGEMM and other operations, so getindex must not binary-search.
164+
let dense = sparse(reshape(1:16, 4, 4)),
165+
d_scrambled = CuSparseMatrixCSR(
166+
CuArray(Int32[1, 5, 9, 13, 17]),
167+
CuArray(Int32[4,2,1,3, 2,4,3,1, 3,1,4,2, 1,4,2,3]),
168+
CuArray([13,5,1,9, 6,14,10,2, 11,3,15,7, 4,16,8,12]),
169+
(4, 4))
170+
CUDACore.@allowscalar begin
171+
for j in axes(dense, 2), i in axes(dense, 1)
172+
@test d_scrambled[i, j] == dense[i, j]
173+
end
174+
end
175+
end
176+
# CSC with unsorted row indices within each column
177+
let dense = sparse(reshape(1:16, 4, 4)),
178+
d_scrambled = CuSparseMatrixCSC(
179+
CuArray(Int32[1, 5, 9, 13, 17]),
180+
CuArray(Int32[4,2,1,3, 2,4,3,1, 3,1,4,2, 1,4,2,3]),
181+
CuArray([4,2,1,3, 6,8,7,5, 11,9,12,10, 13,16,14,15]),
182+
(4, 4))
183+
CUDACore.@allowscalar begin
184+
for j in axes(dense, 2), i in axes(dense, 1)
185+
@test d_scrambled[i, j] == dense[i, j]
186+
end
187+
end
188+
end
189+
# Duplicate (i, j) entries sum, matching SciPy/CuPy and Julia's sparse().
190+
# Each format stores three entries at (1, 1) whose values sum to 3.
191+
CUDACore.@allowscalar begin
192+
let d_csr = CuSparseMatrixCSR(
193+
CuArray(Int32[1, 4, 4]),
194+
CuArray(Int32[1, 1, 1]),
195+
CuArray([10, -3, -4]),
196+
(2, 2))
197+
@test d_csr[1, 1] == 3
198+
@test d_csr[2, 2] == 0
199+
@test d_csr[1, 2] == 0
200+
end
201+
let d_csc = CuSparseMatrixCSC(
202+
CuArray(Int32[1, 4, 4]),
203+
CuArray(Int32[1, 1, 1]),
204+
CuArray([10, -3, -4]),
205+
(2, 2))
206+
@test d_csc[1, 1] == 3
207+
@test d_csc[2, 2] == 0
208+
@test d_csc[2, 1] == 0
209+
end
210+
let d_coo = CuSparseMatrixCOO(
211+
CuArray(Int32[1, 1, 1]),
212+
CuArray(Int32[1, 1, 1]),
213+
CuArray([10, -3, -4]),
214+
(2, 2))
215+
@test d_coo[1, 1] == 3
216+
@test d_coo[2, 2] == 0
217+
@test d_coo[1, 2] == 0
218+
end
219+
let d_vec = CuSparseVector{Int}(CuArray(Int32[3, 1, 3]), CuArray([10, 7, -4]), 4)
220+
@test d_vec[1] == 7
221+
@test d_vec[3] == 6
222+
@test d_vec[2] == 0
223+
@test d_vec[4] == 0
224+
end
225+
# Bool duplicates combine via OR, not integer sum.
226+
let d_bool = CuSparseMatrixCSR(
227+
CuArray(Int32[1, 3, 3]),
228+
CuArray(Int32[1, 1]),
229+
CuArray([true, true]),
230+
(2, 2))
231+
@test d_bool[1, 1] === true
232+
@test d_bool[2, 2] === false
233+
end
234+
end
130235
y = sprand(k,n,0.2)
131236
d_y = CuSparseMatrixCOO(y)
132237
@test_throws ArgumentError copyto!(d_y,d_x)

0 commit comments

Comments
 (0)