Very rough implementation of bcast for CuSparseVector (#2733)
Conversation
Codecov Report: all modified and coverable lines are covered by tests ✅

```
@@            Coverage Diff             @@
##           master    #2733      +/-   ##
==========================================
+ Coverage   89.37%   89.40%   +0.02%
==========================================
  Files         153      153
  Lines       13150    13185      +35
==========================================
+ Hits        11753    11788      +35
  Misses       1397     1397
```
Cool! It's been a while since I looked at those kernels, so I only gave the changes a superficial look for now.

Do you have any thoughts on my "pointer workspace" idea? I think this could really simplify things, at the cost of additional allocations.

Can you elaborate on what that would simplify? How large would the allocations be? They're asynchronous, so as long as it doesn't scale in terms of the dense size I think that would be fine.

Right now we're iterating through […]
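(The comment above is cut off, but the idea can be reconstructed from the diff that eventually landed below: a workspace pairing each candidate row with one pointer per argument.) A rough host-side sketch of such a workspace, with hypothetical names and plain `SparseVector`s standing in for the device arrays:

```julia
using SparseArrays

# Hypothetical illustration of the "pointer workspace": one entry per candidate
# row, pairing the row index with a pointer into each argument's value array
# (0 when that argument stores nothing at the row). Assumes every argument has
# at least one stored entry; on the GPU this would be a CuVector filled by a
# small kernel before the main broadcast.
function build_pointer_workspace(args::SparseVector...)
    N  = length(args)
    lo = minimum(arg -> first(SparseArrays.nonzeroinds(arg)), args)
    hi = maximum(arg -> last(SparseArrays.nonzeroinds(arg)), args)
    ws = Vector{Pair{Int, NTuple{N, Int}}}(undef, hi - lo + 1)
    for (k, row) in enumerate(lo:hi)
        ptrs = ntuple(N) do j
            inds = SparseArrays.nonzeroinds(args[j])
            p = searchsortedfirst(inds, row)
            (p <= length(inds) && inds[p] == row) ? p : 0
        end
        ws[k] = row => ptrs
    end
    return ws
end
```

The allocation is one `Pair{Ti, NTuple{N, Ti}}` per row between the first and last stored row; the broadcast kernels then reduce to a single workspace lookup per output element.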
I can work around one of the issues by using generated functions:

```julia
@generated function sparse_to_dense_broadcast_kernel(::Type{<:CuSparseVector}, f,
                                                     output::CuDeviceArray{Tv},
                                                     args...) where {Tv}
    N = length(args)
    quote
        # every thread processes an entire row
        row = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
        row > length(output) && return
        # set the values for this row
        vals = @ntuple $N i -> begin
            arg = @inbounds args[i]
            _getindex(arg, row, 0)::Tv
        end
        out_val = f(vals...)
        @inbounds output[row] = out_val
        return
    end
end
```

... but other issues persist. It's curious that this only happens with CuSparseVector, using code patterns […]
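For illustration (not from the PR): the reason `@ntuple` helps here is that `Base.Cartesian` unrolls the tuple construction at macro-expansion time, so no runtime closure over `args` is created:

```julia
using Base.Cartesian

# @ntuple expands to a literal tuple, with the iteration variable substituted:
@macroexpand @ntuple 2 i -> args[i]
# :((args[1], args[2]))
```

In the generated function above, `$N` splices the argument count into the quoted body, so the kernel ends up containing only literal indexing.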
But then why does it work fine if you only have sparse vectors in the broadcast? The failures only start when you add in a dense vector...
Yeah, not sure. Heterogeneity? The typed IR looks perfectly fine, but in the generated LLVM you can see that the closure passed to […]
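As an aside, this kind of LLVM-level inspection can be reproduced with CUDA.jl's reflection macros; a minimal sketch (the concrete broadcast is an assumed reproduction of the failing mixed sparse/dense case):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

dx = CuSparseVector(sprand(Float32, 64, 0.5))  # sparse argument
dd = CUDA.ones(Float32, 64)                    # dense argument in the mix

# Prints the LLVM IR of every kernel compiled while evaluating the expression;
# a boxed closure argument would be visible in the kernel's signature here.
CUDA.@device_code_llvm dx .+ dd
```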
I wonder, if we made the […]

I think the […]
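Both comments above are truncated, but the direction is visible in the formatting diff below: the kernels end up taking the broadcast function as `f::F` with `F` in the `where` clause. That is the standard pattern for forcing Julia to specialize on a function argument, since the compiler's heuristics can skip specializing on `Function`-typed arguments that are merely passed through; a minimal sketch of the pattern, independent of this PR:

```julia
# Julia can avoid specializing on a Function-typed argument that is only
# forwarded to another call; naming its type as a parameter guarantees one
# compiled specialization per concrete function, which GPU codegen also
# benefits from (no generic callee left in the kernel).
forward_generic(f, xs) = map(f, xs)                   # `f` may stay abstract
forward_specialized(f::F, xs) where {F} = map(f, xs)  # one instance per F

forward_specialized(sin, (1.0, 2.0))  # compiled with F === typeof(sin)
```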
OK, made […]
Your PR requires formatting changes to meet the project's style guidelines. Suggested changes:

```diff
diff --git a/lib/cusparse/broadcast.jl b/lib/cusparse/broadcast.jl
index 965c0ce47..52a0cac7c 100644
--- a/lib/cusparse/broadcast.jl
+++ b/lib/cusparse/broadcast.jl
@@ -288,9 +288,13 @@ end
end
# helpers to index a sparse or dense array
-@inline function _getindex(arg::Union{CuSparseDeviceMatrixCSR{Tv},
- CuSparseDeviceMatrixCSC{Tv},
- CuSparseDeviceVector{Tv}}, I, ptr)::Tv where {Tv}
+@inline function _getindex(
+ arg::Union{
+ CuSparseDeviceMatrixCSR{Tv},
+ CuSparseDeviceMatrixCSC{Tv},
+ CuSparseDeviceVector{Tv},
+ }, I, ptr
+ )::Tv where {Tv}
if ptr == 0
return zero(Tv)
else
@@ -326,9 +330,11 @@ function _get_my_row(first_row)::Int32
return row_ix + first_row - 1i32
end
-function compute_offsets_kernel(::Type{<:CuSparseVector}, first_row::Ti, last_row::Ti,
- fpreszeros::Bool, offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
- args...) where {Ti, N}
+function compute_offsets_kernel(
+ ::Type{<:CuSparseVector}, first_row::Ti, last_row::Ti,
+ fpreszeros::Bool, offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
+ args...
+ ) where {Ti, N}
row = _get_my_row(first_row)
row > last_row && return
@@ -348,8 +354,9 @@ function compute_offsets_kernel(::Type{<:CuSparseVector}, first_row::Ti, last_ro
end
# kernel to count the number of non-zeros in a row, to determine the row offsets
-function compute_offsets_kernel(T::Type{<:Union{CuSparseMatrixCSR, CuSparseMatrixCSC}},
- offsets::AbstractVector{Ti},
+function compute_offsets_kernel(
+ T::Type{<:Union{CuSparseMatrixCSR, CuSparseMatrixCSC}},
+ offsets::AbstractVector{Ti},
args...) where Ti
# every thread processes an entire row
leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
@@ -374,14 +381,16 @@ function compute_offsets_kernel(T::Type{<:Union{CuSparseMatrixCSR, CuSparseMatri
return
end
-function sparse_to_sparse_broadcast_kernel(f::F, output::CuSparseDeviceVector{Tv,Ti},
- offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
- args...) where {Tv, Ti, N, F}
+function sparse_to_sparse_broadcast_kernel(
+ f::F, output::CuSparseDeviceVector{Tv, Ti},
+ offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
+ args...
+ ) where {Tv, Ti, N, F}
row_ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
row_ix > output.nnz && return
row_and_ptrs = @inbounds offsets[row_ix]
- row = @inbounds row_and_ptrs[1]
- arg_ptrs = @inbounds row_and_ptrs[2]
+ row = @inbounds row_and_ptrs[1]
+ arg_ptrs = @inbounds row_and_ptrs[2]
vals = ntuple(Val(N)) do i
arg = @inbounds args[i]
# ptr is 0 if the sparse vector doesn't have an element at this row
@@ -390,14 +399,20 @@ function sparse_to_sparse_broadcast_kernel(f::F, output::CuSparseDeviceVector{Tv
_getindex(arg, row, ptr)::Tv
end
output_val = f(vals...)
- @inbounds output.iPtr[row_ix] = row
+ @inbounds output.iPtr[row_ix] = row
@inbounds output.nzVal[row_ix] = output_val
return
end
-function sparse_to_sparse_broadcast_kernel(f, output::T, offsets::Union{AbstractVector,Nothing},
- args...) where {Ti, T<:Union{CuSparseDeviceMatrixCSR{<:Any,Ti},
- CuSparseDeviceMatrixCSC{<:Any,Ti}}}
+function sparse_to_sparse_broadcast_kernel(
+ f, output::T, offsets::Union{AbstractVector, Nothing},
+ args...
+ ) where {
+ Ti, T <: Union{
+ CuSparseDeviceMatrixCSR{<:Any, Ti},
+ CuSparseDeviceMatrixCSC{<:Any, Ti},
+ },
+ }
# every thread processes an entire row
leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
leading_dim_size = output isa CuSparseDeviceMatrixCSR ? size(output, 1) : size(output, 2)
@@ -433,8 +448,13 @@ function sparse_to_sparse_broadcast_kernel(f, output::T, offsets::Union{Abstract
return
end
-function sparse_to_dense_broadcast_kernel(T::Type{<:Union{CuSparseMatrixCSR{Tv, Ti},
- CuSparseMatrixCSC{Tv, Ti}}}, f,
+function sparse_to_dense_broadcast_kernel(
+ T::Type{
+ <:Union{
+ CuSparseMatrixCSR{Tv, Ti},
+ CuSparseMatrixCSC{Tv, Ti},
+ },
+ }, f,
output::CuDeviceArray, args...) where {Tv, Ti}
# every thread processes an entire row
leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
@@ -459,16 +479,18 @@ function sparse_to_dense_broadcast_kernel(T::Type{<:Union{CuSparseMatrixCSR{Tv,
return
end
-function sparse_to_dense_broadcast_kernel(::Type{<:CuSparseVector}, f::F,
- output::CuDeviceArray{Tv},
- offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
- args...) where {Tv, F, N, Ti}
+function sparse_to_dense_broadcast_kernel(
+ ::Type{<:CuSparseVector}, f::F,
+ output::CuDeviceArray{Tv},
+ offsets::AbstractVector{Pair{Ti, NTuple{N, Ti}}},
+ args...
+ ) where {Tv, F, N, Ti}
# every thread processes an entire row
row_ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
row_ix > length(output) && return
row_and_ptrs = @inbounds offsets[row_ix]
- row = @inbounds row_and_ptrs[1]
- arg_ptrs = @inbounds row_and_ptrs[2]
+ row = @inbounds row_and_ptrs[1]
+ arg_ptrs = @inbounds row_and_ptrs[2]
vals = ntuple(Val(length(args))) do i
arg = @inbounds args[i]
# ptr is 0 if the sparse vector doesn't have an element at this row
@@ -493,14 +515,14 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
error("broadcast with multiple types of sparse arrays ($(join(sparse_types, ", "))) is not supported")
end
sparse_typ = typeof(bc.args[first(sparse_args)])
- sparse_typ <: Union{CuSparseMatrixCSR,CuSparseMatrixCSC,CuSparseVector} ||
+ sparse_typ <: Union{CuSparseMatrixCSR, CuSparseMatrixCSC, CuSparseVector} ||
error("broadcast with sparse arrays is currently only implemented for vectors and CSR and CSC matrices")
Ti = if sparse_typ <: CuSparseMatrixCSR
reduce(promote_type, map(i->eltype(bc.args[i].rowPtr), sparse_args))
elseif sparse_typ <: CuSparseMatrixCSC
reduce(promote_type, map(i->eltype(bc.args[i].colPtr), sparse_args))
elseif sparse_typ <: CuSparseVector
- reduce(promote_type, map(i->eltype(bc.args[i].iPtr), sparse_args))
+ reduce(promote_type, map(i -> eltype(bc.args[i].iPtr), sparse_args))
end
# determine the output type
@@ -528,15 +550,15 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
config = launch_configuration(kernel.fun)
if sparse_typ <: CuSparseMatrixCSR
threads = min(rows, config.threads)
- blocks = max(cld(rows, threads), config.blocks)
+ blocks = max(cld(rows, threads), config.blocks)
threads = cld(rows, blocks)
elseif sparse_typ <: CuSparseMatrixCSC
threads = min(cols, config.threads)
- blocks = max(cld(cols, threads), config.blocks)
+ blocks = max(cld(cols, threads), config.blocks)
threads = cld(cols, blocks)
elseif sparse_typ <: CuSparseVector
threads = 512
- blocks = max(cld(rows, threads), config.blocks)
+ blocks = max(cld(rows, threads), config.blocks)
end
(; threads, blocks)
end
@@ -571,14 +593,14 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
sparse_arg = bc.args[first(sparse_args)]
if sparse_typ <: CuSparseMatrixCSR
offsets = rowPtr = sparse_arg.rowPtr
- colVal = similar(sparse_arg.colVal)
- nzVal = similar(sparse_arg.nzVal, Tv)
- output = CuSparseMatrixCSR(rowPtr, colVal, nzVal, size(bc))
+ colVal = similar(sparse_arg.colVal)
+ nzVal = similar(sparse_arg.nzVal, Tv)
+ output = CuSparseMatrixCSR(rowPtr, colVal, nzVal, size(bc))
elseif sparse_typ <: CuSparseMatrixCSC
offsets = colPtr = sparse_arg.colPtr
- rowVal = similar(sparse_arg.rowVal)
- nzVal = similar(sparse_arg.nzVal, Tv)
- output = CuSparseMatrixCSC(colPtr, rowVal, nzVal, size(bc))
+ rowVal = similar(sparse_arg.rowVal)
+ nzVal = similar(sparse_arg.nzVal, Tv)
+ output = CuSparseMatrixCSC(colPtr, rowVal, nzVal, size(bc))
end
# NOTE: we don't use CUSPARSE's similar, because that copies the structure arrays,
# while we do that in our kernel (for consistency with other code paths)
@@ -601,7 +623,7 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
end
end
overall_first_row = min(arg_first_rows...)
- overall_last_row = max(arg_last_rows...)
+ overall_last_row = max(arg_last_rows...)
CuVector{Pair{Ti, NTuple{length(bc.args), Ti}}}(undef, overall_last_row - overall_first_row + 1)
end
let
@@ -621,22 +643,22 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
accumulate!(Base.add_sum, offsets, offsets)
total_nnz = @allowscalar last(offsets[end]) - 1
else
- sort!(offsets; by=first)
- total_nnz = mapreduce(x->first(x) != typemax(first(x)), +, offsets)
+ sort!(offsets; by = first)
+ total_nnz = mapreduce(x -> first(x) != typemax(first(x)), +, offsets)
end
output = if sparse_typ <: CuSparseMatrixCSR
colVal = CuArray{Ti}(undef, total_nnz)
- nzVal = CuArray{Tv}(undef, total_nnz)
+ nzVal = CuArray{Tv}(undef, total_nnz)
CuSparseMatrixCSR(offsets, colVal, nzVal, size(bc))
elseif sparse_typ <: CuSparseMatrixCSC
rowVal = CuArray{Ti}(undef, total_nnz)
- nzVal = CuArray{Tv}(undef, total_nnz)
+ nzVal = CuArray{Tv}(undef, total_nnz)
CuSparseMatrixCSC(offsets, rowVal, nzVal, size(bc))
elseif sparse_typ <: CuSparseVector && !fpreszeros
CuArray{Tv}(undef, size(bc))
elseif sparse_typ <: CuSparseVector && fpreszeros
- iPtr = CUDA.zeros(Ti, total_nnz)
- nzVal = CUDA.zeros(Tv, total_nnz)
+ iPtr = CUDA.zeros(Ti, total_nnz)
+ nzVal = CUDA.zeros(Tv, total_nnz)
CuSparseVector(iPtr, nzVal, rows)
end
if sparse_typ <: CuSparseVector && !fpreszeros
@@ -653,12 +675,12 @@ function Broadcast.copy(bc::Broadcasted{<:Union{CuSparseVecStyle,CuSparseMatStyl
end
# perform the actual broadcast
if output isa AbstractCuSparseArray
- args = (bc.f, output, offsets, bc.args...)
- kernel = @cuda launch=false sparse_to_sparse_broadcast_kernel(args...)
+ args = (bc.f, output, offsets, bc.args...)
+ kernel = @cuda launch = false sparse_to_sparse_broadcast_kernel(args...)
else
- args = sparse_typ <: CuSparseVector ? (sparse_typ, bc.f, output, offsets, bc.args...) :
- (sparse_typ, bc.f, output, bc.args...)
- kernel = @cuda launch=false sparse_to_dense_broadcast_kernel(args...)
+ args = sparse_typ <: CuSparseVector ? (sparse_typ, bc.f, output, offsets, bc.args...) :
+ (sparse_typ, bc.f, output, bc.args...)
+ kernel = @cuda launch = false sparse_to_dense_broadcast_kernel(args...)
end
threads, blocks = compute_launch_config(kernel)
kernel(args...; threads, blocks)
diff --git a/test/libraries/cusparse/broadcast.jl b/test/libraries/cusparse/broadcast.jl
index b1a89437b..e6cd0efc0 100644
--- a/test/libraries/cusparse/broadcast.jl
+++ b/test/libraries/cusparse/broadcast.jl
@@ -1,8 +1,8 @@
using CUDA.CUSPARSE, SparseArrays
@testset for elty in [Int32, Int64, Float32, Float64]
- @testset "$typ($elty)" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
- m,n = 5,6
+ @testset "$typ($elty)" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
+ m, n = 5, 6
p = 0.5
x = sprand(elty, m, n, p)
dx = typ(x)
@@ -32,7 +32,7 @@ using CUDA.CUSPARSE, SparseArrays
dz = dx .* dy .* elty(2)
@test dz isa typ{elty}
@test z == SparseMatrixCSC(dz)
-
+
# multiple inputs
w = sprand(elty, m, n, p)
dw = typ(w)
@@ -41,39 +41,39 @@ using CUDA.CUSPARSE, SparseArrays
@test dz isa typ{elty}
@test z == SparseMatrixCSC(dz)
end
- @testset "$typ($elty)" for typ in [CuSparseVector,]
- m = 64
+ @testset "$typ($elty)" for typ in [CuSparseVector]
+ m = 64
p = 0.5
x = sprand(elty, m, p)
dx = typ(x)
-
+
# zero-preserving
- y = x .* elty(1)
+ y = x .* elty(1)
dy = dx .* elty(1)
@test dy isa typ{elty}
- @test collect(dy.iPtr) == collect(dx.iPtr)
+ @test collect(dy.iPtr) == collect(dx.iPtr)
@test collect(dy.iPtr) == y.nzind
@test collect(dy.nzVal) == y.nzval
- @test y == SparseVector(dy)
-
+ @test y == SparseVector(dy)
+
# not zero-preserving
y = x .+ elty(1)
dy = dx .+ elty(1)
@test dy isa CuArray{elty}
hy = Array(dy)
- @test Array(y) == hy
+ @test Array(y) == hy
# involving something dense
y = x .+ ones(elty, m)
dy = dx .+ CUDA.ones(elty, m)
@test dy isa CuArray{elty}
@test y == Array(dy)
-
- # sparse to sparse
+
+ # sparse to sparse
dx = typ(x)
- y = sprand(elty, m, p)
+ y = sprand(elty, m, p)
dy = typ(y)
- z = x .* y
+ z = x .* y
dz = dx .* dy
@test dz isa typ{elty}
@test z == SparseVector(dz)
@@ -84,26 +84,26 @@ using CUDA.CUSPARSE, SparseArrays
dy = typ(y)
dx = typ(x)
dw = typ(w)
- z = @. x * y * w
+ z = @. x * y * w
dz = @. dx * dy * dw
@test dz isa typ{elty}
@test z == SparseVector(dz)
-
+
y = sprand(elty, m, p)
w = sprand(elty, m, p)
- dense_arr = rand(elty, m)
- d_dense_arr = CuArray(dense_arr)
+ dense_arr = rand(elty, m)
+ d_dense_arr = CuArray(dense_arr)
dy = typ(y)
dw = typ(w)
- z = @. x * y * w * dense_arr
- dz = @. dx * dy * dw * d_dense_arr
+ z = @. x * y * w * dense_arr
+ dz = @. dx * dy * dw * d_dense_arr
@test dz isa CuArray{elty}
@test z == Array(dz)
-
+
y = sprand(elty, m, p)
dy = typ(y)
dx = typ(x)
- z = x .* y .* elty(2)
+ z = x .* y .* elty(2)
dz = dx .* dy .* elty(2)
@test dz isa typ{elty}
@test z == SparseVector(dz)
```
CUDA.jl Benchmarks

| Benchmark suite | Current: b7f4e89 | Previous: 719b4c4 | Ratio |
|---|---|---|---|
| latency/precompile | 47455478695.5 ns | 47257959252.5 ns | 1.00 |
| latency/ttfp | 6830776438 ns | 6830427074 ns | 1.00 |
| latency/import | 3212602150 ns | 3217803391.5 ns | 1.00 |
| integration/volumerhs | 9614520.5 ns | 9610454 ns | 1.00 |
| integration/byval/slices=1 | 147200 ns | 146532 ns | 1.00 |
| integration/byval/slices=3 | 425559 ns | 424864 ns | 1.00 |
| integration/byval/reference | 145285 ns | 144771 ns | 1.00 |
| integration/byval/slices=2 | 286159 ns | 285758 ns | 1.00 |
| integration/cudadevrt | 103653 ns | 103343 ns | 1.00 |
| kernel/indexing | 14171 ns | 14041 ns | 1.01 |
| kernel/indexing_checked | 15255 ns | 14713.5 ns | 1.04 |
| kernel/occupancy | 731.08 ns | 759.8307692307692 ns | 0.96 |
| kernel/launch | 2267.8888888888887 ns | 2096.1 ns | 1.08 |
| kernel/rand | 14665 ns | 14556 ns | 1.01 |
| array/reverse/1d | 19745 ns | 19445.5 ns | 1.02 |
| array/reverse/2d | 25248 ns | 24687 ns | 1.02 |
| array/reverse/1d_inplace | 10529 ns | 10863 ns | 0.97 |
| array/reverse/2d_inplace | 12173 ns | 11135 ns | 1.09 |
| array/copy | 21392 ns | 20396 ns | 1.05 |
| array/iteration/findall/int | 159195 ns | 157077 ns | 1.01 |
| array/iteration/findall/bool | 139684 ns | 138251.5 ns | 1.01 |
| array/iteration/findfirst/int | 153940 ns | 153592 ns | 1.00 |
| array/iteration/findfirst/bool | 154966.5 ns | 153988.5 ns | 1.01 |
| array/iteration/scalar | 72584 ns | 71780 ns | 1.01 |
| array/iteration/logical | 216523 ns | 211941 ns | 1.02 |
| array/iteration/findmin/1d | 41548 ns | 41025 ns | 1.01 |
| array/iteration/findmin/2d | 94374 ns | 93445 ns | 1.01 |
| array/reductions/reduce/1d | 40582.5 ns | 35564 ns | 1.14 |
| array/reductions/reduce/2d | 45396.5 ns | 41919.5 ns | 1.08 |
| array/reductions/mapreduce/1d | 37472.5 ns | 33607.5 ns | 1.12 |
| array/reductions/mapreduce/2d | 51430.5 ns | 41487 ns | 1.24 |
| array/broadcast | 21095 ns | 20714 ns | 1.02 |
| array/copyto!/gpu_to_gpu | 11857 ns | 13388 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 209438 ns | 207823 ns | 1.01 |
| array/copyto!/gpu_to_cpu | 245257 ns | 245031 ns | 1.00 |
| array/accumulate/1d | 109144 ns | 109385 ns | 1.00 |
| array/accumulate/2d | 80777 ns | 79991 ns | 1.01 |
| array/construct | 1273.9 ns | 1285.1 ns | 0.99 |
| array/random/randn/Float32 | 46067 ns | 42579.5 ns | 1.08 |
| array/random/randn!/Float32 | 26774 ns | 26171 ns | 1.02 |
| array/random/rand!/Int64 | 27207 ns | 26886 ns | 1.01 |
| array/random/rand!/Float32 | 8820.333333333334 ns | 8518 ns | 1.04 |
| array/random/rand/Int64 | 38222 ns | 29621 ns | 1.29 |
| array/random/rand/Float32 | 13223 ns | 12750 ns | 1.04 |
| array/permutedims/4d | 61410 ns | 60591.5 ns | 1.01 |
| array/permutedims/2d | 55594 ns | 54882 ns | 1.01 |
| array/permutedims/3d | 56191 ns | 55464.5 ns | 1.01 |
| array/sorting/1d | 2776085 ns | 2774720 ns | 1.00 |
| array/sorting/by | 3367802.5 ns | 3365888 ns | 1.00 |
| array/sorting/2d | 1085387 ns | 1083636 ns | 1.00 |
| cuda/synchronization/stream/auto | 1021.1 ns | 1080.5 ns | 0.95 |
| cuda/synchronization/stream/nonblocking | 6537 ns | 6352.4 ns | 1.03 |
| cuda/synchronization/stream/blocking | 801.6421052631579 ns | 801.4897959183673 ns | 1.00 |
| cuda/synchronization/context/auto | 1187.7 ns | 1201.5 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 6733 ns | 6641 ns | 1.01 |
| cuda/synchronization/context/blocking | 915.6666666666666 ns | 900.1458333333334 ns | 1.02 |

This comment was automatically generated by workflow using github-action-benchmark.
Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
VERY naive implementation here. Added some TODOs for obvious performance improvements I could make. Some issues I can't quite figure out yet (these are commented out in the new tests).
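For a quick picture of the behavior the new tests pin down (a sketch distilled from the test diff above; result types follow the PR's dispatch rules):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

m  = 64
dx = CuSparseVector(sprand(Float32, m, 0.5))
dy = CuSparseVector(sprand(Float32, m, 0.5))

dx .* 2f0   # zero-preserving, all-sparse: stays a CuSparseVector
dx .* dy    # multiple sparse inputs, zero-preserving: sparse result
dx .+ 1f0   # not zero-preserving: materializes as a dense CuArray
dx .* dy .* CUDA.ones(Float32, m)  # mixing in a dense array: dense CuArray result
```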