Merge branch 'master' into csr-dispatch

Abdelrahman912 · web-flow · commit 08b2fee1c8e0 · 2025-04-14T22:59:25.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "CUDA"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "5.7.1"
+version = "5.7.2"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -64,7 +64,7 @@ EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.2.1"
 GPUCompiler = "0.24, 0.25, 0.26, 0.27, 1"
-GPUToolbox = "0.1, 0.2"
+GPUToolbox = "0.2"
 KernelAbstractions = "0.9.2"
 LLVM = "9.1"
 LLVMLoopInfo = "1"
diff --git a/lib/cusparse/broadcast.jl b/lib/cusparse/broadcast.jl
@@ -300,153 +300,87 @@ _getindex(arg, I, ptr) = Broadcast._broadcast_getindex(arg, I)
 
 ## sparse broadcast implementation
 
-# TODO: unify CSC/CSR kernels
+iter_type(::Type{<:CuSparseMatrixCSC}, ::Type{Ti}) where {Ti} = CSCIterator{Ti}
+iter_type(::Type{<:CuSparseMatrixCSR}, ::Type{Ti}) where {Ti} = CSRIterator{Ti}
+iter_type(::Type{<:CuSparseDeviceMatrixCSC}, ::Type{Ti}) where {Ti} = CSCIterator{Ti}
+iter_type(::Type{<:CuSparseDeviceMatrixCSR}, ::Type{Ti}) where {Ti} = CSRIterator{Ti}
+
 # kernel to count the number of non-zeros in a row, to determine the row offsets
-function compute_offsets_kernel(::Type{<:CuSparseMatrixCSR}, offsets::AbstractVector{Ti},
+function compute_offsets_kernel(T::Type{<:Union{CuSparseMatrixCSR, CuSparseMatrixCSC}}, offsets::AbstractVector{Ti},
                                 args...) where Ti
     # every thread processes an entire row
-    row = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    row > length(offsets)-1 && return
-    iter = @inbounds CSRIterator{Ti}(row, args...)
-
-    # count the nonzero columns of all inputs
-    accum = zero(Ti)
-    for (col, vals) in iter
-        accum += one(Ti)
-    end
-
-    # the way we write the nnz counts is a bit strange, but done so that the result
-    # after accumulation can be directly used as the rowPtr array of a CSR matrix.
-    @inbounds begin
-        if row == 1
-            offsets[1] = 1
-        end
-        offsets[row+1] = accum
-    end
+    leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    leading_dim > length(offsets)-1 && return
+    iter = @inbounds iter_type(T, Ti)(leading_dim, args...)
 
-    return
-end
-function compute_offsets_kernel(::Type{<:CuSparseMatrixCSC}, offsets::AbstractVector{Ti},
-                                args...) where Ti
-    # every thread processes an entire columm
-    col = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    col > length(offsets)-1 && return
-    iter = @inbounds CSCIterator{Ti}(col, args...)
-
-    # count the nonzero columns of all inputs
+    # count the nonzero entries (columns for CSR, rows for CSC) of all inputs
     accum = zero(Ti)
-    for (col, vals) in iter
+    for (leading_dim, vals) in iter
         accum += one(Ti)
     end
 
     # the way we write the nnz counts is a bit strange, but done so that the result
-    # after accumulation can be directly used as the colPtr array of a CSC matrix.
+    # after accumulation can be directly used as the rowPtr/colPtr array of a CSR/CSC matrix.
     @inbounds begin
-        if col == 1
+        if leading_dim == 1
             offsets[1] = 1
         end
-        offsets[col+1] = accum
+        offsets[leading_dim+1] = accum
     end
 
     return
 end
 
 # broadcast kernels that iterate the elements of sparse arrays
-function sparse_to_sparse_broadcast_kernel(f, output::CuSparseDeviceMatrixCSR{<:Any,Ti},
-                                           offsets::Union{AbstractVector,Nothing},
-                                           args...) where {Ti}
+function sparse_to_sparse_broadcast_kernel(f, output::T, offsets::Union{AbstractVector,Nothing}, args...) where {Ti, T<:Union{CuSparseDeviceMatrixCSR{<:Any,Ti},CuSparseDeviceMatrixCSC{<:Any,Ti}}}
     # every thread processes an entire row
-    row = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    row > size(output, 1) && return
-    iter = @inbounds CSRIterator{Ti}(row, args...)
+    leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    leading_dim_size = output isa CuSparseDeviceMatrixCSR ? size(output, 1) : size(output, 2)
+    leading_dim > leading_dim_size && return
+    iter = @inbounds iter_type(T, Ti)(leading_dim, args...)
+
 
+    output_ptrs  = output isa CuSparseDeviceMatrixCSR ? output.rowPtr : output.colPtr
+    output_ivals = output isa CuSparseDeviceMatrixCSR ? output.colVal : output.rowVal
     # fetch the row offset, and write it to the output
     @inbounds begin
-        output_ptr = output.rowPtr[row] = offsets[row]
-        if row == size(output, 1)
-            output.rowPtr[row+1i32] = offsets[row+1i32]
+        output_ptr = output_ptrs[leading_dim] = offsets[leading_dim]
+        if leading_dim == leading_dim_size 
+            output_ptrs[leading_dim+1i32] = offsets[leading_dim+1i32]
         end
     end
 
     # set the values for this row
-    for (col, ptrs) in iter
-        I = CartesianIndex(row, col)
+    for (sub_leading_dim, ptrs) in iter
+        index_first  = output isa CuSparseDeviceMatrixCSR ? leading_dim : sub_leading_dim
+        index_second = output isa CuSparseDeviceMatrixCSR ? sub_leading_dim : leading_dim
+        I = CartesianIndex(index_first, index_second)
         vals = ntuple(Val(length(args))) do i
             arg = @inbounds args[i]
             ptr = @inbounds ptrs[i]
             _getindex(arg, I, ptr)
         end
 
-        @inbounds output.colVal[output_ptr] = col
+        @inbounds output_ivals[output_ptr] = sub_leading_dim
         @inbounds output.nzVal[output_ptr] = f(vals...)
         output_ptr += one(Ti)
     end
 
     return
 end
-function sparse_to_sparse_broadcast_kernel(f, output::CuSparseDeviceMatrixCSC{<:Any,Ti},
-                                           offsets::Union{AbstractVector,Nothing},
-                                           args...) where {Ti}
-    # every thread processes an entire column
-    col = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    col > size(output, 2) && return
-    iter = @inbounds CSCIterator{Ti}(col, args...)
-
-    # fetch the column offset, and write it to the output
-    @inbounds begin
-        output_ptr = output.colPtr[col] = offsets[col]
-        if col == size(output, 2)
-            output.colPtr[col+1i32] = offsets[col+1i32]
-        end
-    end
-
-    # set the values for this col
-    for (row, ptrs) in iter
-        I = CartesianIndex(col, row)
-        vals = ntuple(Val(length(args))) do i
-            arg = @inbounds args[i]
-            ptr = @inbounds ptrs[i]
-            _getindex(arg, I, ptr)
-        end
-
-        @inbounds output.rowVal[output_ptr] = row
-        @inbounds output.nzVal[output_ptr] = f(vals...)
-        output_ptr += one(Ti)
-    end
-
-    return
-end
-function sparse_to_dense_broadcast_kernel(::Type{<:CuSparseMatrixCSR}, f,
-                                          output::CuDeviceArray, args...)
+function sparse_to_dense_broadcast_kernel(T::Type{<:Union{CuSparseMatrixCSR{Tv, Ti}, CuSparseMatrixCSC{Tv, Ti}}}, f,
+                                          output::CuDeviceArray, args...) where {Tv, Ti}
     # every thread processes an entire row
-    row = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    row > size(output, 1) && return
-    iter = @inbounds CSRIterator{Int}(row, args...)
+    leading_dim = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    leading_dim_size = T <: CuSparseMatrixCSR ? size(output, 1) : size(output, 2)
+    leading_dim > leading_dim_size && return
+    iter = @inbounds iter_type(T, Ti)(leading_dim, args...)
 
     # set the values for this row
-    for (col, ptrs) in iter
-        I = CartesianIndex(row, col)
-        vals = ntuple(Val(length(args))) do i
-            arg = @inbounds args[i]
-            ptr = @inbounds ptrs[i]
-            _getindex(arg, I, ptr)
-        end
-
-        @inbounds output[I] = f(vals...)
-    end
-
-    return
-end
-function sparse_to_dense_broadcast_kernel(::Type{<:CuSparseMatrixCSC}, f,
-                                          output::CuDeviceArray, args...)
-    # every thread processes an entire column
-    col = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
-    col > size(output, 2) && return
-    iter = @inbounds CSCIterator{Int}(col, args...)
-
-    # set the values for this col
-    for (row, ptrs) in iter
-        I = CartesianIndex(row, col)
+    for (sub_leading_dim, ptrs) in iter
+        index_first  = T <: CuSparseMatrixCSR ? leading_dim : sub_leading_dim
+        index_second = T <: CuSparseMatrixCSR ? sub_leading_dim : leading_dim
+        I = CartesianIndex(index_first, index_second)
         vals = ntuple(Val(length(args))) do i
             arg = @inbounds args[i]
             ptr = @inbounds ptrs[i]
diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
@@ -53,7 +53,7 @@ function densetosparse(A::CuMatrix{T}, fmt::Symbol, index::SparseChar, algo::cus
         colPtr = CuVector{Cint}(undef, n+1)
         desc_sparse = CuSparseMatrixDescriptor(CuSparseMatrixCSC, colPtr, T, Cint, m, n, index)
     else
-        error("Format :$fmt not available, use :csc, :csr or :coo.")
+        throw(ArgumentError("Format :$fmt not available, use :csc, :csr or :coo."))
     end
     desc_dense = CuDenseMatrixDescriptor(A)
 
@@ -82,8 +82,6 @@ function densetosparse(A::CuMatrix{T}, fmt::Symbol, index::SparseChar, algo::cus
             nzVal = CuVector{T}(undef, nnzB[])
             B = CuSparseMatrixCSC{T, Cint}(colPtr, rowVal, nzVal, (m,n))
             cusparseCscSetPointers(desc_sparse, B.colPtr, B.rowVal, B.nzVal)
-        else
-            error("Format :$fmt not available, use :csc, :csr or :coo.")
         end
         cusparseDenseToSparse_convert(handle(), desc_dense, desc_sparse, algo, buffer)
     end
diff --git a/lib/cutensor/src/interfaces.jl b/lib/cutensor/src/interfaces.jl
@@ -9,7 +9,7 @@ function Base.:(+)(A::CuTensor, B::CuTensor)
     elementwise_binary_execute!(α, A.data, A.inds, CUTENSOR_OP_IDENTITY,
                                 γ, B.data, B.inds, CUTENSOR_OP_IDENTITY,
                                 C.data, C.inds, CUTENSOR_OP_ADD)
-    C
+    return C
 end
 
 function Base.:(-)(A::CuTensor, B::CuTensor)
@@ -19,7 +19,7 @@ function Base.:(-)(A::CuTensor, B::CuTensor)
     elementwise_binary_execute!(α, A.data, A.inds, CUTENSOR_OP_IDENTITY,
                                 γ, B.data, B.inds, CUTENSOR_OP_IDENTITY,
                                 C.data, C.inds, CUTENSOR_OP_ADD)
-    C
+    return C
 end
 
 function Base.:(*)(A::CuTensor, B::CuTensor)
diff --git a/src/CUDA.jl b/src/CUDA.jl
@@ -4,7 +4,7 @@ using GPUCompiler
 
 using GPUArrays
 
-using GPUToolbox: SimpleVersion, @sv_str
+using GPUToolbox
 
 using LLVM
 using LLVM.Interop
diff --git a/src/array.jl b/src/array.jl
@@ -472,10 +472,35 @@ function Base.unsafe_convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::DenseCuArr
 end
 
 
-## memory copying
+## synchronization
 
 synchronize(x::CuArray) = synchronize(x.data[])
 
+"""
+    enable_synchronization!(arr::CuArray, enable::Bool=true)
+
+By default `CuArray`s are implicitly synchronized when they are accessed on different CUDA
+devices or streams. This may be unwanted when e.g. using disjoint slices of memory across
+different tasks. This function allows enabling or disabling this behavior.
+
+!!! warning
+
+    Disabling implicit synchronization affects _all_ `CuArray`s that are referring to the
+    same underlying memory. Unsafe use of this API _will_ result in data corruption.
+
+    This API is only provided as an escape hatch, and should not be used without careful
+    consideration. If automatic synchronization is generally problematic for your use case,
+    it is recommended to figure out a better model instead and file an issue or pull request.
+    For more details see [this discussion](https://github.com/JuliaGPU/CUDA.jl/issues/2617).
+"""
+function enable_synchronization!(arr::CuArray, enable::Bool=true)
+    arr.data[].synchronizing = enable
+    return arr
+end
+
+
+## memory copying
+
 if VERSION >= v"1.11.0-DEV.753"
 function typetagdata(a::Array, i=1)
   ptr_or_offset = Int(a.ref.ptr_or_offset)
diff --git a/src/device/utils.jl b/src/device/utils.jl
@@ -1,11 +1,5 @@
 # helpers for writing device functionality
 
-# helper type for writing Int32 literals
-# TODO: upstream this
-struct Literal{T} end
-Base.:(*)(x::Number, ::Type{Literal{T}}) where {T} = T(x)
-const i32 = Literal{Int32}
-
 # local method table for device functions
 @static if isdefined(Base.Experimental, Symbol("@overlay"))
 Base.Experimental.@MethodTable(method_table)
diff --git a/src/memory.jl b/src/memory.jl
@@ -503,16 +503,20 @@ mutable struct Managed{M}
   # which stream is currently using the memory.
   stream::CuStream
 
+  # whether accessing this memory can cause implicit synchronization
+  synchronizing::Bool
+
   # whether there are outstanding operations that haven't been synchronized
   dirty::Bool
 
   # whether the memory has been captured in a way that would make the dirty bit unreliable
   captured::Bool
 
-  function Managed(mem::AbstractMemory; stream=CUDA.stream(), dirty=true, captured=false)
+  function Managed(mem::AbstractMemory; stream = CUDA.stream(), synchronizing = true,
+                   dirty = true, captured = false)
     # NOTE: memory starts as dirty, because stream-ordered allocations are only
     #       guaranteed to be physically allocated at a synchronization event.
-    new{typeof(mem)}(mem, stream, dirty, captured)
+    new{typeof(mem)}(mem, stream, synchronizing, dirty, captured)
   end
 end
 
@@ -524,7 +528,7 @@ function synchronize(managed::Managed)
   managed.dirty = false
 end
 function maybe_synchronize(managed::Managed)
-  if managed.dirty || managed.captured
+  if managed.synchronizing && (managed.dirty || managed.captured)
     synchronize(managed)
   end
 end
diff --git a/test/base/array.jl b/test/base/array.jl
@@ -51,6 +51,13 @@ using ChainRulesCore: add!!, is_inplaceable_destination
   end
 end
 
+@testset "synchronization" begin
+  a = CUDA.zeros(2, 2)
+  synchronize(a)
+  CUDA.enable_synchronization!(a, false)
+  CUDA.enable_synchronization!(a)
+end
+
 @testset "unsafe_wrap" begin
     # managed memory -> CuArray
     for a in [cu([1]; device=true), cu([1]; unified=true)]
diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl
@@ -15,6 +15,9 @@ exclusive = attribute(dev, CUDA.DEVICE_ATTRIBUTE_COMPUTE_MODE) == CUDA.CU_COMPUT
 
 synchronize(ctx)
 
+@test startswith(sprint(show, MIME"text/plain"(), ctx), "CuContext")
+@test CUDA.api_version(ctx) isa Cuint
+
 if !exclusive
     let ctx2 = CuContext(dev)
         @test ctx2 == current_context()    # ctor implicitly pushes
diff --git a/test/libraries/cusparse.jl b/test/libraries/cusparse.jl
@@ -21,6 +21,7 @@ blockdim = 5
     @test size(d_x,2) == 1
     @test ndims(d_x)  == 1
     dense_d_x = CuVector(x)
+    dense_d_x2 = CuVector(d_x)
     CUDA.@allowscalar begin
         @test sprint(show, d_x) == replace(sprint(show, x), "SparseVector{Float64, Int64}"=>"CUDA.CUSPARSE.CuSparseVector{Float64, Int32}", "sparsevec(["=>"sparsevec(Int32[")
         @test sprint(show, MIME"text/plain"(), d_x) == replace(sprint(show, MIME"text/plain"(), x), "SparseVector{Float64, Int64}"=>"CuSparseVector{Float64, Int32}", "sparsevec(["=>"sparsevec(Int32[")
@@ -30,6 +31,7 @@ blockdim = 5
         @test d_x[end]             == x[end]
         @test Array(d_x[firstindex(d_x):end]) == x[firstindex(x):end]
         @test Array(dense_d_x[firstindex(d_x):end]) == x[firstindex(x):end]
+        @test Array(dense_d_x2[firstindex(d_x):end]) == x[firstindex(x):end]
     end
     @test_throws BoundsError d_x[firstindex(d_x) - 1]
     @test_throws BoundsError d_x[end + 1]
@@ -173,8 +175,10 @@ blockdim = 5
     d_z = copy(d_x)
     CUDA.unsafe_free!(d_z)
     @test_throws ArgumentError copyto!(d_y,d_x)
+    d_y_mat = CuMatrix{eltype(d_y)}(d_y)
     CUDA.@allowscalar begin
         @test d_y[1, 1] ≈ y[1, 1]
+        @test d_y_mat[1, 1] ≈ y[1, 1]
     end
     x = sprand(m,0.2)
     d_x = CuSparseVector(x)
diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl