## LinearAlgebra

using LinearAlgebra

# Five-argument mul!: C .= α*A*B + β*C as a cuTENSOR block-sparse contraction.
# The contraction pattern comes from the mode labels stored in each tensor's
# `inds` field; all element-wise operators are the identity.
function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
    contract!(α,
              A, A.inds, CUTENSOR_OP_IDENTITY,
              B, B.inds, CUTENSOR_OP_IDENTITY,
              β,
              C, C.inds, CUTENSOR_OP_IDENTITY,
              CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
    return C
end

# Accessor for the dense blocks of a block-sparse tensor. Overload this for a
# custom datatype to make it usable with `contract!` below.
nonzero_blocks(A::CuTensorBS) = A.nonzero_data

# Block-sparse analogue of `contract!`: builds (or reuses) a contraction plan
# and hands the non-zero blocks of A, B and C to `contractBS!`.
function contract!(
        @nospecialize(alpha::Number),
        @nospecialize(A), Ainds::ModeType, opA::cutensorOperator_t,
        @nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
        @nospecialize(beta::Number),
        @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
        opOut::cutensorOperator_t;
        jit::cutensorJitMode_t=JIT_MODE_NONE,
        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
        algo::cutensorAlgo_t=ALGO_DEFAULT,
        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
        plan::Union{CuTensorPlan, Nothing}=nothing)
    # A caller-supplied plan is reused as-is; otherwise a fresh plan is built
    # here and freed again once the contraction has been issued.
    actual_plan = plan === nothing ?
        plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
                         jit, workspace, algo, compute_type) :
        plan

    contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))

    if plan === nothing
        CUDACore.unsafe_free!(actual_plan)
    end

    return C
end

## `contractBS!` (defined next to this) expects A, B, and C to be host arrays
## of CuArray blocks; it extracts the device pointers itself. Overload
## `nonzero_blocks` for your datatype to reach it through `contract!`.
function contractBS!(plan::CuTensorPlan,
                     @nospecialize(alpha::Number),
                     @nospecialize(A::AbstractArray),
                     @nospecialize(B::AbstractArray),
                     @nospecialize(beta::Number),
                     @nospecialize(C::AbstractArray))
    scalar_type = plan.scalar_type

    # cuTENSOR expects host-accessible arrays of device pointers, one entry per
    # dense block; extract them from the CuArray blocks here.
    A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
    B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
    C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]

    # D must currently be identical to C, so C's pointer table is passed twice.
    cutensorBlockSparseContract(handle(), plan,
                                Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
                                Ref{scalar_type}(beta), C_ptrs, C_ptrs,
                                plan.workspace, sizeof(plan.workspace), stream())
    # Block until the kernel finishes so the host pointer tables above stay
    # rooted for the whole duration of the asynchronous launch.
    synchronize(stream())
    return C
end

# Build a cuTENSOR operation descriptor + plan for a block-sparse contraction.
# The caller owns the returned plan and must free it (see `contract!`).
function plan_contraction(
        @nospecialize(A), Ainds::ModeType, opA::cutensorOperator_t,
        @nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
        @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
        opOut::cutensorOperator_t;
        jit::cutensorJitMode_t=JIT_MODE_NONE,
        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
        algo::cutensorAlgo_t=ALGO_DEFAULT,
        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
    # All element-wise operators of a contraction must be unary.
    is_unary(opA) || throw(ArgumentError("opA must be a unary op!"))
    is_unary(opB) || throw(ArgumentError("opB must be a unary op!"))
    is_unary(opC) || throw(ArgumentError("opC must be a unary op!"))
    is_unary(opOut) || throw(ArgumentError("opOut must be a unary op!"))

    descA = CuTensorBSDescriptor(A)
    descB = CuTensorBSDescriptor(B)
    descC = CuTensorBSDescriptor(C)
    # for now, D must be identical to C (and thus, descD must be identical to descC)

    modeA = collect(Cint, Ainds)
    modeB = collect(Cint, Binds)
    modeC = collect(Cint, Cinds)

    # Pick the compute descriptor from the element-type table unless overridden.
    actual_compute_type = if compute_type === nothing
        contraction_compute_types[(eltype(A), eltype(B), eltype(C))]
    else
        compute_type
    end

    desc = Ref{cutensorOperationDescriptor_t}()
    cutensorCreateBlockSparseContraction(handle(),
                                         desc,
                                         descA, modeA, opA,
                                         descB, modeB, opB,
                                         descC, modeC, opC,
                                         descC, modeC, actual_compute_type)

    plan_pref = Ref{cutensorPlanPreference_t}()
    cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)

    plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
    # NOTE(review): the operation descriptor is deliberately not destroyed here
    # (`cutensorDestroyOperationDescriptor(desc[])`) — confirm with the cuTENSOR
    # team whether the plan keeps a reference to it.
    cutensorDestroyPlanPreference(plan_pref[])
    return plan
end

## tensor

export CuTensorBS

## Block-sparse tensor: a list of dense CuArray blocks together with the
## block-sparsity pattern. `T` is the element type and `N` the number of modes.
## TODO add checks to see if size of each data block matches its expected
## block extents.
mutable struct CuTensorBS{T, N}
    nonzero_data::Vector{<:CuArray}     # one dense CuArray per non-zero block
    inds::Vector{Int}                   # mode labels used for contraction
    blocks_per_mode::Vector{Int32}      # number of sections along each mode
    ## This expects a Vector of Int tuples right now (one tuple per mode).
    block_extents::Vector{<:Tuple}
    ## 1-based block coordinates of the non-zero blocks, one tuple per block.
    nonzero_block_coords::Vector{NTuple{N,Int32}}

    function CuTensorBS{T, N}(nonzero_data,
                              blocks_per_mode,
                              block_extents,
                              nonzero_block_coords,
                              inds) where {T<:Number, N}
        CuArrayT = eltype(nonzero_data)
        # Validate constructor input with real errors, not compilable-out asserts.
        eltype(CuArrayT) == T ||
            throw(ArgumentError("block eltype $(eltype(CuArrayT)) does not match tensor eltype $T"))
        # NOTE(review): blocks are not required to have N dimensions here
        # (original had `@assert ndims(CuArrayT) == N` commented out) — confirm.
        length(block_extents) == N ||
            throw(ArgumentError("expected one block-extent tuple per mode ($N), got $(length(block_extents))"))
        return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
    end
end

# Convenience constructor inferring T from the block eltype and N from the
# number of per-mode extent tuples.
function CuTensorBS(nonzero_data::Vector{<:CuArray{T}},
                    blocks_per_mode, block_extents, nonzero_block_coords, inds) where {T<:Number}
    return CuTensorBS{T,length(block_extents)}(nonzero_data,
                                               blocks_per_mode, block_extents,
                                               nonzero_block_coords, inds)
end

# array interface

# Dense (logical) size: the sum of the section extents along each mode.
function Base.size(T::CuTensorBS)
    return tuple(sum.(T.block_extents)...)
end
Base.length(T::CuTensorBS) = prod(size(T))
# Number of actually stored elements (non-zero blocks only).
nonzero_length(T::CuTensorBS) = sum(length.(T.nonzero_data))
# Int32 because this value is fed directly into the C descriptor's numModes.
Base.ndims(T::CuTensorBS) = Int32(length(T.inds))

## NOTE(review): this is the stride of the host-side Vector of blocks, not of
## the device data; the descriptor path only asserts it is all ones — confirm
## the intended semantics with the cuTENSOR team.
Base.strides(T::CuTensorBS) = strides(T.nonzero_data)
Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))

# Flatten the per-mode extent tuples into the flat Int64 list cuTENSOR expects.
function block_extents(T::CuTensorBS)
    return collect(Int64, Iterators.flatten(T.block_extents))
end

nblocks_per_mode(T::CuTensorBS) = T.blocks_per_mode

num_nonzero_blocks(T::CuTensorBS) = length(T.nonzero_block_coords)

## Flatten the non-zero block-coordinate tuples into a single flat Int64 list.
function list_nonzero_block_coords(T::CuTensorBS)
    return collect(Int64, Iterators.flatten(T.nonzero_block_coords))
end

# ## descriptor
mutable struct CuTensorBSDescriptor
    handle::cutensorBlockSparseTensorDescriptor_t

    # The inner constructor creates the C-side descriptor and attaches a
    # finalizer that destroys it again.
    function CuTensorBSDescriptor(
            numModes::Int32,
            numNonZeroBlocks::Int64,
            numSectionsPerMode::Vector{Int32},
            extent::Vector{Int64},
            nonZeroCoordinates::Vector{Int32},
            stride, ## Union{Vector{Int64}, C_NULL}
            eltype::Type)
        desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
        cutensorCreateBlockSparseTensorDescriptor(handle(), desc,
                                                  numModes, numNonZeroBlocks,
                                                  numSectionsPerMode, extent,
                                                  nonZeroCoordinates, stride, eltype)

        obj = new(desc[])
        finalizer(unsafe_destroy!, obj)
        return obj
    end
end
## Convenience constructor for the canonical layout: strides are passed as
## C_NULL, i.e. the canonical (dense, column-major) stride.
function CuTensorBSDescriptor(
        numModes::Int32,
        numNonZeroBlocks::Int64,
        numSectionsPerMode::Vector{Int32},
        extent::Vector{Int64},
        nonZeroCoordinates::Vector{Int32},
        # strides = C_NULL,
        eltype::Type)
    return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode,
                                extent, nonZeroCoordinates, C_NULL, eltype)
end

Base.show(io::IO, desc::CuTensorBSDescriptor) =
    @printf(io, "CuTensorBSDescriptor(%p)", desc.handle)

Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) =
    obj.handle

unsafe_destroy!(obj::CuTensorBSDescriptor) =
    cutensorDestroyBlockSparseTensorDescriptor(obj)

## Descriptor constructor for the CuTensorBS type. Overload for custom objects.
function CuTensorBSDescriptor(A::CuTensorBS)
    numModes = ndims(A)
    numNonZeroBlocks = length(A.nonzero_block_coords)
    numSectionsPerMode = collect(nblocks_per_mode(A))
    extent = block_extents(A)
    # cuTENSOR takes 0-based block coordinates; ours are stored 1-based.
    nonZeroCoordinates = collect(Base.Iterators.flatten(A.nonzero_block_coords)) .- Int32(1)
    st = strides(A)
    @assert all(st .== 1)

    dataType = eltype(A)

    ## Strides are assumed NULL (canonical layout) for now; it is unclear
    ## whether explicit strides work — needs discussion with the cuTENSOR team.
    return CuTensorBSDescriptor(numModes, numNonZeroBlocks,
                                numSectionsPerMode, extent, nonZeroCoordinates, dataType)
end

# (cuTENSOR.jl) module wiring for the block-sparse sources:
include("utils.jl")
include("types.jl")
include("operations.jl")

# Block sparse wrappers
include("blocksparse/types.jl")
include("blocksparse/operations.jl")

# high-level integrations
include("interfaces.jl")
include("blocksparse/interfaces.jl")

# (libcutensor.jl) low-level entry point: the block-pointer tables A/B/C/D are
# host arrays of device pointers, while the workspace is a single device pointer.
    @gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
                                                          plan::cutensorPlan_t,
                                                          alpha::Ptr{Cvoid},
                                                          A::Ptr{CuPtr{Cvoid}},
                                                          B::Ptr{CuPtr{Cvoid}},
                                                          beta::Ptr{Cvoid},
                                                          C::Ptr{CuPtr{Cvoid}},
                                                          D::Ptr{CuPtr{Cvoid}},
                                                          workspace::CuPtr{Cvoid},
                                                          workspaceSize::UInt64,
                                                          stream::cudaStream_t)::cutensorStatus_t
end

# (test/contractions.jl) element-type combinations exercised by the
# block-sparse testset below.
eltypes_compact = [
    (Float32, Float32, Float32, Float32),
    (ComplexF32, ComplexF32, ComplexF32, Float32),
    (Float64, Float64, Float64, Float64),
    (ComplexF64, ComplexF64, ComplexF64, Float64)
]
## cuTENSOR's block-sparse kernels have known runtime issues on the CUDA
## runtimes listed below (13.2, 13.1, 12.8, 12.6, 12.0); these are fixed in
## other releases, so the testset is skipped on the affected versions.
if !(CUDACore.runtime_version() ∈ [
    v"13.2", v"13.1",
    v"12.8", v"12.6", v"12.0"
  ])
  @testset "Blocksparse Contraction" begin
    ## Block-sparse contraction is new functionality with limited type support,
    ## so only the compact real/complex single- and double-precision
    ## combinations from `eltypes_compact` are exercised.
    @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
      ## Mode sections: i = [20,20,25], k = [10,10,15], l = [30,30,35]
      ## A = Tensor(k,i,l); non-zero blocks (k,i,l):
      ## (1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)
      ## Push order (k outermost, l innermost) matches the coordinate list.
      A = Vector{CuArray{eltyA, 3}}()
      for k in [10,15], i in [20,25], l in [30,35]
        push!(A, CuArray(ones(eltyA, k, i, l)))
      end

      ## B = Tensor(k,l); non-zero blocks (1,1) and (2,3)
      B = Vector{CuArray{eltyB, 2}}(
          [CuArray(randn(eltyB, 10, 30)),
           CuArray(randn(eltyB, 10, 35))])

      ## C = Tensor(i); non-zero blocks (1,) and (3,)
      C = Vector{CuArray{eltyC, 1}}(
          [CuArray(zeros(eltyC, 20)),
           CuArray(zeros(eltyC, 25))])

      cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3],
          [(10,10,15), (20,20,25), (30,30,35)],
          [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
          [1,3,2])
      cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
          [(10,10,15), (30,30,35)],
          [(1,1), (2,3)], [1,2])
      cuTenC = cuTENSOR.CuTensorBS(C, [3],
          [(20,20,25)], [(1,), (3,)], [3])

      mul!(cuTenC, cuTenA, cuTenB, 1, 0)
      ## Only B block (1,1) overlaps A's (k,l) sparsity pattern, so:
      ## C block (1,), stored as C[1], equals A(1,1,1) * B(1,1)
      @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
      ## C block (3,), stored as C[2], equals A(1,3,1) * B(1,1)
      @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
    end
  end
end

end

# (res/wrap/cutensor.toml) argument-type overrides for the generated wrapper.
# Args 4/5/7/8 (A, B, C, D) are host arrays of device block pointers; arg 9
# (workspace) is a single device pointer, matching `workspace::CuPtr{Cvoid}`
# in the hand-edited libcutensor.jl wrapper.
[api.cutensorBlockSparseContract.argtypes]
4 = "Ptr{CuPtr{Cvoid}}"
5 = "Ptr{CuPtr{Cvoid}}"
7 = "Ptr{CuPtr{Cvoid}}"
8 = "Ptr{CuPtr{Cvoid}}"
9 = "CuPtr{Cvoid}"