@@ -108,6 +108,92 @@ function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2}
108108 end
109109end
110110
111+ # three-argument dot: avoid materializing A*y
# Three-argument dot: compute `dot(x, A*y)` (i.e. `x' * A * y`) without
# materializing the intermediate vector `A*y`.
function LinearAlgebra.dot(x::AnyCuArray{T1}, A::AnyCuArray{T2}, y::AnyCuArray{T3}) where {T1,T2,T3}
    nx = length(x)
    ny = length(y)
    mA, nA = size(A)

    nx == mA || throw(DimensionMismatch("length of x, $nx, does not match first dimension of A, $mA"))
    ny == nA || throw(DimensionMismatch("length of y, $ny, does not match second dimension of A, $nA"))

    # custom kernel using simple linear indexing and atomic additions;
    # captures `mA`/`nA` from the enclosing scope.
    # COV_EXCL_START
    function kernel(x, A, y, res::AbstractArray{T}, shuffle) where {T}
        local_val = zero(T)

        # grid-stride loop over rows: each thread accumulates whole rows
        i = threadIdx().x + (blockIdx().x - 1i32)*blockDim().x
        while i <= mA
            row_val = zero(T)
            for j in 1:nA
                # XXX: this is slow, but the focus is on avoiding materializing A*y
                @inbounds row_val += A[i, j] * y[j]
            end
            # dot(x[i], ·) conjugates x[i] for complex eltypes
            @inbounds local_val += LinearAlgebra.dot(x[i], row_val)
            i += blockDim().x * gridDim().x
        end

        # block-level reduction, then one atomic update per block
        val = CUDA.reduce_block(+, local_val, zero(T), shuffle)
        if threadIdx().x == 1i32
            # NOTE: introduces nondeterminism (atomic float addition order varies)
            @inbounds CUDA.@atomic res[] += val
        end

        return
    end
    # COV_EXCL_STOP

    dev = device()
    let T = promote_type(T1, T2, T3)
        # only use the above kernel if we don't care about determinism
        # and if atomic operations are supported on these inputs
        atomic = if capability(dev) >= v"7.0"
            T <: Union{Int16, Int32, Int64, Float16, Float32, Float64}
        else
            T <: Union{Int32, Int64, Float32, Float64}
        end
        if CUDA.math_mode() == CUDA.PEDANTIC_MATH || !atomic
            # deterministic fallback: lazily map each A[i,j] to dot(x[i], A[i,j]*y[j])
            # and let `sum` perform a tree reduction — still no A*y temporary.
            bc = Base.Broadcast.broadcasted(A, Base.CartesianIndices(A)) do a, I
                i, j = Tuple(I)
                LinearAlgebra.dot(x[i], a * y[j])
            end
            return sum(bc)
        end

        # accumulator for the atomic reduction
        res = CUDA.zeros(T, 1)

        # be conservative about using shuffle instructions
        shuffle = T <: Union{Bool,
                             UInt8, UInt16, UInt32, UInt64, UInt128,
                             Int8, Int16, Int32, Int64, Int128,
                             Float16, Float32, Float64,
                             ComplexF16, ComplexF32, ComplexF64}

        # how many threads do we want?
        # reduce_block(shuffle=true) requires the block to consist of full warps.
        wanted_threads = shuffle ? nextwarp(dev, mA) : mA
        function compute_threads(max_threads)
            if wanted_threads > max_threads
                shuffle ? prevwarp(dev, max_threads) : max_threads
            else
                wanted_threads
            end
        end

        # how many threads can we launch?
        kernel_func = @cuda launch=false kernel(x, A, y, res, Val(shuffle))
        compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
        config = launch_configuration(kernel_func.fun; shmem=compute_shmem∘compute_threads)
        threads = compute_threads(config.threads)
        # FIX: partition rows over threads — launch enough blocks for one row per
        # thread, capped by the occupancy limit. The original divided the row count
        # by `config.blocks`, which corresponds to no meaningful work split.
        blocks = min(config.blocks, cld(mA, threads))
        shmem = compute_shmem(threads)
        kernel_func(x, A, y, res, Val(shuffle); threads, blocks, shmem)

        # single-element read-back of the reduced result
        CUDA.@allowscalar res[]
    end
end
196+
111197function LinearAlgebra.:(* )(transx:: Transpose{<:Any,<:StridedCuVector{T}} ,
112198 y:: StridedCuVector{T} ) where T<: Union{ComplexF16, CublasComplex}
113199 x = transx. parent
0 commit comments