try QR retraction on Stiefel

mateuszbaran · mateuszbaran · commit 966303dce218 · 2026-02-26T12:45:12.000+01:00
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
@@ -0,0 +1,36 @@
+name: Runic formatting
+on:
+  push:
+    branches:
+      - 'main'
+      - 'release-*'  # glob pattern: a bare 'release-' would only match a branch with exactly that name
+    tags:
+      - '*'
+  pull_request:
+jobs:
+  runic:
+    name: Runic
+    runs-on: ubuntu-latest
+    # Permissions needed for reviewdog/action-suggester to post comments
+    permissions:
+      contents: read
+      checks: write
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v6
+      # - uses: julia-actions/setup-julia@v2
+      #   with:
+      #     version: '1'
+      # - uses: julia-actions/cache@v2
+      - uses: fredrikekre/runic-action@v1
+        with:
+          version: '1'
+          format_files: true
+        # Fail on next step instead
+        continue-on-error: ${{ github.event_name == 'pull_request' }}
+      - uses: reviewdog/action-suggester@v1  # only runs for pull requests (see `if` below)
+        if: github.event_name == 'pull_request'
+        with:
+          tool_name: Runic
+          fail_level: warning
diff --git a/README.md b/README.md
@@ -1,3 +1,10 @@
 # ManifoldsGPU
 
 General GPU/CUDA support for the JuliaManifolds ecosystem.
+
+The package is in early stages of development, and the API is not yet stable.
+
+Notes:
+
+- `exp!` on `PowerManifold(Stiefel(32, 16), 2048)` is about 20x faster on CUDA.
+- QR decomposition doesn't seem to be particularly fast on the GPU; as of February 2026, forming the Q matrix can't even be batched.
diff --git a/benchmarks/main.jl b/benchmarks/main.jl
@@ -3,6 +3,7 @@ using Statistics
 
 using ManifoldsGPU
 using Manifolds
+using ManifoldsBase
 using CUDA
 
 function _time_median(f; samples::Int = 6)
@@ -16,7 +17,7 @@ function _time_median(f; samples::Int = 6)
     return median(timings), timings
 end
 
-function benchmark_stiefel_exp(; n::Int = 32, k::Int = 16, batch::Int = 2048, scale::Float32 = 0.2f0, samples::Int = 6, seed::Int = 1234)
+function _setup_stiefel_data(; n::Int, k::Int, batch::Int, scale::Float32, seed::Int)
     Random.seed!(seed)
 
     M = Stiefel(n, k)
@@ -28,33 +29,109 @@ function benchmark_stiefel_exp(; n::Int = 32, k::Int = 16, batch::Int = 2048, sc
     p_gpu = CuArray(p_cpu)
     X_gpu = CuArray(X_cpu)
 
-    exp(MP, p_cpu, X_cpu)
-    CUDA.@sync exp(MP, p_gpu, X_gpu)
+    return (; MP, p_cpu, X_cpu, p_gpu, X_gpu)
+end
 
-    cpu_ms, cpu_all = _time_median(; samples = samples) do
-        exp(MP, p_cpu, X_cpu)
-    end
+function _benchmark_cpu_gpu(cpu_f, gpu_f; samples::Int)  # time two zero-argument callables with `_time_median`, after one untimed call of each
+    cpu_f()  # untimed first call, kept out of the measurements below
+    gpu_f()  # untimed first call, kept out of the measurements below
 
-    gpu_ms, gpu_all = _time_median(; samples = samples) do
-        CUDA.@sync exp(MP, p_gpu, X_gpu)
-    end
+    cpu_ms, cpu_all = _time_median(cpu_f; samples = samples)  # (median, all timings) for the CPU callable
+    gpu_ms, gpu_all = _time_median(gpu_f; samples = samples)  # (median, all timings) for the GPU callable
 
+    return cpu_ms, cpu_all, gpu_ms, gpu_all  # CPU pair first, then GPU pair
+end
+
+function _print_results(; name::String, n::Int, k::Int, batch::Int, samples::Int, cpu_all, gpu_all, cpu_ms::Float64, gpu_ms::Float64, relerr, relerr_label::String, extra_lines::Vector{String} = String[])  # keyword-only: print one benchmark report to stdout
     speedup = cpu_ms / gpu_ms
-    relerr = begin
-        Y_cpu = exp(MP, p_cpu, X_cpu)
-        Y_gpu = Array(CUDA.@sync exp(MP, p_gpu, X_gpu))
-        norm(Y_cpu .- Y_gpu) / max(norm(Y_cpu), eps(Float32))
-    end
 
-    println("=== ManifoldsGPU benchmark: exp on PowerManifold(Stiefel($n, $k), $batch) ===")
+    println("=== ManifoldsGPU benchmark: $name on PowerManifold(Stiefel($n, $k), $batch) ===")  # `name` labels the benchmarked operation
     println("Element type: Float32")
+    for line in extra_lines  # optional caller-supplied lines, printed right after the element type
+        println(line)
+    end
     println("Samples: $samples")
     println("CPU times [ms]: ", round.(cpu_all; digits = 2))
     println("GPU times [ms]: ", round.(gpu_all; digits = 2))
     println("Median CPU [ms]: ", round(cpu_ms; digits = 2))
     println("Median GPU [ms]: ", round(gpu_ms; digits = 2))
     println("Speedup (CPU/GPU): ", round(speedup; digits = 2), "x")
-    return println("Relative error ||Ycpu - Ygpu||/||Ycpu||: ", relerr)
+    return println("Relative error $relerr_label: ", relerr)  # `relerr_label` describes the formula behind `relerr`; returns whatever println returns
+end
+
+"""
+    benchmark_stiefel_exp(; n, k, batch, scale, samples, seed)
+
+Time `exp` on the power manifold prepared by `_setup_stiefel_data`, once with CPU arrays
+and once with `CuArray`s, then print timings, speedup and the relative CPU/GPU difference.
+"""
+function benchmark_stiefel_exp(; n::Int = 32, k::Int = 16, batch::Int = 2048, scale::Float32 = 0.2f0, samples::Int = 6, seed::Int = 1234)
+    data = _setup_stiefel_data(; n, k, batch, scale, seed)
+
+    # Both variants evaluate the same call; the GPU one is wrapped in `CUDA.@sync`.
+    run_cpu() = exp(data.MP, data.p_cpu, data.X_cpu)
+    run_gpu() = CUDA.@sync exp(data.MP, data.p_gpu, data.X_gpu)
+
+    cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(run_cpu, run_gpu; samples)
+
+    # Accuracy comes from one extra evaluation of each variant after the timing runs.
+    host_result = run_cpu()
+    device_result = Array(run_gpu())
+    relerr = norm(host_result .- device_result) / max(norm(host_result), eps(Float32))
+
+    return _print_results(;
+        name = "exp",
+        n,
+        k,
+        batch,
+        samples,
+        cpu_all,
+        gpu_all,
+        cpu_ms,
+        gpu_ms,
+        relerr,
+        relerr_label = "||Ycpu - Ygpu||/||Ycpu||",
+    )
+end
+
+"""
+    benchmark_stiefel_retract_qr_fused(; n, k, batch, scale, t, samples, seed)
+
+Time the in-place `retract_fused!` with `QRRetraction()` and scalar `t`, once with CPU arrays
+and once with `CuArray`s, then print timings, speedup and the relative CPU/GPU difference.
+"""
+function benchmark_stiefel_retract_qr_fused(; n::Int = 32, k::Int = 16, batch::Int = 2048, scale::Float32 = 0.2f0, t::Float32 = 0.3f0, samples::Int = 6, seed::Int = 1234)
+    data = _setup_stiefel_data(; n, k, batch, scale, seed)
+    method = QRRetraction()
+
+    # Output buffers are allocated once and overwritten by every evaluation.
+    out_cpu = similar(data.p_cpu)
+    out_gpu = similar(data.p_gpu)
+
+    run_cpu() = ManifoldsBase.retract_fused!(data.MP, out_cpu, data.p_cpu, data.X_cpu, t, method)
+    run_gpu() = CUDA.@sync ManifoldsBase.retract_fused!(data.MP, out_gpu, data.p_gpu, data.X_gpu, t, method)
+
+    cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(run_cpu, run_gpu; samples)
+
+    # Refill both buffers once more after timing, then compare them on the host.
+    run_cpu()
+    run_gpu()
+    relerr = norm(out_cpu .- Array(out_gpu)) / max(norm(out_cpu), eps(Float32))
+
+    return _print_results(;
+        name = "retract_qr_fused",
+        n,
+        k,
+        batch,
+        samples,
+        cpu_all,
+        gpu_all,
+        cpu_ms,
+        gpu_ms,
+        relerr,
+        relerr_label = "||Qcpu - Qgpu||/||Qcpu||",
+        extra_lines = ["Retraction scalar t: $t"],
+    )
 end
 
 function _parse_arg(i::Int, default)
@@ -68,8 +145,10 @@ function main()
     samples = _parse_arg(4, 6)
 
     println("Running with n=$n, k=$k, batch=$batch, samples=$samples")
-
-    return benchmark_stiefel_exp(; n = n, k = k, batch = batch, samples = samples)
+    println()
+    benchmark_stiefel_exp(; n = n, k = k, batch = batch, samples = samples)
+    println()
+    return benchmark_stiefel_retract_qr_fused(; n = n, k = k, batch = batch, samples = samples)
 end
 
 main()
diff --git a/ext/ManifoldsGPUCUDAExt/Stiefel.jl b/ext/ManifoldsGPUCUDAExt/Stiefel.jl
@@ -30,3 +30,42 @@ function ManifoldsBase.exp!(
 
     return q
 end
+
+function ManifoldsBase.retract_qr_fused!(  # QR-based retraction for a power manifold of real Stiefel manifolds held in 3-d CuArrays; overwrites and returns q
+        M::PowerManifold{ℝ, <:Stiefel{ℝ}, <:Tuple, ArrayPowerRepresentation},
+        q::CuArray{T, 3},
+        p::CuArray{T, 3},
+        X::CuArray{T, 3},
+        t::Number,
+    ) where {T <: Real}
+    _, k = ManifoldsBase.get_parameter(M.manifold.size)  # second size parameter of the Stiefel factor, used as the column count below
+    batch = size(q, 3)  # one matrix per slice along the third axis
+
+    q .= p .+ t .* X  # matrix to be factorized, written into q in place
+
+    q_views = [@view q[:, :, i] for i in 1:batch]  # one view per slice, handed to the batched routine
+    tau, q_factors = CUDA.CUBLAS.geqrf_batched!(q_views)  # NOTE(review): presumably factorizes in place, so q_factors[i] aliases q[:, :, i]; the copyto! below relies on that — confirm
+
+    for i in 1:batch  # every slice takes a round trip through host memory for the LAPACK call
+        q_factor_cpu = Array(q_factors[i])
+        tau_cpu = Array(tau[i])
+        d = diag(@view(q_factor_cpu[1:k, 1:k]))  # diagonal of the leading k-by-k block of the factorization output
+        s = sign.(sign.(d .+ T(1 // 2)))  # per-column factor; NOTE(review): this equals sign(d + 1/2) — confirm it matches the CPU retraction's convention
+        LinearAlgebra.LAPACK.orgqr!(q_factor_cpu, tau_cpu)  # replaces the host copy by the explicit Q factor
+        q_factor_cpu .*= reshape(s, 1, k)  # scales column j by s[j]
+        copyto!(q_factors[i], q_factor_cpu)  # uploads the finished slice back to the device
+    end
+
+    return q
+end
+
+# `QRRetraction` entry point for batched real Stiefel data stored in 3-d `CuArray`s.
+# The retraction argument only selects this method; all work is forwarded to
+# `retract_qr_fused!`, whose return value is passed through unchanged.
+function ManifoldsBase.retract_fused!(
+        M::PowerManifold{ℝ, <:Stiefel{ℝ}, <:Tuple, ArrayPowerRepresentation},
+        q::CuArray{T, 3}, p::CuArray{T, 3}, X::CuArray{T, 3},
+        t::Number, ::QRRetraction,
+    ) where {T <: Real}
+    return ManifoldsBase.retract_qr_fused!(M, q, p, X, t)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -63,4 +63,50 @@ using CUDA
             @test isapprox(MP, p, Y_cu_h, Y; atol = 2.0f-5, rtol = 2.0f-5)
         end
     end
+
+    @testset "Stiefel retract_qr_fused batched" begin
+        # Float64: the CUDA QR retraction has to reproduce the CPU reference.
+        Random.seed!(44)
+        power = PowerManifold(Stiefel(8, 4), 64)
+        tstep = 0.3
+        method = QRRetraction()
+
+        # Base point first, tangent vector second: same draw order for the seeded RNG.
+        base = rand(power)
+        tangent = rand(power; vector_at = base)
+
+        expected = similar(base)  # host reference
+        ManifoldsBase.retract_fused!(power, expected, base, tangent, tstep, method)
+
+        base_dev = CuArray(base)
+        out_dev = similar(base_dev)
+        ManifoldsBase.retract_fused!(power, out_dev, base_dev, CuArray(tangent), tstep, method)
+        actual = Array(out_dev)  # device result copied back to the host
+        # NOTE(review): 4-argument isapprox is presumably the tangent-vector comparison at `base`; confirm the point form is not meant.
+        @test is_point(power, actual)
+        @test isapprox(power, base, actual, expected; atol = 2.0e-14, rtol = 2.0e-14)
+    end
+
+    @testset "Stiefel retract_qr_fused batched Float32" begin
+        # Float32: the CUDA QR retraction has to reproduce the CPU reference.
+        Random.seed!(45)
+        power = PowerManifold(Stiefel(8, 4), 64)
+        tstep = Float32(0.3)
+        method = QRRetraction()
+
+        # Base point first, then a tangent vector at the converted point; both narrowed to Float32.
+        base = Float32.(rand(power))
+        tangent = Float32.(rand(power; vector_at = base))
+
+        expected = similar(base)  # host reference
+        ManifoldsBase.retract_fused!(power, expected, base, tangent, tstep, method)
+
+        base_dev = CuArray(base)
+        out_dev = similar(base_dev)
+        ManifoldsBase.retract_fused!(power, out_dev, base_dev, CuArray(tangent), tstep, method)
+        actual = Array(out_dev)  # device result copied back to the host
+        # NOTE(review): 4-argument isapprox is presumably the tangent-vector comparison at `base`; confirm the point form is not meant.
+        @test is_point(power, actual)
+        @test isapprox(power, base, actual, expected; atol = 2.0f-5, rtol = 2.0f-5)
+    end
 end