Skip to content

Commit 33d325a

Browse files
committed
Add comprehensive GPU solver test suite
17 tests verifying gradient_descent and conjugate_gradient_descent work transparently with CuArray-backed manifold points. No ManoptCUDAExt needed — the _produce_type fix from #577 handles GPU allocation natively. Tests cover: ConstantLength/ArmijoLinesearch stepsizes, Float32/Float64, matrix Euclidean, Sphere, recording with :Cost, CPU-vs-GPU equivalence. All tests use known closed-form solutions for verification.
1 parent 2bd767a commit 33d325a

4 files changed

Lines changed: 175 additions & 0 deletions

File tree

Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313
* A clarification on the use of AI in the [CONTRIBUTING.md](https://manoptjl.org/stable/contributing/) (#573)
1414
* `_produce_type` now accepts the point `p` as an optional third argument, which can be used to produce objects with specific point type for internal buffers. The addition has been utilized in `DirectionUpdateRule`s and `Stepsize`s to improve GPU and custom floating point type compatibility. (#577)
1515
* Added another package and paper using `Manopt.jl` to the about page (#576).
16+
* Added GPU/CUDA test suite (`test/test_cuda_ext.jl`) verifying that solvers (`gradient_descent`, `conjugate_gradient_descent`) work transparently with `CuArray`-backed manifold points. Tests cover `ConstantLength`, `ArmijoLinesearch` stepsizes, Float32/Float64, recording, Sphere, and CPU-vs-GPU equivalence. No solver extension is needed — the `_produce_type` fix from #577 handles GPU allocation natively.
1617

1718
### Fixed
1819

test/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
[deps]
22
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
3+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
34
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
45
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
56
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
@@ -22,6 +23,7 @@ Manopt = {path = ".."}
2223

2324
[compat]
2425
Aqua = "0.8"
26+
CUDA = "5"
2527
Dates = "1.10"
2628
ForwardDiff = "0.10, 1"
2729
JuMP = "1.15"

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ using Manifolds, ManifoldsBase, Manopt, Test
7272
include("solvers/test_trust_regions.jl")
7373
include("solvers/test_vectorbundle_newton.jl")
7474
end
75+
include("test_cuda_ext.jl")
7576
include("MOI_wrapper.jl")
7677
include("test_aqua.jl")
7778
include("test_deprecated.jl")

test/test_cuda_ext.jl

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
#
# GPU solver tests: verify that `gradient_descent` and
# `conjugate_gradient_descent` work transparently with `CuArray`-backed
# manifold points. Every problem has a known closed-form solution used
# for verification. If CUDA is not functional (no GPU), tests are skipped.
#
using Manopt, Manifolds, ManifoldsBase, Test
using LinearAlgebra
using Random: MersenneTwister

# CUDA availability must be detected at top level: `using` is not allowed
# inside a `@testset` (the macro lowers its body into a non-top-level scope),
# so attempting `using CUDA` within the testset would be an error even on
# machines where the tests are meant to be skipped.
cuda_loaded = false
try
    using CUDA
    # A top-level `try` block is a soft scope; `global` ensures the flag
    # above is updated instead of a shadowing local being created.
    global cuda_loaded = CUDA.functional()
catch
    global cuda_loaded = false
end

@testset "GPU solver tests" begin
    if cuda_loaded
        @testset "GD + ConstantLength on Euclidean" begin
            M = Euclidean(3)
            target_cpu = [1.0, 2.0, 3.0]
            target = CuArray(target_cpu)
            # f(p) = ‖p − target‖²/2 has the unique minimizer p = target.
            f(M, p) = sum((p .- target) .^ 2) / 2
            grad_f(M, p) = p .- target
            p0 = CuArray(zeros(3))

            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(200),
                stepsize=ConstantLength(0.1),
            )
            @test result isa CuArray{Float64}
            @test isapprox(Array(result), target_cpu; atol=1e-6)
        end

        @testset "GD + ArmijoLinesearch on Euclidean" begin
            M = Euclidean(3)
            target_cpu = [1.0, 2.0, 3.0]
            target = CuArray(target_cpu)
            f(M, p) = sum((p .- target) .^ 2) / 2
            grad_f(M, p) = p .- target
            p0 = CuArray(zeros(3))

            # Pass the Armijo linesearch explicitly so this test set keeps
            # covering it even if the solver's default stepsize changes.
            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(50),
                stepsize=ArmijoLinesearch(),
            )
            @test result isa CuArray{Float64}
            @test isapprox(Array(result), target_cpu; atol=1e-5)
        end

        @testset "GD + ConstantLength Float32" begin
            T = Float32
            M = Euclidean(3)
            target_cpu = T[1.0, 2.0, 3.0]
            target = CuArray(target_cpu)
            f(M, p) = sum((p .- target) .^ 2) / T(2)
            grad_f(M, p) = p .- target
            p0 = CUDA.zeros(T, 3)

            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(200),
                stepsize=ConstantLength(T(0.1)),
            )
            # The element type must be preserved, not promoted to Float64.
            @test result isa CuArray{Float32}
            @test isapprox(Array(result), target_cpu; atol=T(1e-3))
        end

        @testset "GD on matrix-valued Euclidean" begin
            M = Euclidean(3, 3)
            # Seeded RNG keeps the test deterministic across runs.
            target_cpu = randn(MersenneTwister(42), 3, 3)
            target = CuArray(target_cpu)
            f(M, p) = sum((p .- target) .^ 2) / 2
            grad_f(M, p) = p .- target
            p0 = CuArray(zeros(3, 3))

            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(200),
                stepsize=ConstantLength(0.1),
            )
            @test result isa CuArray{Float64,2}
            @test size(result) == (3, 3)
            @test isapprox(Array(result), target_cpu; atol=1e-6)
        end

        @testset "Conjugate GD on Euclidean" begin
            M = Euclidean(5)
            target_cpu = randn(MersenneTwister(43), 5)
            target = CuArray(target_cpu)
            f(M, p) = sum((p .- target) .^ 2) / 2
            grad_f(M, p) = p .- target
            p0 = CuArray(zeros(5))

            result = conjugate_gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(50),
                stepsize=ConstantLength(0.1),
            )
            @test result isa CuArray{Float64}
            @test isapprox(Array(result), target_cpu; atol=1e-3)
        end

        @testset "GD on Sphere" begin
            M = Sphere(2)
            a_cpu = [1.0, 2.0, 3.0]
            # The closest point on the unit sphere to `a` is a/‖a‖.
            known_solution = a_cpu / norm(a_cpu)
            a = CuArray(a_cpu)
            f(M, p) = sum((p .- a) .^ 2) / 2
            # Riemannian gradient: project the Euclidean gradient (p − a)
            # onto the tangent space at p.
            grad_f(M, p) = project(M, p, p .- a)

            s_cpu = [0.0, 0.0, 1.0]
            p0 = CuArray(s_cpu)

            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )
            @test result isa CuArray{Float64}
            # The iterate must remain on the sphere (unit norm) …
            @test isapprox(norm(Array(result)), 1.0; atol=1e-10)
            # … and converge to the known minimizer.
            @test isapprox(Array(result), known_solution; atol=1e-4)
        end

        @testset "GD + recording on Euclidean" begin
            M = Euclidean(3)
            target = CuArray([1.0, 2.0, 3.0])
            f(M, p) = sum((p .- target) .^ 2) / 2
            grad_f(M, p) = p .- target
            p0 = CuArray(zeros(3))

            result = gradient_descent(
                M, f, grad_f, p0;
                stopping_criterion=StopAfterIteration(20),
                stepsize=ConstantLength(0.1),
                record=[:Cost],
                return_state=true,
            )
            # One cost value is recorded per iteration.
            rec = get_record(result)
            @test length(rec) == 20
            p_final = get_solver_result(result)
            @test p_final isa CuArray{Float64}
        end

        @testset "CPU vs GPU equivalence" begin
            M = Euclidean(5)
            target_cpu = randn(MersenneTwister(44), 5)
            target_gpu = CuArray(target_cpu)

            f_cpu(M, p) = sum((p .- target_cpu) .^ 2) / 2
            grad_f_cpu(M, p) = p .- target_cpu
            f_gpu(M, p) = sum((p .- target_gpu) .^ 2) / 2
            grad_f_gpu(M, p) = p .- target_gpu

            p0_cpu = zeros(5)
            p0_gpu = CuArray(zeros(5))

            result_cpu = gradient_descent(
                M, f_cpu, grad_f_cpu, p0_cpu;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )
            result_gpu = gradient_descent(
                M, f_gpu, grad_f_gpu, p0_gpu;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )
            # The constant-step iteration is fully deterministic, so CPU and
            # GPU runs must agree up to floating-point rounding.
            @test isapprox(Array(result_gpu), result_cpu; atol=1e-10)
        end
    else
        @info "CUDA not functional, skipping GPU solver tests"
    end
end

0 commit comments

Comments
 (0)