Skip to content

Commit 7a07022

Browse files
committed
feat: GPU overrides for Euclidean PowerManifold, test/benchmark restructure
Add 11 GPU-native overrides for PowerManifold(Euclidean) on CuArray: exp!, log!, distance, inner, norm, parallel_transport_to!, project! (point), project! (vector), zero_vector!, mid_point!, vector_transport_to!. Each override replaces ManifoldsBase's default per-element loop (for i in get_iterator(M)) with fused GPU broadcasting, avoiding scalar indexing on CuArray. Also: - Split tests into test/jlarray/ and test/cuda/ subdirectories - Add JLArray Stiefel tests for CI without GPU hardware - Extract shared benchmark helpers into benchmarks/common.jl - Rename benchmarks/main.jl to benchmarks/Stiefel.jl - Add benchmarks/Euclidean.jl covering all 11 overrides - Add JLArrays and GPUArrays as test dependencies
1 parent 774f1df commit 7a07022

11 files changed

Lines changed: 1197 additions & 181 deletions

File tree

Project.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,19 @@ ManifoldsGPUCUDAExt = "CUDA"
1717

1818
[compat]
1919
CUDA = "5.9.6"
20+
GPUArrays = "11"
21+
JLArrays = "0.3"
2022
ManifoldDiff = "0.4.5"
2123
Manifolds = "0.11.12"
2224
ManifoldsBase = "2.3.1"
2325
julia = "1.10.10"
2426

2527
[extras]
2628
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
29+
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
30+
JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
2731
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2832
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2933

3034
[targets]
31-
test = ["Test", "CUDA", "Random"]
35+
test = ["Test", "CUDA", "GPUArrays", "JLArrays", "Random"]

benchmarks/Euclidean.jl

Lines changed: 371 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
include("common.jl")
2+
3+
using LinearAlgebra
4+
5+
function _setup_euclidean_data(; n::Int, k::Int, batch::Int, seed::Int)
6+
Random.seed!(seed)
7+
8+
M = Euclidean(n, k)
9+
MP = PowerManifold(M, batch)
10+
11+
p_cpu = Float32.(randn(n, k, batch))
12+
X_cpu = Float32.(randn(n, k, batch))
13+
q_cpu = Float32.(randn(n, k, batch))
14+
Y_cpu = Float32.(randn(n, k, batch))
15+
16+
p_gpu = CuArray(p_cpu)
17+
X_gpu = CuArray(X_cpu)
18+
q_gpu = CuArray(q_cpu)
19+
Y_gpu = CuArray(Y_cpu)
20+
21+
return (; MP, p_cpu, X_cpu, q_cpu, Y_cpu, p_gpu, X_gpu, q_gpu, Y_gpu)
22+
end
23+
24+
function benchmark_euclidean_exp(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
25+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
26+
MP = data.MP
27+
28+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
29+
() -> exp(MP, data.p_cpu, data.X_cpu),
30+
() -> CUDA.@sync exp(MP, data.p_gpu, data.X_gpu);
31+
samples = samples,
32+
)
33+
34+
Y_cpu = exp(MP, data.p_cpu, data.X_cpu)
35+
Y_gpu = Array(CUDA.@sync exp(MP, data.p_gpu, data.X_gpu))
36+
relerr = norm(Y_cpu .- Y_gpu) / max(norm(Y_cpu), eps(Float32))
37+
38+
return _print_results(;
39+
name = "exp",
40+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
41+
samples = samples,
42+
cpu_all = cpu_all,
43+
gpu_all = gpu_all,
44+
cpu_ms = cpu_ms,
45+
gpu_ms = gpu_ms,
46+
relerr = relerr,
47+
relerr_label = "||Ycpu - Ygpu||/||Ycpu||",
48+
)
49+
end
50+
51+
function benchmark_euclidean_log(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
52+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
53+
MP = data.MP
54+
55+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
56+
() -> log(MP, data.p_cpu, data.q_cpu),
57+
() -> CUDA.@sync log(MP, data.p_gpu, data.q_gpu);
58+
samples = samples,
59+
)
60+
61+
V_cpu = log(MP, data.p_cpu, data.q_cpu)
62+
V_gpu = Array(CUDA.@sync log(MP, data.p_gpu, data.q_gpu))
63+
relerr = norm(V_cpu .- V_gpu) / max(norm(V_cpu), eps(Float32))
64+
65+
return _print_results(;
66+
name = "log",
67+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
68+
samples = samples,
69+
cpu_all = cpu_all,
70+
gpu_all = gpu_all,
71+
cpu_ms = cpu_ms,
72+
gpu_ms = gpu_ms,
73+
relerr = relerr,
74+
relerr_label = "||Vcpu - Vgpu||/||Vcpu||",
75+
)
76+
end
77+
78+
function benchmark_euclidean_distance(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
79+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
80+
MP = data.MP
81+
82+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
83+
() -> distance(MP, data.p_cpu, data.q_cpu),
84+
() -> CUDA.@sync distance(MP, data.p_gpu, data.q_gpu);
85+
samples = samples,
86+
)
87+
88+
d_cpu = distance(MP, data.p_cpu, data.q_cpu)
89+
d_gpu = CUDA.@sync distance(MP, data.p_gpu, data.q_gpu)
90+
relerr = abs(d_cpu - d_gpu) / max(abs(d_cpu), eps(Float32))
91+
92+
return _print_results(;
93+
name = "distance",
94+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
95+
samples = samples,
96+
cpu_all = cpu_all,
97+
gpu_all = gpu_all,
98+
cpu_ms = cpu_ms,
99+
gpu_ms = gpu_ms,
100+
relerr = relerr,
101+
relerr_label = "|dcpu - dgpu|/|dcpu|",
102+
)
103+
end
104+
105+
function benchmark_euclidean_inner(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
106+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
107+
MP = data.MP
108+
109+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
110+
() -> inner(MP, data.p_cpu, data.X_cpu, data.Y_cpu),
111+
() -> CUDA.@sync inner(MP, data.p_gpu, data.X_gpu, data.Y_gpu);
112+
samples = samples,
113+
)
114+
115+
v_cpu = inner(MP, data.p_cpu, data.X_cpu, data.Y_cpu)
116+
v_gpu = CUDA.@sync inner(MP, data.p_gpu, data.X_gpu, data.Y_gpu)
117+
relerr = abs(v_cpu - v_gpu) / max(abs(v_cpu), eps(Float32))
118+
119+
return _print_results(;
120+
name = "inner",
121+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
122+
samples = samples,
123+
cpu_all = cpu_all,
124+
gpu_all = gpu_all,
125+
cpu_ms = cpu_ms,
126+
gpu_ms = gpu_ms,
127+
relerr = relerr,
128+
relerr_label = "|vcpu - vgpu|/|vcpu|",
129+
)
130+
end
131+
132+
function benchmark_euclidean_norm(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
133+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
134+
MP = data.MP
135+
136+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
137+
() -> norm(MP, data.p_cpu, data.X_cpu),
138+
() -> CUDA.@sync norm(MP, data.p_gpu, data.X_gpu);
139+
samples = samples,
140+
)
141+
142+
n_cpu = norm(MP, data.p_cpu, data.X_cpu)
143+
n_gpu = CUDA.@sync norm(MP, data.p_gpu, data.X_gpu)
144+
relerr = abs(n_cpu - n_gpu) / max(abs(n_cpu), eps(Float32))
145+
146+
return _print_results(;
147+
name = "norm",
148+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
149+
samples = samples,
150+
cpu_all = cpu_all,
151+
gpu_all = gpu_all,
152+
cpu_ms = cpu_ms,
153+
gpu_ms = gpu_ms,
154+
relerr = relerr,
155+
relerr_label = "|ncpu - ngpu|/|ncpu|",
156+
)
157+
end
158+
159+
function benchmark_euclidean_parallel_transport(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
160+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
161+
MP = data.MP
162+
163+
Z_cpu = similar(data.X_cpu)
164+
Z_gpu = similar(data.X_gpu)
165+
166+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
167+
() -> parallel_transport_to!(MP, Z_cpu, data.p_cpu, data.X_cpu, data.q_cpu),
168+
() -> CUDA.@sync parallel_transport_to!(MP, Z_gpu, data.p_gpu, data.X_gpu, data.q_gpu);
169+
samples = samples,
170+
)
171+
172+
parallel_transport_to!(MP, Z_cpu, data.p_cpu, data.X_cpu, data.q_cpu)
173+
CUDA.@sync parallel_transport_to!(MP, Z_gpu, data.p_gpu, data.X_gpu, data.q_gpu)
174+
Z_gpu_h = Array(Z_gpu)
175+
relerr = norm(Z_cpu .- Z_gpu_h) / max(norm(Z_cpu), eps(Float32))
176+
177+
return _print_results(;
178+
name = "parallel_transport_to!",
179+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
180+
samples = samples,
181+
cpu_all = cpu_all,
182+
gpu_all = gpu_all,
183+
cpu_ms = cpu_ms,
184+
gpu_ms = gpu_ms,
185+
relerr = relerr,
186+
relerr_label = "||Zcpu - Zgpu||/||Zcpu||",
187+
)
188+
end
189+
190+
function benchmark_euclidean_project_point(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
191+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
192+
MP = data.MP
193+
194+
q_cpu = similar(data.p_cpu)
195+
q_gpu = similar(data.p_gpu)
196+
197+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
198+
() -> project!(MP, q_cpu, data.p_cpu),
199+
() -> CUDA.@sync project!(MP, q_gpu, data.p_gpu);
200+
samples = samples,
201+
)
202+
203+
project!(MP, q_cpu, data.p_cpu)
204+
CUDA.@sync project!(MP, q_gpu, data.p_gpu)
205+
q_gpu_h = Array(q_gpu)
206+
relerr = norm(q_cpu .- q_gpu_h) / max(norm(q_cpu), eps(Float32))
207+
208+
return _print_results(;
209+
name = "project! (point)",
210+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
211+
samples = samples,
212+
cpu_all = cpu_all,
213+
gpu_all = gpu_all,
214+
cpu_ms = cpu_ms,
215+
gpu_ms = gpu_ms,
216+
relerr = relerr,
217+
relerr_label = "||Qcpu - Qgpu||/||Qcpu||",
218+
)
219+
end
220+
221+
function benchmark_euclidean_project_vector(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
222+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
223+
MP = data.MP
224+
225+
Y_cpu = similar(data.X_cpu)
226+
Y_gpu = similar(data.X_gpu)
227+
228+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
229+
() -> project!(MP, Y_cpu, data.p_cpu, data.X_cpu),
230+
() -> CUDA.@sync project!(MP, Y_gpu, data.p_gpu, data.X_gpu);
231+
samples = samples,
232+
)
233+
234+
project!(MP, Y_cpu, data.p_cpu, data.X_cpu)
235+
CUDA.@sync project!(MP, Y_gpu, data.p_gpu, data.X_gpu)
236+
Y_gpu_h = Array(Y_gpu)
237+
relerr = norm(Y_cpu .- Y_gpu_h) / max(norm(Y_cpu), eps(Float32))
238+
239+
return _print_results(;
240+
name = "project! (vector)",
241+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
242+
samples = samples,
243+
cpu_all = cpu_all,
244+
gpu_all = gpu_all,
245+
cpu_ms = cpu_ms,
246+
gpu_ms = gpu_ms,
247+
relerr = relerr,
248+
relerr_label = "||Ycpu - Ygpu||/||Ycpu||",
249+
)
250+
end
251+
252+
function benchmark_euclidean_zero_vector(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
253+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
254+
MP = data.MP
255+
256+
Z_cpu = similar(data.p_cpu)
257+
Z_gpu = similar(data.p_gpu)
258+
259+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
260+
() -> zero_vector!(MP, Z_cpu, data.p_cpu),
261+
() -> CUDA.@sync zero_vector!(MP, Z_gpu, data.p_gpu);
262+
samples = samples,
263+
)
264+
265+
zero_vector!(MP, Z_cpu, data.p_cpu)
266+
CUDA.@sync zero_vector!(MP, Z_gpu, data.p_gpu)
267+
relerr = norm(Array(Z_gpu))
268+
269+
return _print_results(;
270+
name = "zero_vector!",
271+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
272+
samples = samples,
273+
cpu_all = cpu_all,
274+
gpu_all = gpu_all,
275+
cpu_ms = cpu_ms,
276+
gpu_ms = gpu_ms,
277+
relerr = relerr,
278+
relerr_label = "||Zgpu|| (should be 0)",
279+
)
280+
end
281+
282+
function benchmark_euclidean_mid_point(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
283+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
284+
MP = data.MP
285+
286+
q_cpu = similar(data.p_cpu)
287+
q_gpu = similar(data.p_gpu)
288+
289+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
290+
() -> mid_point!(MP, q_cpu, data.p_cpu, data.q_cpu),
291+
() -> CUDA.@sync mid_point!(MP, q_gpu, data.p_gpu, data.q_gpu);
292+
samples = samples,
293+
)
294+
295+
mid_point!(MP, q_cpu, data.p_cpu, data.q_cpu)
296+
CUDA.@sync mid_point!(MP, q_gpu, data.p_gpu, data.q_gpu)
297+
q_gpu_h = Array(q_gpu)
298+
relerr = norm(q_cpu .- q_gpu_h) / max(norm(q_cpu), eps(Float32))
299+
300+
return _print_results(;
301+
name = "mid_point!",
302+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
303+
samples = samples,
304+
cpu_all = cpu_all,
305+
gpu_all = gpu_all,
306+
cpu_ms = cpu_ms,
307+
gpu_ms = gpu_ms,
308+
relerr = relerr,
309+
relerr_label = "||Qcpu - Qgpu||/||Qcpu||",
310+
)
311+
end
312+
313+
function benchmark_euclidean_vector_transport(; n::Int = 32, k::Int = 16, batch::Int = 2048, samples::Int = 6, seed::Int = 1234)
314+
data = _setup_euclidean_data(; n = n, k = k, batch = batch, seed = seed)
315+
MP = data.MP
316+
317+
Y_cpu = similar(data.X_cpu)
318+
Y_gpu = similar(data.X_gpu)
319+
320+
cpu_ms, cpu_all, gpu_ms, gpu_all = _benchmark_cpu_gpu(
321+
() -> vector_transport_to!(MP, Y_cpu, data.p_cpu, data.X_cpu, data.q_cpu, ParallelTransport()),
322+
() -> CUDA.@sync vector_transport_to!(MP, Y_gpu, data.p_gpu, data.X_gpu, data.q_gpu, ParallelTransport());
323+
samples = samples,
324+
)
325+
326+
vector_transport_to!(MP, Y_cpu, data.p_cpu, data.X_cpu, data.q_cpu, ParallelTransport())
327+
CUDA.@sync vector_transport_to!(MP, Y_gpu, data.p_gpu, data.X_gpu, data.q_gpu, ParallelTransport())
328+
Y_gpu_h = Array(Y_gpu)
329+
relerr = norm(Y_cpu .- Y_gpu_h) / max(norm(Y_cpu), eps(Float32))
330+
331+
return _print_results(;
332+
name = "vector_transport_to!",
333+
manifold_label = "PowerManifold(Euclidean($n, $k), $batch)",
334+
samples = samples,
335+
cpu_all = cpu_all,
336+
gpu_all = gpu_all,
337+
cpu_ms = cpu_ms,
338+
gpu_ms = gpu_ms,
339+
relerr = relerr,
340+
relerr_label = "||Ycpu - Ygpu||/||Ycpu||",
341+
)
342+
end
343+
344+
function main()
345+
n = _parse_arg(1, 32)
346+
k = _parse_arg(2, 16)
347+
batch = _parse_arg(3, 2048)
348+
samples = _parse_arg(4, 6)
349+
350+
println("Running Euclidean benchmarks with n=$n, k=$k, batch=$batch, samples=$samples")
351+
352+
for bench_fn in [
353+
benchmark_euclidean_exp,
354+
benchmark_euclidean_log,
355+
benchmark_euclidean_distance,
356+
benchmark_euclidean_inner,
357+
benchmark_euclidean_norm,
358+
benchmark_euclidean_parallel_transport,
359+
benchmark_euclidean_project_point,
360+
benchmark_euclidean_project_vector,
361+
benchmark_euclidean_zero_vector,
362+
benchmark_euclidean_mid_point,
363+
benchmark_euclidean_vector_transport,
364+
]
365+
println()
366+
bench_fn(; n = n, k = k, batch = batch, samples = samples)
367+
end
368+
return
369+
end
370+
371+
main()

0 commit comments

Comments
 (0)