Skip to content

Commit 2535789

Browse files
Fix KernelAbstractions example with Valentin's help
1 parent 560f8b5 commit 2535789

1 file changed

Lines changed: 34 additions & 23 deletions

File tree

test/GPU/heat_assembly.jl

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ end
5959
@kernel function ka_assembly_kernel(assembler, @Const(color), cc, cv, Kes, fes)
6060
# This is the classical grid-stride-loop
6161
task_index = @index(Global, Linear)
62-
stride = KA.@groupsize()[1]
62+
stride = prod(KA.@ndrange())
6363
for i in task_index:stride:length(color)
6464
# Work item index
6565
cellid = color[i]
@@ -87,10 +87,18 @@ function assemble_global_ka!(backend, cv::CellValuesContainer, K, f, cc, colors:
8787
for color in colors
8888
# We divide the work into blocks and fire up the kernel.
8989
n = length(color)
90-
threads = min(NUM_THREADS, n)
91-
blocks = cld(length(color), threads)
90+
# Let's assign, arbitrarily, two element assembly tasks per GPU thread.
91+
tasks_per_thread = min(2, n)
92+
# To do so, let us first compute how many element groups we have to assemble.
93+
n_effective = cld(n, tasks_per_thread)
94+
# This potentially limits the number of usable threads, e.g. when a color just has a small
95+
# number of elements.
96+
threads = min(NUM_THREADS, n_effective)
97+
# Furthermore, for CPU computing we typically group the tasks into blocks of worker threads.
98+
blocks = cld(n, tasks_per_thread * threads)
99+
# Now, we can build and execute the Kernel.
92100
ka_kernel = ka_assembly_kernel(backend, threads)
93-
ka_kernel(assembler, color, cc, cv, Ke, fe, ndrange = length(color))
101+
ka_kernel(assembler, color, cc, cv, Ke, fe, ndrange = threads * blocks)
94102
# Since the kernel launches asynchronously we need to add a synchronization
95103
# point before proceeding here. Otherwise we will start assembling the next color,
96104
# while there are still threads working on the current color, therefore potentially
@@ -119,8 +127,9 @@ function assemble_global_cuda!(cv::CellValuesContainer, K, f, cc, colors::Vector
119127
assembler = K === nothing ? nothing : start_assemble(K, f)
120128
for color in colors
121129
n = length(color)
130+
tasks_per_thread = 2
122131
threads = min(NUM_THREADS, n)
123-
blocks = cld(length(color), threads)
132+
blocks = cld(n, tasks_per_thread * threads)
124133
@cuda threads = threads blocks = blocks cuda_assembly_kernel(assembler, color, cc, cv, Ke, fe)
125134
CUDA.synchronize()
126135
end
@@ -184,7 +193,7 @@ Kes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbas
184193
fes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv))
185194

186195
# Now everything is set to launch the assembly via KernelAbstractions.
187-
# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes) # FIXME launch differs from cuda variant.
196+
# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)
188197
# Or alternatively the cuda variant.
189198
assemble_global_cuda!(cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)
190199

@@ -213,21 +222,23 @@ u_cpu = K \ f
213222
# the solutions are usually still very close.
214223
@test u_cpu ≈ u_gpu
215224

216-
# Test KA CPU
217-
begin
218-
backend = KA.CPU()
219-
colors_cpu = [adapt(backend, c) for c in colors]
220-
n_workers = maximum(length.(colors_cpu))
221-
dh_cpu = adapt(backend, dh)
222-
K_cpu = allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh)
223-
f_cpu = KA.zeros(backend, Float32, ndofs(dh))
224-
cv_cpu = CellValuesContainer(backend, n_workers, cv)
225-
cell_cache = CellCacheContainer(backend, n_workers, dh_cpu)
226-
Kes_cpu = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv))
227-
fes_cpu = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv))
228-
# Assembly here does notw ork because we are missing a SOA transformation of the assembler.
229-
assemble_global_ka!(backend, cv_cpu, nothing, nothing, cell_cache, colors_cpu, Kes_cpu, fes_cpu)
230-
# @test K_cpu \ f_cpu ≈ u_cpu
231-
@test Kes_cpu ≈ Array(Kes) broken = true
232-
@test fes_cpu ≈ Array(fes) broken = true
225+
# Test KA
226+
@testset "KernelAbstractions paths for $backend" for backend in [KA.CPU(), CUDABackend()]
227+
colors_device = [adapt(backend, c) for c in colors]
228+
n_workers = maximum(length.(colors_device))
229+
dh_device = adapt(backend, dh)
230+
K_device = if backend isa KA.CPU
231+
allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh)
232+
else
233+
allocate_matrix(CuSparseMatrixCSC{Float32, Int32}, dh)
234+
end
235+
f_device = KA.zeros(backend, Float32, ndofs(dh))
236+
cv_device = CellValuesContainer(backend, n_workers, cv)
237+
cell_cache = CellCacheContainer(backend, n_workers, dh_device)
238+
Kes_device = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv))
239+
fes_device = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv))
240+
# Assembly here does not work because we are missing a SOA transformation of the assembler.
241+
assemble_global_ka!(backend, cv_device, nothing, nothing, cell_cache, colors_device, Kes_device, fes_device)
242+
@test Array(Kes_device) ≈ Array(Kes)
243+
@test Array(fes_device) ≈ Array(fes)
233244
end

0 commit comments

Comments
 (0)