|
59 | 59 | @kernel function ka_assembly_kernel(assembler, @Const(color), cc, cv, Kes, fes) |
60 | 60 | # This is the classical grid-stride-loop |
61 | 61 | task_index = @index(Global, Linear) |
62 | | - stride = KA.@groupsize()[1] |
| 62 | + stride = prod(KA.@ndrange()) |
63 | 63 | for i in task_index:stride:length(color) |
64 | 64 | # Work item index |
65 | 65 | cellid = color[i] |
@@ -87,10 +87,18 @@ function assemble_global_ka!(backend, cv::CellValuesContainer, K, f, cc, colors: |
87 | 87 | for color in colors |
88 | 88 | # We divide the work into blocks and fire up the kernel. |
89 | 89 | n = length(color) |
90 | | - threads = min(NUM_THREADS, n) |
91 | | - blocks = cld(length(color), threads) |
| 90 | + # Let's assign, arbitrarily, two element assembly tasks per GPU thread. |
| 91 | + tasks_per_thread = min(2, n) |
| 92 | + # To do so, let us first compute how many element groups we have to assemble. |
| 93 | + n_effective = cld(n, tasks_per_thread) |
| 94 | + # This potentially limits the number of usable threads, e.g. when a color just has a small |
| 95 | + # number of elements. |
| 96 | + threads = min(NUM_THREADS, n_effective) |
| 97 | + # Furthermore, for CPU computing we typically group the tasks into blocks of worker threads. |
| 98 | + blocks = cld(n, tasks_per_thread * threads) |
| 99 | + # Now, we can build and execute the Kernel. |
92 | 100 | ka_kernel = ka_assembly_kernel(backend, threads) |
93 | | - ka_kernel(assembler, color, cc, cv, Ke, fe, ndrange = length(color)) |
| 101 | + ka_kernel(assembler, color, cc, cv, Ke, fe, ndrange = threads * blocks) |
94 | 102 | # Since the kernel launches asynchronously we need to add a synchronization |
95 | 103 | # point before proceeding here. Otherwise we will start assembling the next color, |
96 | 104 | # while there are still threads working on the current color, therefore potentially |
@@ -119,8 +127,9 @@ function assemble_global_cuda!(cv::CellValuesContainer, K, f, cc, colors::Vector |
119 | 127 | assembler = K === nothing ? nothing : start_assemble(K, f) |
120 | 128 | for color in colors |
121 | 129 | n = length(color) |
| 130 | + tasks_per_thread = 2 |
122 | 131 | threads = min(NUM_THREADS, n) |
123 | | - blocks = cld(length(color), threads) |
| 132 | + blocks = cld(n, tasks_per_thread * threads) |
124 | 133 | @cuda threads = threads blocks = blocks cuda_assembly_kernel(assembler, color, cc, cv, Ke, fe) |
125 | 134 | CUDA.synchronize() |
126 | 135 | end |
@@ -184,7 +193,7 @@ Kes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbas |
184 | 193 | fes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv)) |
185 | 194 |
|
186 | 195 | # Now everything is set to launch the assembly via KernelAbstractions. |
187 | | -# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes) # FIXME launch differs from cuda variant. |
| 196 | +# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes) |
188 | 197 | # Or alternatively the cuda variant. |
189 | 198 | assemble_global_cuda!(cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes) |
190 | 199 |
|
@@ -213,21 +222,23 @@ u_cpu = K \ f |
213 | 222 | # the solutions are usually still very close. |
214 | 223 | @test u_cpu ≈ u_gpu |
215 | 224 |
|
216 | | -# Test KA CPU |
217 | | -begin |
218 | | - backend = KA.CPU() |
219 | | - colors_cpu = [adapt(backend, c) for c in colors] |
220 | | - n_workers = maximum(length.(colors_cpu)) |
221 | | - dh_cpu = adapt(backend, dh) |
222 | | - K_cpu = allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh) |
223 | | - f_cpu = KA.zeros(backend, Float32, ndofs(dh)) |
224 | | - cv_cpu = CellValuesContainer(backend, n_workers, cv) |
225 | | - cell_cache = CellCacheContainer(backend, n_workers, dh_cpu) |
226 | | - Kes_cpu = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv)) |
227 | | - fes_cpu = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv)) |
228 | | - # Assembly here does notw ork because we are missing a SOA transformation of the assembler. |
229 | | - assemble_global_ka!(backend, cv_cpu, nothing, nothing, cell_cache, colors_cpu, Kes_cpu, fes_cpu) |
230 | | - # @test K_cpu \ f_cpu ≈ u_cpu |
231 | | - @test Kes_cpu ≈ Array(Kes) broken = true |
232 | | - @test fes_cpu ≈ Array(fes) broken = true |
| 225 | +# Test KA |
| 226 | +@testset "KernelAbstractions paths for $backend" for backend in [KA.CPU(), CUDABackend()] |
| 227 | + colors_device = [adapt(backend, c) for c in colors] |
| 228 | + n_workers = maximum(length.(colors_device)) |
| 229 | + dh_device = adapt(backend, dh) |
| 230 | + K_device = if backend isa KA.CPU |
| 231 | + allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh) |
| 232 | + else |
| 233 | + allocate_matrix(CuSparseMatrixCSC{Float32, Int32}, dh) |
| 234 | + end |
| 235 | + f_device = KA.zeros(backend, Float32, ndofs(dh)) |
| 236 | + cv_device = CellValuesContainer(backend, n_workers, cv) |
| 237 | + cell_cache = CellCacheContainer(backend, n_workers, dh_device) |
| 238 | + Kes_device = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv)) |
| 239 | + fes_device = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv)) |
| 240 | + # Assembly here does not work because we are missing a SOA transformation of the assembler. |
| 241 | + assemble_global_ka!(backend, cv_device, nothing, nothing, cell_cache, colors_device, Kes_device, fes_device) |
| 242 | + @test Array(Kes_device) ≈ Array(Kes) |
| 243 | + @test Array(fes_device) ≈ Array(fes) |
233 | 244 | end |
0 commit comments