|
1 | | -using Ferrite, CUDA |
2 | | -import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC |
3 | | -import Adapt: adapt |
4 | | -import KernelAbstractions: @kernel, @index |
5 | | -import KernelAbstractions as KA |
6 | | -using SparseArrays |
7 | 1 |
|
8 | | -import Ferrite: CellValuesContainer, CellCacheContainer |
9 | | - |
# Kernel launch configuration: GPU threads per block, and how many element
# assembly tasks each thread processes via the grid-stride loop below.
const NUM_THREADS = 64
const NUM_TASKS_PER_THREAD = 2
12 | | - |
# In this how-to we want to use an existing assembly routine on the GPU with Ferrite.
# We implicitly assume that nothing dynamic happens inside the routine, i.e. the routine
# is type stable, does not allocate and also does not have any dynamic dispatches.
# Accumulates the local stiffness matrix `Ke` and load vector `fe` for one cell.
function assemble_element!(Ke::AbstractMatrix, fe::AbstractVector, cv::CellValues)
    nbf = getnbasefunctions(cv)
    for qp in 1:getnquadpoints(cv)
        dΩ = getdetJdV(cv, qp)
        for i in 1:nbf
            δuᵢ = shape_value(cv, qp, i)
            ∇δuᵢ = shape_gradient(cv, qp, i)
            fe[i] += δuᵢ * dΩ
            for j in 1:nbf
                Ke[i, j] += (∇δuᵢ ⋅ shape_gradient(cv, qp, j)) * dΩ
            end
        end
    end
    return nothing
end
33 | | - |
# We also have a simple cell assembly wrapping the element in two variants.
# In the first variant we assemble using an assembler. In the second variant we
# only fill Ke, e.g. as part of element-assembly techniques.
function assemble_cell!(Ke, fe, cell, cv, assembler)
    # Update the cell values to the current cell geometry, clear the local
    # buffers, assemble the element, then scatter into the global system.
    reinit!(cv, nothing, cell.coords)
    fill!(fe, 0)
    fill!(Ke, 0)
    assemble_element!(Ke, fe, cv)
    assemble!(assembler, celldofs(cell), Ke, fe)
    return nothing
end
# Variant without an assembler: only the local buffers Ke/fe are filled,
# e.g. for element-assembly style (matrix-free) usage.
function assemble_cell!(Ke, fe, cell, cv, ::Nothing)
    fill!(fe, 0)
    fill!(Ke, 0)
    reinit!(cv, nothing, cell.coords)
    assemble_element!(Ke, fe, cv)
    return nothing
end
52 | | - |
# Now to the actual assembly kernel. To ensure portability we show how to use KernelAbstractions.jl
# as the kernel language, although we will also show how to use CUDA directly below. In this kernel
# we use a grid-stride loop, which has several benefits in terms of performance and debuggability.
# For more details please consult https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/ .
@kernel function ka_assembly_kernel(assembler, @Const(color), cc, cv, Kes, fes)
    # This is the classical grid-stride-loop: each worker starts at its global
    # linear index and advances by the total number of launched work items.
    task_index = @index(Global, Linear)
    stride = prod(KA.@ndrange())
    for i in task_index:stride:length(color)
        # Work item index: the id of the cell assembled in this iteration.
        cellid = color[i]

        # Query the local evaluation buffer of the GPU worker.
        # As explained later this is the secret sauce.
        cv_i = cv[task_index]

        # Query work item cell cache.
        cc_i = cc[task_index](cellid) # FIXME is there a better way to sneak the cellid into the cache?

        # Fill buffer with data of the current cell.
        reinit!(cc_i, cellid)

        # Query the per-cell assembly buffers (views into the preallocated
        # global Kes/fes blocks, so no device-side allocation happens here).
        Ke = view(Kes, i, :, :)
        fe = view(fes, i, :)

        # Actual assembly routine; dispatch on `assembler` selects global
        # assembly vs. buffer-only filling (`nothing`).
        assemble_cell!(Ke, fe, cc_i, cv_i, assembler)
    end
end
# Drives the KernelAbstractions assembly: one kernel launch per color, with a
# synchronization point in between so distinct colors never race on the
# global system. If `K === nothing` only the per-element buffers are filled.
function assemble_global_ka!(backend, cv::CellValuesContainer, K, f, cc, colors::Vector, Ke, fe)
    assembler = K === nothing ? nothing : start_assemble(K, f)
    for current_color in colors
        ncells = length(current_color)
        # Assign, arbitrarily, (at most) NUM_TASKS_PER_THREAD element assembly
        # tasks per GPU thread.
        per_thread = min(NUM_TASKS_PER_THREAD, ncells)
        # Number of element groups that have to be assembled.
        ngroups = cld(ncells, per_thread)
        # A color holding only a few elements limits the usable thread count.
        nthreads = min(NUM_THREADS, ngroups)
        # For CPU computing the tasks are typically grouped into blocks of
        # worker threads.
        nblocks = cld(ncells, per_thread * nthreads)
        # Build and execute the kernel.
        ka_kernel = ka_assembly_kernel(backend, nthreads)
        ka_kernel(assembler, current_color, cc, cv, Ke, fe, ndrange = nthreads * nblocks)
        # The launch is asynchronous: synchronize before moving to the next
        # color, otherwise workers of different colors could race.
        KA.synchronize(backend)
    end
    return nothing
end
108 | | - |
# And here now the CUDA variant. Please see above for details, as the kernels are almost the same.
function cuda_assembly_kernel(assembler, color, cc, cv, Kes, fes)
    # Global linear thread index (1-based) and grid-wide stride; the explicit
    # Int32(1) keeps the index arithmetic in Int32.
    task_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    for i in task_index:stride:length(color)
        cellid = color[i]             # cell handled in this iteration
        cv_i = cv[task_index]         # worker-local evaluation buffer
        cc_i = cc[task_index](cellid) # worker-local cell cache
        reinit!(cc_i, cellid)         # fill the cache for the current cell
        Ke = view(Kes, i, :, :)       # per-cell stiffness buffer (no copy)
        fe = view(fes, i, :)          # per-cell load buffer (no copy)
        assemble_cell!(Ke, fe, cc_i, cv_i, assembler)
    end
    return nothing
end
# CUDA.jl counterpart of `assemble_global_ka!`: launches the raw CUDA kernel
# once per color and synchronizes in between to avoid races across colors.
function assemble_global_cuda!(cv::CellValuesContainer, K, f, cc, colors::Vector, Ke, fe)
    assembler = K === nothing ? nothing : start_assemble(K, f)
    for current_color in colors
        ncells = length(current_color)
        per_thread = min(NUM_TASKS_PER_THREAD, ncells)
        nthreads = min(NUM_THREADS, cld(ncells, per_thread))
        nblocks = cld(ncells, per_thread * nthreads)
        @cuda threads = nthreads blocks = nblocks cuda_assembly_kernel(assembler, current_color, cc, cv, Ke, fe)
        # Kernel launches are asynchronous; wait before starting the next color.
        CUDA.synchronize()
    end
    return nothing
end
136 | | - |
# Reference for internal testing #hide
# Sequential CPU assembly, reusing one Ke/fe buffer pair across all cells. #hide
function assemble_global!(cv::CellValues, K::SparseMatrixCSC, f, dh::DofHandler) #hide
    n_basefuncs = getnbasefunctions(cv) #hide
    Ke = zeros(Float32, n_basefuncs, n_basefuncs) #hide
    fe = zeros(Float32, n_basefuncs) #hide
    assembler = start_assemble(K, f) #hide
    for cell in CellIterator(dh) #hide
        assemble_cell!(Ke, fe, cell, cv, assembler) #hide
    end #hide
    return nothing #hide
end #hide
148 | | - |
# Now we first setup the problem almost as usual on the host (CPU).
# The only major difference here is that we instantiate everything
# using Float32 and Int32 whenever it makes sense to lower memory
# pressure on the GPU, and because Float32 is on most GPUs quite
# a bit faster than using Float64 -- outside of high-end server GPUs.
# Please note that GPU kernels have a launch overhead. Therefore our problem
# must be sufficiently large to see any benefits of utilizing the GPU.
# The small number of elements here is just for demonstration purposes.
num_elements = 20
# We generate a Float32 coordinate grid by passing in Float32 corner coordinates.
grid = generate_grid(Hexahedron, (num_elements, num_elements, num_elements), Vec{3}((-1.0f0, -1.0f0, -1.0f0)), Vec{3}((1.0f0, 1.0f0, 1.0f0)))
ip = Lagrange{RefHexahedron, 1}()
# Float32 quadrature weights/points and cell values to match the grid precision.
qr = QuadratureRule{RefHexahedron}(Float32, 2)
cv = CellValues(Float32, qr, ip)
dh = DofHandler(grid)
add!(dh, :u, ip)
close!(dh)

# If we assemble into a matrix, then we still need to color the grid as usual.
# See also the threading how-to. Note that we still leave some of the integers
# 64 bit to still enable the indexing of large problems.
colors = create_coloring(grid)

# Now to the GPU side. Here we use Adapt.jl to generate GPU counterparts of
# all relevant objects.
backend = CUDABackend()
colors_gpu = [adapt(backend, c) for c in colors]
dh_gpu = adapt(backend, dh)
# Sparse matrix and right-hand-side vector allocated directly on the device.
K_gpu = allocate_matrix(CuSparseMatrixCSC{Float32, Int32}, dh)
f_gpu = KA.zeros(backend, Float32, (ndofs(dh),))

# Furthermore, the individual GPU workers need local buffers.
# Ferrite comes with a little helper to transform common buffers
# into a suitable GPU format.
# n_workers = ceil(Int, length(grid.cells) / NUM_THREADS) # FIXME does not match the used 493
# NOTE(review): one worker buffer per cell is allocated here, which is an upper
# bound on the task indices used by the kernels — confirm whether the smaller
# thread-count-based sizing above is sufficient.
n_workers = getncells(grid)
cv_gpu = CellValuesContainer(backend, n_workers, cv)
cc_gpu = CellCacheContainer(backend, n_workers, dh_gpu)
# Technically we can also just get one Ke or fe per worker, but for demonstration
# purposes we allocate the full block here for element-assembly style matrix-free GPU
# usage.
Kes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv))
fes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv))

# Now everything is set to launch the assembly via KernelAbstractions.
# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)
# Or alternatively the cuda variant.
assemble_global_cuda!(cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)

# Finally, we can apply the Dirichlet constraints and solve our linear system.
ch = ConstraintHandler(Float32, Int32, dh)
∂Ω = union(
    getfacetset(grid, "left"), getfacetset(grid, "right"),
    getfacetset(grid, "top"), getfacetset(grid, "bottom")
)
add!(ch, Dirichlet(:u, ∂Ω, (x, t) -> 1.0))
close!(ch)

# Adapt the constraint handler to the device, apply the constraints, and solve.
# The solve converts back to host (CPU) sparse/dense types via the constructors.
ch_gpu = adapt(backend, ch)
apply!(K_gpu, f_gpu, ch_gpu)
u_gpu = SparseMatrixCSC(K_gpu) \ Vector(f_gpu)
| 2 | +include("../../docs/src/howto/gpu_heat_howto_literate.jl") |
210 | 3 |
|
211 | 4 | # ----------------------------- Tests -------------------------- |
212 | 5 |
|
|
0 commit comments