Skip to content

Commit ac3a916

Browse files
Sneak GPU assembly into docs
1 parent 530bdd2 commit ac3a916

3 files changed

Lines changed: 21 additions & 208 deletions

File tree

docs/make.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ bibtex_plugin = CitationBibliography(
9898
"How-to guide overview" => "howto/index.md",
9999
"howto/postprocessing.md",
100100
"howto/threaded_assembly.md",
101+
"howto/gpu_assembly.md",
101102
],
102103
"gallery/index.md",
103104
# "Code gallery" => [

docs/src/howto/gpu_assembly.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# [Basic GPU Assembly](@id gpu_assembly_howto)
2+
3+
For some large problems it can be beneficial to use a GPU to assemble the residual and in some cases even the system.
4+
Before developing hand-written optimized assembly routines using Ferrite, users can try to port their existing assembly
5+
routine.
6+
In the following we show how users can assemble the heat problem from the first tutorial using CUDA either directly
7+
or via KernelAbstractions. This can also serve as a first step towards more elaborate assembly optimizations on the
8+
GPU.
9+
10+
```@eval
11+
# Include the example here, but modify the Literate output to suit being embedded
12+
using Literate, Markdown
13+
base_name = "gpu_heat_howto_literate"
14+
Literate.markdown(string(base_name, ".jl"); name = base_name, execute = false, credit = false, documenter=false)
15+
content = read(string(base_name, ".md"), String)
16+
rm(string(base_name, ".md"))
17+
rm(string(base_name, ".jl"))
18+
Markdown.parse(content)
19+
```

test/GPU/heat_assembly.jl

Lines changed: 1 addition & 208 deletions
Original file line numberDiff line numberDiff line change
@@ -1,212 +1,5 @@
1-
using Ferrite, CUDA
2-
import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC
3-
import Adapt: adapt
4-
import KernelAbstractions: @kernel, @index
5-
import KernelAbstractions as KA
6-
using SparseArrays
71

8-
import Ferrite: CellValuesContainer, CellCacheContainer
9-
10-
const NUM_THREADS = 64
11-
const NUM_TASKS_PER_THREAD = 2
12-
13-
# In this how-to we want to use an existing assembly routine on the GPU with Ferrite.
14-
# We implicitly assume that nothing dynamic happens inside the routine, i.e. the routine
15-
# is type stable, does not allocate and also does not have any dynamic dispatches.
16-
function assemble_element!(Ke::AbstractMatrix, fe::AbstractVector, cv::CellValues)
17-
n_basefuncs = getnbasefunctions(cv)
18-
19-
for q_point in 1:getnquadpoints(cv)
20-
= getdetJdV(cv, q_point)
21-
for i in 1:n_basefuncs
22-
∇δuᵢ = shape_gradient(cv, q_point, i)
23-
δuᵢ = shape_value(cv, q_point, i)
24-
fe[i] += δuᵢ *
25-
for j in 1:n_basefuncs
26-
∇δuⱼ = shape_gradient(cv, q_point, j)
27-
Ke[i, j] += (∇δuᵢ ∇δuⱼ) *
28-
end
29-
end
30-
end
31-
return nothing
32-
end
33-
34-
# We also have a simple cell assembly wrapping the element in two variants.
35-
# In the first variant we assemble using an assembler. In the second variant we
36-
# only fill Ke, e.g. as part of element-assembly techniques.
37-
function assemble_cell!(Ke, fe, cell, cv, assembler)
38-
reinit!(cv, nothing, cell.coords)
39-
fill!(Ke, 0)
40-
fill!(fe, 0)
41-
assemble_element!(Ke, fe, cv)
42-
assemble!(assembler, celldofs(cell), Ke, fe)
43-
return nothing
44-
end
45-
function assemble_cell!(Ke, fe, cell, cv, ::Nothing)
46-
reinit!(cv, nothing, cell.coords)
47-
fill!(Ke, 0)
48-
fill!(fe, 0)
49-
assemble_element!(Ke, fe, cv)
50-
return nothing
51-
end
52-
53-
# Now to the actual assembly kernel. To ensure portability we show how to use KernelAbstractions.jl
54-
# as the kernel language, although we will also show how to use CUDA directly below. In this kernel
55-
# we use a grid-stride loop, which has several benefits in terms of performance and debuggability.
56-
# For more details please consult https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/ .
57-
@kernel function ka_assembly_kernel(assembler, @Const(color), cc, cv, Kes, fes)
58-
# This is the classical grid-stride-loop
59-
task_index = @index(Global, Linear)
60-
stride = prod(KA.@ndrange())
61-
for i in task_index:stride:length(color)
62-
# Work item index
63-
cellid = color[i]
64-
65-
# Query the local evaluation buffer of the GPU worker.
66-
# As explained later this is the secret sauce.
67-
cv_i = cv[task_index]
68-
69-
# Query work item cell cache.
70-
cc_i = cc[task_index](cellid) # FIXME is there a better way to sneak the cellid into the cache?
71-
72-
# Fill buffer.
73-
reinit!(cc_i, cellid)
74-
75-
# Query assembly buffer.
76-
Ke = view(Kes, i, :, :)
77-
fe = view(fes, i, :)
78-
79-
# # Actual assembly routine.
80-
assemble_cell!(Ke, fe, cc_i, cv_i, assembler)
81-
end
82-
end
83-
function assemble_global_ka!(backend, cv::CellValuesContainer, K, f, cc, colors::Vector, Ke, fe)
84-
assembler = K === nothing ? nothing : start_assemble(K, f)
85-
for color in colors
86-
# We divide the work into blocks and fire up the kernel.
87-
n = length(color)
88-
# Let's assign, arbitrarily, two element assembly tasks per GPU thread.
89-
tasks_per_thread = min(NUM_TASKS_PER_THREAD, n)
90-
# To do so, let us first compute how many element groups we have to assemble.
91-
n_effective = cld(n, tasks_per_thread)
92-
# This potentially limits the number of usable threads, e.g. when a color just has a small
93-
# number of elements.
94-
threads = min(NUM_THREADS, n_effective)
95-
# Furthermore, for CPU computing we typically group the tasks into blocks of worker threads.
96-
blocks = cld(n, tasks_per_thread * threads)
97-
# Now, we can build and execute the Kernel.
98-
ka_kernel = ka_assembly_kernel(backend, threads)
99-
ka_kernel(assembler, color, cc, cv, Ke, fe, ndrange = threads * blocks)
100-
# Since the kernel launches asynchronously we need to add a synchronization
101-
# point before proceeding here. Otherwise we will start assembling the next color,
102-
# while there are still threads working on the current color, therefore potentially
103-
# causing race conditions.
104-
KA.synchronize(backend)
105-
end
106-
return nothing
107-
end
108-
109-
# And here now the CUDA variant. Please see above for details, as the kernels are almost the same.
110-
function cuda_assembly_kernel(assembler, color, cc, cv, Kes, fes)
111-
task_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
112-
stride = gridDim().x * blockDim().x
113-
for i in task_index:stride:length(color)
114-
cellid = color[i]
115-
cv_i = cv[task_index]
116-
cc_i = cc[task_index](cellid)
117-
reinit!(cc_i, cellid)
118-
Ke = view(Kes, i, :, :)
119-
fe = view(fes, i, :)
120-
assemble_cell!(Ke, fe, cc_i, cv_i, assembler)
121-
end
122-
return nothing
123-
end
124-
function assemble_global_cuda!(cv::CellValuesContainer, K, f, cc, colors::Vector, Ke, fe)
125-
assembler = K === nothing ? nothing : start_assemble(K, f)
126-
for color in colors
127-
n = length(color)
128-
tasks_per_thread = min(NUM_TASKS_PER_THREAD, n)
129-
threads = min(NUM_THREADS, cld(n, tasks_per_thread))
130-
blocks = cld(n, tasks_per_thread * threads)
131-
@cuda threads = threads blocks = blocks cuda_assembly_kernel(assembler, color, cc, cv, Ke, fe)
132-
CUDA.synchronize()
133-
end
134-
return nothing
135-
end
136-
137-
# Reference for internal testing #hide
138-
function assemble_global!(cv::CellValues, K::SparseMatrixCSC, f, dh::DofHandler) #hide
139-
n_basefuncs = getnbasefunctions(cv) #hide
140-
Ke = zeros(Float32, n_basefuncs, n_basefuncs) #hide
141-
fe = zeros(Float32, n_basefuncs) #hide
142-
assembler = start_assemble(K, f) #hide
143-
for cell in CellIterator(dh) #hide
144-
assemble_cell!(Ke, fe, cell, cv, assembler) #hide
145-
end #hide
146-
return nothing #hide
147-
end #hide
148-
149-
# Now we first setup the problem almost as usual on the host (CPU).
150-
# The only major difference here is that we instantiate everything
151-
# using Float32 and Int32 whenever it makes sense to lower memory
152-
# pressure on the GPU, and because Float32 is on most GPUs quite
153-
# a bit faster than using Float64 -- outside of high-end server GPUs.
154-
# Please note that GPU kernels have a launch overhead. Therefore out problem
155-
# must be sufficiently large to see any benefits of utilizing the GPU.
156-
# The small number of elements here is just for demonstration purposes.
157-
num_elements = 20
158-
# We generate a Float32 coordinate grid by passing in Float32 corner coordinates.
159-
grid = generate_grid(Hexahedron, (num_elements, num_elements, num_elements), Vec{3}((-1.0f0, -1.0f0, -1.0f0)), Vec{3}((1.0f0, 1.0f0, 1.0f0)))
160-
ip = Lagrange{RefHexahedron, 1}()
161-
qr = QuadratureRule{RefHexahedron}(Float32, 2)
162-
cv = CellValues(Float32, qr, ip)
163-
dh = DofHandler(grid)
164-
add!(dh, :u, ip)
165-
close!(dh)
166-
167-
# If we assemble into a matrix, then we still need to color the grid as usual.
168-
# See also the threading how-to. Note that we still leave some of the integers
169-
# 64 bit to still enable the indexing of large problems.
170-
colors = create_coloring(grid)
171-
172-
# Now to the GPU side. Here we use Adapt.jl to generate GPU counterparts of
173-
# all relevant objects.
174-
backend = CUDABackend()
175-
colors_gpu = [adapt(backend, c) for c in colors]
176-
dh_gpu = adapt(backend, dh)
177-
K_gpu = allocate_matrix(CuSparseMatrixCSC{Float32, Int32}, dh)
178-
f_gpu = KA.zeros(backend, Float32, (ndofs(dh),))
179-
180-
# Furthermore, the individual GPU workers need local buffers.
181-
# Ferrite comes with a little helper to transform common buffers
182-
# into a suitable GPU format.
183-
# n_workers = ceil(Int, length(grid.cells) / NUM_THREADS) # FIXME does not match the used 493
184-
n_workers = getncells(grid)
185-
cv_gpu = CellValuesContainer(backend, n_workers, cv)
186-
cc_gpu = CellCacheContainer(backend, n_workers, dh_gpu)
187-
# Technically we can also just get one Ke or fe per worker, but for demonstration
188-
# purposes we allocate the full block here for element-assembly style matrix-free GPU
189-
# usage.
190-
Kes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv), getnbasefunctions(cv))
191-
fes = KA.zeros(backend, Float32, getncells(grid), getnbasefunctions(cv))
192-
193-
# Now everything is set to launch the assembly via KernelAbstractions.
194-
# assemble_global_ka!(backend, cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)
195-
# Or alternatively the cuda variant.
196-
assemble_global_cuda!(cv_gpu, K_gpu, f_gpu, cc_gpu, colors_gpu, Kes, fes)
197-
198-
# Finally, we can apply the Dirichlet constraints and solve our linear system.
199-
ch = ConstraintHandler(Float32, Int32, dh)
200-
∂Ω = union(
201-
getfacetset(grid, "left"), getfacetset(grid, "right"),
202-
getfacetset(grid, "top"), getfacetset(grid, "bottom")
203-
)
204-
add!(ch, Dirichlet(:u, ∂Ω, (x, t) -> 1.0))
205-
close!(ch)
206-
207-
ch_gpu = adapt(backend, ch)
208-
apply!(K_gpu, f_gpu, ch_gpu)
209-
u_gpu = SparseMatrixCSC(K_gpu) \ Vector(f_gpu)
2+
include("../../docs/src/howto/gpu_heat_howto_literate.jl")
2103

2114
# ----------------------------- Tests --------------------------
2125

0 commit comments

Comments
 (0)