Skip to content

Commit a92356b

Browse files
Fix some more subtle Int issues and move most of the stuff into an extension.
1 parent 1311ef4 commit a92356b

9 files changed

Lines changed: 470 additions & 441 deletions

File tree

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
1616
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
1717

1818
[weakdeps]
19+
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
1920
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
21+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
2022
Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b"
23+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
2124
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
2225

2326
[extensions]
2427
FerriteBlockArrays = "BlockArrays"
28+
FerriteCudaExt = ["Adapt", "CUDA", "KernelAbstractions"]
2529
FerriteMetis = "Metis"
2630
FerriteSparseMatrixCSR = "SparseMatricesCSR"
2731

ext/FerriteCudaExt.jl

Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
module FerriteCudaExt
2+
3+
using Ferrite, CUDA, SparseArrays
4+
5+
import Adapt: Adapt, adapt, adapt_structure
6+
7+
import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, TaskDescriptor, get_worker_part
8+
9+
import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC, CUDA.CUSPARSE.CuSparseMatrixCSR, @allowscalar
10+
import KernelAbstractions: KernelAbstractions, get_backend
11+
12+
13+
# ----------------- grid --------------------
14+
15+
# Minimal grid representation that can live on the device: only the cell and
# node arrays, stored in (adaptable) array containers.
struct GPUGrid{dim, C <: Ferrite.AbstractCell, T <: Real,
        CA <: AbstractArray{C,1}, NA <: AbstractArray{Node{dim,T},1}} <: Ferrite.AbstractGrid{dim}
    cells::CA
    nodes::NA
end

# Field-wise adaption: adapt both arrays, rewrap in a GPUGrid.
function Adapt.adapt_structure(to, g::GPUGrid)
    cells = Adapt.adapt_structure(to, g.cells)
    nodes = Adapt.adapt_structure(to, g.nodes)
    return GPUGrid(cells, nodes)
end

# Upload a host grid by adapting its cell and node vectors to `backend`.
function GPUGrid(backend, source::AbstractGrid)
    return GPUGrid(adapt(backend, getcells(source)), adapt(backend, getnodes(source)))
end
28+
29+
30+
# ----------------- dofs --------------------
31+
32+
# Subdomain dof handler whose index vectors can be adapted to GPU arrays.
struct GPUSubDofHandler{CS <: AbstractVector{Int}, CD <: AbstractVector{Int},
        CO <: AbstractVector{Int}, FN, DR <: Tuple, G}
    cellset::CS           # global ids of the cells in this subdomain
    cell_dofs::CD         # concatenated dof lists of all cells
    cell_dofs_offset::CO  # 1-based start of each cell's dofs inside `cell_dofs`
    ndofs_per_cell::Int   # uniform dof count per cell in this subdomain
    # Vector{Symbol} on CPU, Nothing on GPU — Symbol is not a bitstype and cannot
    # be stored in GPU memory. Use the integer index overload of dof_range on GPU.
    field_names::FN
    dof_ranges::DR        # per-field local dof ranges, a tuple so it stays isbits
    grid::G               # the (GPU) grid this handler refers to
end
44+
45+
# Adapt every array field to the target; `field_names` is replaced by `nothing`
# because Symbols cannot be stored on the device (name lookup is host-only).
function Adapt.adapt_structure(to, sdh::GPUSubDofHandler)
    cellset = Adapt.adapt_structure(to, sdh.cellset)
    dofs = Adapt.adapt_structure(to, sdh.cell_dofs)
    offsets = Adapt.adapt_structure(to, sdh.cell_dofs_offset)
    grid = Adapt.adapt_structure(to, sdh.grid)
    return GPUSubDofHandler(cellset, dofs, offsets, sdh.ndofs_per_cell,
        nothing, sdh.dof_ranges, grid)
end
56+
57+
# Number of dofs on each cell of this subdomain.
Ferrite.ndofs_per_cell(sdh::GPUSubDofHandler) = sdh.ndofs_per_cell

"""
    dof_range(sdh::GPUSubDofHandler, field_idx::Int)

Return the local dof range of field number `field_idx`. Works on host and
device, since the ranges are stored as a plain tuple.
"""
function Ferrite.dof_range(sdh::GPUSubDofHandler, field_idx::Int)
    return sdh.dof_ranges[field_idx]
end

"""
    dof_range(sdh::GPUSubDofHandler, field_name::Symbol)

Host-only overload. On the device `field_names` is `nothing` (Symbol is not a
bitstype), and `findfirst(==(name), nothing)` would throw a confusing
`MethodError` — raise a clear error instead and point to the integer overload.
"""
function Ferrite.dof_range(sdh::GPUSubDofHandler, field_name::Symbol)
    names = sdh.field_names
    names === nothing && error("field names are not available on the device; use dof_range(sdh, field_idx::Int) instead")
    idx = findfirst(==(field_name), names)
    idx === nothing && error("Field $field_name not found in GPUSubDofHandler")
    return sdh.dof_ranges[idx]
end
67+
68+
# Copy the global dof ids of cell `i` into the preallocated `global_dofs`.
# Offsets are 1-based starts, hence the `- 1` base correction.
function Ferrite.celldofs!(global_dofs::AbstractVector, sdh::GPUSubDofHandler, i::Integer)
    base = sdh.cell_dofs_offset[i] - 1
    @inbounds for k in 1:sdh.ndofs_per_cell
        global_dofs[k] = sdh.cell_dofs[base + k]
    end
    return global_dofs
end
76+
77+
# CPU-only container — not sent to the GPU. Bundles the adapted grid with one
# GPUSubDofHandler per subdomain (each of which also references `grid`).
struct GPUDofHandler{dim, G <: GPUGrid{dim}, SDH <: GPUSubDofHandler} <: Ferrite.AbstractDofHandler
    subdofhandlers::Vector{SDH}
    grid::G
end
82+
83+
# Build a GPUDofHandler from a host DofHandler: upload the grid and the dof
# index vectors once, then share them across all sub dof handlers.
function GPUDofHandler(backend, dh::DofHandler)
    device_grid = GPUGrid(backend, dh.grid)
    dofs = adapt(backend, dh.cell_dofs)
    offsets = adapt(backend, dh.cell_dofs_offset)
    sdhs = map(dh.subdofhandlers) do sdh
        # Snapshot the per-field dof ranges while field names are still at hand.
        ranges = Tuple(Ferrite.dof_range(sdh, i) for i in eachindex(sdh.field_names))
        cellset = adapt(backend, collect(Int, sdh.cellset))
        return GPUSubDofHandler(cellset, dofs, offsets, sdh.ndofs_per_cell,
            copy(sdh.field_names), ranges, device_grid)
    end
    return GPUDofHandler(sdhs, device_grid)
end
94+
95+
# Entry point used by `adapt(descriptor, dh)`: build the device-side handler.
function Adapt.adapt_structure(descriptor::TaskDescriptor, handler::DofHandler)
    return GPUDofHandler(descriptor.device, handler)
end

# Grid accessor required by the AbstractDofHandler interface.
Ferrite.get_grid(dh::GPUDofHandler) = dh.grid
101+
102+
# ----------------- Adapt.jl parts --------------------
103+
104+
# NOTE(review): this import duplicates the one at the top of the module; it is
# harmless but could be removed.
import Adapt: Adapt, adapt, adapt_structure
# Generate field-by-field adapt_structure methods for these Ferrite types.
Adapt.@adapt_structure CellValues
Adapt.@adapt_structure Ferrite.GeometryMapping
Adapt.@adapt_structure Ferrite.FunctionValues

# Interpolations are returned unchanged (presumably they carry no device state).
# NOTE(review): this extends `Adapt.adapt` itself rather than `adapt_structure`
# for a foreign type — confirm this does not bypass Adapt's dispatch chain.
adapt(to, ip::Ferrite.Interpolation) = ip
110+
111+
# Build a device-side FunctionValues. Spatial (x-dependent) buffers get a fresh
# leading worker dimension `N` (per-worker scratch); reference-domain tables
# (ξ-dependent) are uploaded once and shared read-only across workers.
# NOTE(review): argument order must match the FunctionValues inner constructor.
function adapt(to::TaskDescriptor, fv::Ferrite.FunctionValues)
    N = to.num_workers
    d = to.device
    return Ferrite.FunctionValues(
        adapt(to, fv.ip),
        # Per-worker scratch, sized like the host array with a leading worker axis.
        KernelAbstractions.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)),
        # Reference-shape values are identical for all workers — upload once.
        adapt(d, collect(fv.Nξ)),
        KernelAbstractions.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
        adapt(d, collect(fv.dNdξ)),
        # Second derivatives are optional (`nothing` when not requested).
        fv.d2Ndx2 === nothing ? nothing : KernelAbstractions.zeros(d, eltype(fv.d2Ndx2), N, size(fv.d2Ndx2, 1), size(fv.d2Ndx2, 2)),
        fv.d2Ndξ2 === nothing ? nothing : adapt(d, collect(fv.d2Ndξ2)),
    )
end
124+
125+
# Build a device-side GeometryMapping: `M` gets per-worker scratch (leading
# dimension `N`), the reference-domain derivative tables are shared copies.
# NOTE(review): argument order must match the GeometryMapping inner constructor.
function adapt(to::TaskDescriptor, fv::Ferrite.GeometryMapping)
    N = to.num_workers
    d = to.device
    return Ferrite.GeometryMapping(
        adapt(to, fv.ip),
        KernelAbstractions.zeros(d, eltype(fv.M), N, size(fv.M, 1), size(fv.M, 2)),
        # Optional derivative tables: keep `nothing` when absent on the host.
        fv.dMdξ === nothing ? nothing : adapt(d, collect(fv.dMdξ)),
        fv.d2Mdξ2 === nothing ? nothing : adapt(d, collect(fv.d2Mdξ2)),
    )
end
135+
136+
# Build a device-side CellValues by adapting its parts; `detJdV` gets a fresh
# per-worker buffer (leading dimension `N`).
function adapt(to::TaskDescriptor, cv::CellValues)
    N = to.num_workers
    d = to.device
    return CellValues(
        adapt(to, cv.fun_values),
        adapt(to, cv.geo_mapping),
        adapt(to, cv.qr),
        KernelAbstractions.zeros(d, eltype(cv.detJdV), N, length(cv.detJdV)),
    )
end
146+
147+
# Device transfer of a quadrature rule: `collect` first to get plain Arrays,
# then adapt weights and points to the descriptor's device.
function adapt(to::TaskDescriptor, qr::QuadratureRule{shape}) where shape
    d = to.device
    return QuadratureRule{shape}(adapt(d, collect(qr.weights)), adapt(d, collect(qr.points)))
end

# Generic fallback: adapt the stored weight/point vectors as-is.
function adapt(to, qr::QuadratureRule{shape}) where shape
    return QuadratureRule{shape}(adapt(to, qr.weights), adapt(to, qr.points))
end

# Structural adaption (used by Adapt's recursion); mirrors the fallback above.
function adapt_structure(to, qr::QuadratureRule{shape}) where shape
    return QuadratureRule{shape}(adapt_structure(to, qr.weights), adapt_structure(to, qr.points))
end
159+
160+
161+
# -------------------- iterator ----------------------
162+
163+
# Per-task cell data (node ids, coordinates, dofs). Freshly constructed caches
# hold one row per worker; `get_worker_part` slices out a single worker's view.
struct GPUCellCache{G <: AbstractGrid, SDH, IVT, VX}
    flags::UpdateFlags  # update flags forwarded from construction
    grid::G
    cellid::Int         # currently bound cell; -1 when not bound yet
    nodes::IVT          # node ids of the bound cell
    coords::VX          # node coordinates of the bound cell
    sdh::SDH            # sub dof handler the cell belongs to
    dofs::IVT           # global dof ids of the bound cell
end
Adapt.@adapt_structure GPUCellCache
173+
174+
# Allocate the per-worker cell buffers on the descriptor's device. The cache is
# created unbound (cellid == -1).
function GPUCellCache(sdh::GPUSubDofHandler, grid::GPUGrid{dim}, descriptor::TaskDescriptor,
        flags::UpdateFlags = UpdateFlags()) where dim
    device = descriptor.device
    workers = descriptor.num_workers
    ndofs = sdh.ndofs_per_cell
    # Querying a cell of a device-resident grid requires scalar indexing.
    # NOTE(review): uses cell 1 — assumes all cells share the same node count;
    # confirm for mixed grids.
    nnodes = @allowscalar Ferrite.nnodes_per_cell(grid, 1)
    nodes = KernelAbstractions.zeros(device, Int, workers, nnodes)
    coords = KernelAbstractions.zeros(device, Vec{dim, get_coordinate_eltype(grid)}, workers, nnodes)
    dofs = KernelAbstractions.zeros(device, Int, workers, ndofs)
    return GPUCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
end
187+
188+
# Dof ids of the cache's bound cell (a per-worker view after `get_worker_part`).
Ferrite.celldofs(cc::GPUCellCache) = cc.dofs

# TODO I do not like this design.
# Route the common `CellCache` entry point to the GPU cache constructor.
function Ferrite.CellCache(descriptor::TaskDescriptor, sdh::GPUSubDofHandler, flags::UpdateFlags = UpdateFlags())
    return GPUCellCache(sdh, sdh.grid, descriptor, flags)
end
194+
195+
# Slice worker `i`'s row out of each per-worker buffer (non-allocating views)
# and bind the cache to `cellid`.
function get_worker_part(i, cc::GPUCellCache, cellid)
    nodes = view(cc.nodes, i, :)
    coords = view(cc.coords, i, :)
    dofs = view(cc.dofs, i, :)
    return GPUCellCache(cc.flags, cc.grid, cellid, nodes, coords, cc.sdh, dofs)
end
199+
200+
201+
# -------------------- assembler ----------------------
202+
203+
# Lightweight CSC assembler holding the raw storage arrays of the target
# matrix; created by `start_assemble(::CuSparseMatrixCSC)`.
struct GPUCSCAssembler{CP <: AbstractVector, RV <: AbstractVector, NZ <: AbstractVector}
    colptr::CP  # column start pointers (length ncols + 1)
    rowval::RV  # row index of every stored entry
    nzval::NZ   # value of every stored entry
end
Adapt.@adapt_structure GPUCSCAssembler
209+
210+
# Wrap `K`'s storage in a GPUCSCAssembler; optionally reset all stored values.
function Ferrite.start_assemble(K::CuSparseMatrixCSC; fillzero::Bool = true)
    if fillzero
        fill!(nonzeros(K), zero(eltype(K)))
    end
    return GPUCSCAssembler(SparseArrays.getcolptr(K), rowvals(K), nonzeros(K))
end
214+
215+
# Scatter-add the local matrix `Ke` into the global CSC storage at the rows and
# columns given by `dofs`. `fe` is accepted (and ignored) for signature
# compatibility with the CPU assemblers.
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix, fe::Nothing = nothing)
    n = length(dofs)
    for local_col in 1:n
        global_col = dofs[local_col]
        lo = A.colptr[global_col]
        hi = A.colptr[global_col + 1] - 1
        for local_row in 1:n
            v = Ke[local_row, local_col]
            iszero(v) && continue
            global_row = dofs[local_row]
            # Linear scan over this column's stored row indices.
            # NOTE(review): an entry missing from the sparsity pattern is
            # silently dropped here (the CPU assembler would error) — confirm
            # this is the intended GPU behavior.
            for p in lo:hi
                if A.rowval[p] == global_row
                    A.nzval[p] += v
                    break
                end
            end
        end
    end
    return nothing
end
236+
237+
# Create the sparsity pattern on the host, then upload to the device format.
function Ferrite.allocate_matrix(::Type{CuSparseMatrixCSC{Tv, Ti}}, dh::DofHandler) where {Tv, Ti}
    host = allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh)
    return CuSparseMatrixCSC(host)
end

function Ferrite.allocate_matrix(::Type{CuSparseMatrixCSR{Tv, Ti}}, dh::DofHandler) where {Tv, Ti}
    host = allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh)
    return CuSparseMatrixCSR(host)
end
244+
245+
# -------------------- constraints ----------------------
246+
247+
# Device-ready snapshot of a closed ConstraintHandler.
struct GPUConstraintHandler{Tv, Ti, PD <: AbstractVector{Ti}, IH <: AbstractVector{Tv}, IP <: AbstractVector{Bool}}
    prescribed_dofs::PD   # global dof ids that carry a constraint
    inhomogeneities::IH   # prescribed value for each entry of `prescribed_dofs`
    is_prescribed::IP     # dense mask over all dofs, true where prescribed
end
252+
253+
"""
    adapt(backend, ch::ConstraintHandler)

Build a `GPUConstraintHandler` on `backend` from a closed `ConstraintHandler`:
the prescribed dofs and inhomogeneities are uploaded, and a dense `Bool` mask
over all dofs is added so kernels can test membership in O(1).

Throws `ArgumentError` if `ch` has not been closed.
"""
function adapt(backend, ch::ConstraintHandler)
    # Input validation must survive optimized builds, so throw explicitly
    # instead of relying on `@assert` (which may be compiled out).
    Ferrite.isclosed(ch) || throw(ArgumentError("the ConstraintHandler must be closed before adapting it to the GPU"))
    n = Ferrite.ndofs(ch.dh)
    is_prescribed = zeros(Bool, n)
    is_prescribed[ch.prescribed_dofs] .= true
    return GPUConstraintHandler(
        adapt(backend, ch.prescribed_dofs),
        adapt(backend, ch.inhomogeneities),
        adapt(backend, is_prescribed),
    )
end
266+
267+
# One thread per column j: scan column j's stored entries for the diagonal and
# record its absolute value. A structurally missing diagonal leaves diag[j] == 0.
function meandiag_kernel!(diag, colptr, rowval, nzval, n)
    # CUDA.jl thread/block indices are 1-based, so j runs over 1, 2, …
    j = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j > n && return
    for k in colptr[j]:(colptr[j + 1] - 1)
        if rowval[k] == j
            diag[j] = abs(nzval[k])
            break
        end
    end
    return nothing
end
278+
279+
"""
    meandiag(K::CuSparseMatrixCSC)

Mean of the absolute values of the stored diagonal entries of `K`, computed on
the device. Structurally missing diagonal entries contribute zero.
"""
function meandiag(K::CuSparseMatrixCSC{T}) where T
    n = size(K, 1)
    # Guard the empty matrix: `min(256, 0)` would launch 0 threads and
    # `cld(n, threads)` would raise a DivideError.
    n == 0 && return zero(T)
    backend = get_backend(nonzeros(K))
    diag = KernelAbstractions.zeros(backend, T, n)
    threads = min(256, n)
    @cuda threads=threads blocks=cld(n, threads) meandiag_kernel!(
        diag, SparseArrays.getcolptr(K), rowvals(K), nonzeros(K), n)
    return sum(diag) / n
end
288+
289+
# Combined: f -= K[:, d] * inhom[d], then zero column d
# One thread per prescribed dof d. The rhs update touches arbitrary rows across
# threads, hence the atomic; the nzval writes stay inside column d's storage
# range, which is disjoint between columns in CSC.
# NOTE(review): assumes `prescribed_dofs` contains no duplicates — confirm.
function apply_inhom_zero_cols_kernel!(f, colptr, rowval, nzval, prescribed_dofs, inhomogeneities, n_prescribed, applyzero)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > n_prescribed && return
    d = prescribed_dofs[idx]
    v = inhomogeneities[idx]
    for k in colptr[d]:(colptr[d + 1] - 1)
        # Only move the inhomogeneity to the rhs when not applying zero BCs.
        if !applyzero && !iszero(v)
            CUDA.@atomic f[rowval[k]] -= v * nzval[k]
        end
        nzval[k] = zero(eltype(nzval))
    end
    return nothing
end
303+
304+
# Zero out prescribed rows
# One thread per stored entry; entries whose row index is prescribed are zeroed
# (including prescribed diagonals, which a later kernel restores).
function apply_zero_rows_kernel!(rowval, nzval, is_prescribed, nnz)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > nnz && return
    if is_prescribed[rowval[idx]]
        nzval[idx] = zero(eltype(nzval))
    end
    return nothing
end
313+
314+
# Set K[d,d] = m and f[d] = inhom[d] * m
# One thread per prescribed dof: restore the diagonal entry zeroed by the
# previous kernels and write the matching rhs value.
# NOTE(review): assumes `prescribed_dofs` contains no duplicates — confirm.
function apply_set_diag_kernel!(colptr, rowval, nzval, f, prescribed_dofs, inhomogeneities, m, n_prescribed, applyzero)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > n_prescribed && return
    d = prescribed_dofs[idx]
    for k in colptr[d]:(colptr[d + 1] - 1)
        if rowval[k] == d
            nzval[k] = m
            break
        end
    end
    # Scale by m so the constrained rows stay comparable to the rest of K.
    f[d] = (applyzero ? zero(eltype(f)) : inhomogeneities[idx]) * m
    return nothing
end
328+
329+
"""
    apply!(K::CuSparseMatrixCSC, f::CuVector, ch::GPUConstraintHandler, applyzero=false)

Condense the Dirichlet-type constraints in `ch` into the device matrix `K` and
rhs `f`: zero prescribed columns (moving inhomogeneities to the rhs unless
`applyzero`), zero prescribed rows, and set the prescribed diagonal to the mean
absolute diagonal with the matching rhs value.
"""
function Ferrite.apply!(K::CuSparseMatrixCSC{T}, f::CuVector{T}, ch::GPUConstraintHandler{T}, applyzero::Bool = false) where T
    n_prescribed = length(ch.prescribed_dofs)
    n_prescribed == 0 && return nothing

    colptr = SparseArrays.getcolptr(K)
    rv = rowvals(K)
    nz = nonzeros(K)
    # Renamed from `nnz`: the original shadowed SparseArrays.nnz.
    nstored = length(nz)
    m = meandiag(K)
    threads_p = min(256, n_prescribed)
    blocks_p = cld(n_prescribed, threads_p)

    # Step 1+2: f -= K[:,d]*v, then zero prescribed columns
    @cuda threads=threads_p blocks=blocks_p apply_inhom_zero_cols_kernel!(
        f, colptr, rv, nz, ch.prescribed_dofs, ch.inhomogeneities, n_prescribed, applyzero)

    # Step 3: zero prescribed rows. Skip the launch when the matrix stores no
    # entries: min(256, 0) == 0 threads and cld(0, 0) throws a DivideError.
    if nstored > 0
        threads_z = min(256, nstored)
        @cuda threads=threads_z blocks=cld(nstored, threads_z) apply_zero_rows_kernel!(
            rv, nz, ch.is_prescribed, nstored)
    end

    # Step 4: set diagonal and rhs
    @cuda threads=threads_p blocks=blocks_p apply_set_diag_kernel!(
        colptr, rv, nz, f, ch.prescribed_dofs, ch.inhomogeneities, T(m), n_prescribed, applyzero)

    return nothing
end
356+
357+
# Homogeneous variant: forward to `apply!` with `applyzero = true`.
function Ferrite.apply_zero!(K::CuSparseMatrixCSC{T}, f::CuVector{T}, ch::GPUConstraintHandler{T}) where T
    return Ferrite.apply!(K, f, ch, true)
end
360+
361+
end

0 commit comments

Comments
 (0)