Skip to content

Commit 04ff942

Browse files
Fix and comment how-to for naive GPU assembly. For now this how-to is integrated as a full test. I will split it up later.
1 parent 545a317 commit 04ff942

3 files changed

Lines changed: 216 additions & 256 deletions

File tree

ext/FerriteCudaExt.jl

Lines changed: 42 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
# TODO's
2+
# * `update!` for ConstraintHandler on GPU.
3+
14
module FerriteCudaExt
25

36
using Ferrite, CUDA, SparseArrays
47

58
import Adapt: Adapt, adapt, adapt_structure
69

7-
import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, as_structure_of_arrays, get_substruct, materialize, meandiag
10+
import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, as_structure_of_arrays, get_substruct, meandiag
811

912
import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC, CUDA.CUSPARSE.CuSparseMatrixCSR, @allowscalar
1013
import KernelAbstractions as KA
@@ -74,11 +77,7 @@ function Ferrite.dof_range(sdh::GPUSubDofHandler, field_name::Symbol)
7477
end
7578

7679
function Ferrite.celldofs!(global_dofs::AbstractVector, sdh::GPUSubDofHandler, i::Integer)
77-
offset = sdh.cell_dofs_offset[i] - 1
78-
n = sdh.ndofs_per_cell
79-
@inbounds for j in 1:n
80-
global_dofs[j] = sdh.cell_dofs[offset + j]
81-
end
80+
copyto!(global_dofs, 1, sdh.cell_dofs, sdh.cell_dofs_offset[i], length(global_dofs))
8281
return global_dofs
8382
end
8483

@@ -118,34 +117,12 @@ Adapt.@adapt_structure Ferrite.FunctionValues
118117

119118
adapt(to, ip::Ferrite.Interpolation) = ip
120119

121-
get_number_of_device_threads(_) = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
122-
123-
# The design below outmaneuvers the standard Ferrite setup logic. Here start with the setup of a *defective* FEValues object.
124-
# On the device we do not access the this object directly, but instead we have an accessor function which queries the shared
125-
# memory variant of this object (e.g. CuStaticSharedMemory). The main reason reason is that we cannot just allocate device
126-
# shared memory on the host, but using global memory is not optimal either.
127-
struct DeviceResidentSharedMemoryDescriptor{Tv, N, Dims, M} <: AbstractArray{Tv, M}
128-
end
129-
130-
as_shared_array(d, N, a) = adapt(d, a)
131-
as_shared_array(d, N, a::Array{T, M}) where {T, M} = DeviceResidentSharedMemoryDescriptor{T, N, size(a), M}()
132-
# FIXME adjust the number of threads on kernel launch
133-
# as_shared_array(d::CUDA.KernelAdaptor, N, a::DeviceResidentSharedMemoryDescriptor) = DeviceResidentSharedMemoryDescriptor{eltype(T), N, size(a)}()
134-
135-
function materialize(::DeviceResidentSharedMemoryDescriptor{Tv, N, Dims}) where {Tv, N, Dims}
136-
# TODO KernelAbstractions v0.10
137-
# return KA.KernelIntrinsics.localmemory(Tv, (N, Dims...))
138-
return CuStaticSharedArray(Tv, N, Dims...)
139-
# return KA.SharedMemory(Tv, (N, Dims...), KA.gensym("static_shmem"))
140-
end
141-
142120
function adapt(d, cv::CellValues)
143-
N = get_number_of_device_threads(d)
144121
return CellValues(
145122
adapt(d, cv.fun_values),
146123
adapt(d, cv.geo_mapping),
147124
adapt(d, cv.qr),
148-
as_shared_array(d, N, cv.detJdV),
125+
adapt(d, cv.detJdV),
149126
)
150127
end
151128

@@ -164,35 +141,35 @@ function as_structure_of_arrays(d, N, cv::CellValues)
164141
end
165142

166143
function adapt(d, fv::Ferrite.FunctionValues)
167-
N = get_number_of_device_threads(d)
144+
= adapt(d, fv.)
168145
return Ferrite.FunctionValues(
169146
adapt(d, fv.ip),
170-
as_shared_array(d, N, fv.Nx),
171-
adapt(d, fv.Nξ),
172-
as_shared_array(d, N, fv.dNdx),
147+
fv.=== fv.Nx ?: adapt(fv.Nx), # Ensure proper aliasing
148+
,
149+
adapt(d, fv.dNdx),
173150
adapt(d, fv.dNdξ),
174151
fv.d2Ndx2 === nothing ? nothing : as_shared_array(d, N, fv.d2Ndx2),
175152
fv.d2Ndξ2 === nothing ? nothing : adapt(d, collect(fv.d2Ndξ2)),
176153
)
177154
end
178155

179156
function as_structure_of_arrays(d, N, fv::Ferrite.FunctionValues)
157+
= adapt(d, fv.Nξ)
180158
return Ferrite.FunctionValues(
181159
adapt(d, fv.ip),
182-
KA.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)),
183-
adapt(d, fv.Nξ),
184-
KA.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
160+
fv.=== fv.Nx ?: KA.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)), # Ensure proper aliasing,
161+
,
162+
fv.dNdx === nothing ? nothing : KA.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
185163
adapt(d, fv.dNdξ),
186164
fv.d2Ndx2 === nothing ? nothing : KA.zeros(d, eltype(fv.d2Ndx2), N, size(fv.d2Ndx2, 1), size(fv.d2Ndx2, 2)),
187165
fv.d2Ndξ2 === nothing ? nothing : adapt(d, fv.d2Ndξ2),
188166
)
189167
end
190168

191169
function adapt(d, fv::Ferrite.GeometryMapping)
192-
N = get_number_of_device_threads(d)
193170
return Ferrite.GeometryMapping(
194171
adapt(d, fv.ip),
195-
as_shared_array(d, N, fv.M),
172+
adapt(d, fv.M),
196173
fv.dMdξ === nothing ? nothing : adapt(d, fv.dMdξ),
197174
fv.d2Mdξ2 === nothing ? nothing : adapt(d, fv.d2Mdξ2),
198175
)
@@ -216,34 +193,6 @@ function adapt_structure(to, qr::QuadratureRule{shape}) where {shape}
216193
return QuadratureRule{shape}(adapt_structure(to, qr.weights), adapt_structure(to, qr.points))
217194
end
218195

219-
function materialize(cv::CellValues)
220-
return CellValues(
221-
materialize(cv.fun_values),
222-
materialize(cv.geo_mapping),
223-
materialize(cv.qr),
224-
materialize(cv.detJdV),
225-
)
226-
end
227-
function materialize(fv::Ferrite.FunctionValues)
228-
return Ferrite.FunctionValues(
229-
materialize(fv.ip),
230-
materialize(fv.Nx),
231-
materialize(fv.Nξ),
232-
materialize(fv.dNdx),
233-
materialize(fv.dNdξ),
234-
materialize(fv.d2Ndx2),
235-
materialize(fv.d2Ndξ2),
236-
)
237-
end
238-
function materialize(fv::Ferrite.GeometryMapping)
239-
return Ferrite.GeometryMapping(
240-
materialize(fv.ip),
241-
materialize(fv.M),
242-
materialize(fv.dMdξ),
243-
materialize(fv.d2Mdξ2),
244-
)
245-
end
246-
247196
# -------------------- iterator ----------------------
248197

249198
# NOTE CellCache is mutable and hence inherently incompatible with GPU. So here is the
@@ -294,40 +243,27 @@ function Ferrite.CellCache(d, dh::GPUDofHandler{dim}, flags::UpdateFlags = Updat
294243
end
295244

296245
function Ferrite.CellCache(d, sdh::GPUSubDofHandler{dim}, flags::UpdateFlags = UpdateFlags()) where {dim}
297-
outer_dim = get_number_of_device_threads(d)
298246
grid = get_grid(sdh)
299247
@allowscalar begin
300-
n = Ferrite.ndofs_per_cell(sdh)
301248
N = Ferrite.nnodes_per_cell(grid, first(sdh.cellset))
302-
nodes = as_shared_array(d, outer_dim, zeros(Int, N))
303-
coords = as_shared_array(d, outer_dim, zeros(Vec{dim, get_coordinate_eltype(grid)}, N))
304-
dofs = as_shared_array(d, outer_dim, zeros(Int, N))
249+
nodes = KA.zeros(d, Int, N)
250+
coords = KA.zeros(d, Vec{dim, get_coordinate_eltype(grid)}, N)
251+
252+
n = Ferrite.ndofs_per_cell(sdh)
253+
dofs = KA.zeros(d, Int, n)
305254
end
306255
return GPUCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
307256
end
308257

309258
function adapt(d, cc::GPUCellCache)
310-
outer_dim = get_number_of_device_threads(d)
311259
return GPUCellCache(
312260
cc.flags,
313261
adapt(d, cc.grid),
314262
-1,
315-
as_shared_array(d, outer_dim, cc.nodes),
316-
as_shared_array(d, outer_dim, cc.coords),
263+
adapt(d, cc.nodes),
264+
adapt(d, cc.coords),
317265
adapt(d, cc.sdh),
318-
as_shared_array(d, outer_dim, cc.dofs),
319-
)
320-
end
321-
322-
function materialize(cc::GPUCellCache)
323-
return GPUCellCache(
324-
materialize(cc.flags),
325-
materialize(cc.grid),
326-
materialize(cc.cellid),
327-
materialize(cc.nodes),
328-
materialize(cc.coords),
329-
materialize(cc.sdh),
330-
materialize(cc.dofs),
266+
adapt(d, cc.dofs),
331267
)
332268
end
333269

@@ -347,18 +283,33 @@ end
347283

348284
# -------------------- assembler ----------------------
349285

350-
struct GPUCSCAssembler{KType}
286+
struct GPUCSCAssembler{KType, FType}
351287
K::KType
288+
f::FType
352289
end
353290
Adapt.@adapt_structure GPUCSCAssembler
354291

355292
# FIXME buffer
356-
function Ferrite.start_assemble(K::CuSparseMatrixCSC; fillzero::Bool = true)
293+
function Ferrite.start_assemble(K::CuSparseMatrixCSC, f::Union{CuVector, Nothing} = nothing; fillzero::Bool = true)
357294
fillzero && fill!(nonzeros(K), zero(eltype(K)))
358-
return GPUCSCAssembler(K)
295+
f !== nothing && fillzero && fill!(f, zero(eltype(f)))
296+
return GPUCSCAssembler(K, f)
297+
end
298+
299+
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix, fe::AbstractVector)
300+
Ferrite.assemble!(A, dofs, Ke)
301+
Ferrite.assemble!(A, dofs, fe)
302+
return nothing
303+
end
304+
305+
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, fe::AbstractVector)
306+
for (i, dof) in enumerate(dofs)
307+
A.f[dof] += fe[i]
308+
end
309+
return nothing
359310
end
360311

361-
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix, fe::Nothing = nothing)
312+
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix)
362313
colptr = SparseArrays.getcolptr(A.K)
363314
rowval = rowvals(A.K)
364315
nzval = nonzeros(A.K)

src/soa_utils.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ function get_substruct(i, cv::CellValues)
1515
end
1616

1717
function get_substruct(i, fv::FunctionValues)
18+
Nx = fv.=== fv.Nx ? fv.Nx : view(fv.Nx, i, :, :)
19+
dNdx = fv.dNdx === nothing ? nothing : view(fv.dNdx, i, :, :)
1820
return FunctionValues(
19-
fv.ip, view(fv.Nx, i, :, :), fv.Nξ,
20-
view(fv.dNdx, i, :, :), fv.dNdξ, nothing, nothing
21+
fv.ip, Nx, fv.Nξ,
22+
dNdx, fv.dNdξ, nothing, nothing
2123
)
2224
end
2325

0 commit comments

Comments
 (0)