Skip to content

Commit fdc1085

Browse files
Split up extension into KA part and CUDA part. Also restore CPU error messages.
1 parent 4fe3556 commit fdc1085

9 files changed

Lines changed: 315 additions & 273 deletions

File tree

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
2626
[extensions]
2727
FerriteBlockArrays = "BlockArrays"
2828
FerriteCudaExt = ["Adapt", "CUDA", "KernelAbstractions"]
29+
FerriteKAExt = ["Adapt", "KernelAbstractions"]
2930
FerriteMetis = "Metis"
3031
FerriteSparseMatrixCSR = "SparseMatricesCSR"
3132

ext/FerriteCudaExt.jl

Lines changed: 16 additions & 270 deletions
Original file line numberDiff line numberDiff line change
@@ -12,278 +12,19 @@ import Ferrite: as_structure_of_arrays, get_substruct
1212
import Ferrite: meandiag, nnodes_per_cell
1313
import Ferrite: CellCacheContainer, CellValuesContainer, CellCache
1414

15-
import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC, CUDA.CUSPARSE.CuSparseMatrixCSR, @allowscalar
15+
import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC, CUDA.CUSPARSE.CuSparseMatrixCSR, @gputhrow, @device_override
1616
import KernelAbstractions as KA
1717
import KernelAbstractions: get_backend
1818

19+
# ---------------- custom dispatches for error paths --------------------
1920

20-
# ----------------- grid --------------------
21-
22-
struct DeviceGrid{
23-
dim, C <: Ferrite.AbstractCell, T <: Real,
24-
CA <: AbstractArray{C, 1}, NA <: AbstractArray{Node{dim, T}, 1},
25-
} <: Ferrite.AbstractGrid{dim}
26-
cells::CA
27-
nodes::NA
28-
end
29-
30-
function Adapt.adapt_structure(to, grid::DeviceGrid)
31-
return DeviceGrid(Adapt.adapt_structure(to, grid.cells), Adapt.adapt_structure(to, grid.nodes))
32-
end
33-
34-
function DeviceGrid(backend, grid::AbstractGrid)
35-
return DeviceGrid(adapt(backend, getcells(grid)), adapt(backend, getnodes(grid)))
36-
end
37-
38-
Ferrite.get_coordinate_eltype(::DeviceGrid{<:Any, <:Any, T}) where {T} = T
39-
40-
# ----------------- dofs --------------------
41-
42-
struct DeviceSubDofHandler{
43-
dim,
44-
CS <: AbstractVector{Int}, CD <: AbstractVector{Int},
45-
CO <: AbstractVector{Int}, FN, DR <: Tuple, G <: Ferrite.AbstractGrid{dim},
46-
} <: AbstractDofHandler
47-
cellset::CS
48-
cell_dofs::CD
49-
cell_dofs_offset::CO
50-
ndofs_per_cell::Int
51-
nnodes_per_cell::Int
52-
# Vector{Symbol} on host, Nothing on device — Symbol is not a bitstype and cannot
53-
# be stored in device memory. Use the integer index overload of dof_range on device.
54-
field_names::FN
55-
dof_ranges::DR
56-
grid::G
57-
end
58-
59-
Ferrite.get_grid(dh::DeviceSubDofHandler) = dh.grid
60-
61-
function Adapt.adapt_structure(to, sdh::DeviceSubDofHandler)
62-
return DeviceSubDofHandler(
63-
Adapt.adapt_structure(to, sdh.cellset),
64-
Adapt.adapt_structure(to, sdh.cell_dofs),
65-
Adapt.adapt_structure(to, sdh.cell_dofs_offset),
66-
sdh.ndofs_per_cell,
67-
sdh.nnodes_per_cell,
68-
nothing,
69-
sdh.dof_ranges,
70-
Adapt.adapt_structure(to, sdh.grid),
71-
)
21+
# GPUs cannot interpolate strings out of the box, so we use a reduced error message for now.
22+
@device_override @noinline Ferrite.throw_detJ_not_pos(detJ) = @gputhrow("ArgumentError", "det(J) is not positive. Please check the value on CPU.")
23+
@device_override @noinline function Ferrite.throw_incompatible_dof_length(length_ue, n_base_funcs)
24+
@gputhrow("ArgumentError", "the number of base functions does not match the length of the vector. Perhaps you passed the global vector, or forgot to pass a dof_range? Please check the values on CPU.")
7225
end
73-
74-
Ferrite.nnodes_per_cell(sdh::DeviceSubDofHandler) = sdh.nnodes_per_cell
75-
Ferrite.ndofs_per_cell(sdh::DeviceSubDofHandler) = sdh.ndofs_per_cell
76-
77-
function Ferrite.dof_range(sdh::DeviceSubDofHandler, field_idx::Int)
78-
return sdh.dof_ranges[field_idx]
79-
end
80-
function Ferrite.dof_range(sdh::DeviceSubDofHandler, field_name::Symbol)
81-
idx = findfirst(==(field_name), sdh.field_names)
82-
idx === nothing && error("Field $field_name not found in DeviceSubDofHandler")
83-
return sdh.dof_ranges[idx]
84-
end
85-
86-
function Ferrite.celldofs!(global_dofs::AbstractVector, sdh::DeviceSubDofHandler, i::Integer)
87-
copyto!(global_dofs, 1, sdh.cell_dofs, sdh.cell_dofs_offset[i], length(global_dofs))
88-
return global_dofs
89-
end
90-
91-
# Host-only container — not sent to the device!
92-
struct HostDofHandler{dim, G <: DeviceGrid{dim}, SDH <: DeviceSubDofHandler, DH <: AbstractDofHandler} <: AbstractDofHandler
93-
subdofhandlers::Vector{SDH}
94-
grid::G
95-
original_dh::DH
96-
end
97-
98-
function HostDofHandler(backend, dh::DofHandler)
99-
gpu_grid = DeviceGrid(backend, dh.grid)
100-
cell_dofs = adapt(backend, dh.cell_dofs)
101-
cell_dofs_offset = adapt(backend, dh.cell_dofs_offset)
102-
subdofhandlers = map(dh.subdofhandlers) do sdh
103-
dof_ranges = Tuple(Ferrite.dof_range(sdh, i) for i in 1:length(sdh.field_names))
104-
DeviceSubDofHandler(
105-
adapt(backend, collect(Int, sdh.cellset)), cell_dofs, cell_dofs_offset,
106-
sdh.ndofs_per_cell, nnodes_per_cell(dh.grid, first(sdh.cellset)), copy(sdh.field_names), dof_ranges, gpu_grid
107-
)
108-
end
109-
return HostDofHandler(subdofhandlers, gpu_grid, dh)
110-
end
111-
112-
function Adapt.adapt_structure(to::KA.Backend, dh::DofHandler)
113-
return HostDofHandler(to, dh)
114-
end
115-
116-
Ferrite.get_grid(dh::HostDofHandler) = dh.grid
117-
118-
119-
# ----------------- Adapt.jl parts --------------------
120-
121-
import Adapt: Adapt, adapt, adapt_structure
122-
Adapt.@adapt_structure CellValues
123-
Adapt.@adapt_structure Ferrite.GeometryMapping
124-
Adapt.@adapt_structure Ferrite.FunctionValues
125-
function adapt_structure(to, ccc::Ferrite.CellValuesContainer)
126-
inner_values = adapt(to, ccc.values)
127-
return Ferrite.CellValuesContainer{typeof(get_substruct(1, inner_values)), typeof(inner_values)}(inner_values)
128-
end
129-
130-
adapt(to, ip::Ferrite.Interpolation) = ip
131-
132-
function adapt(d, cv::CellValues)
133-
return CellValues(
134-
adapt(d, cv.fun_values),
135-
adapt(d, cv.geo_mapping),
136-
adapt(d, cv.qr),
137-
adapt(d, cv.detJdV),
138-
)
139-
end
140-
141-
function as_structure_of_arrays(d, outer_dim, ::Type{CellValues}, args...; kwargs...)
142-
cv = CellValues(args...; kwargs...)
143-
return as_structure_of_arrays(d, outer_dim, cv)
144-
end
145-
146-
function as_structure_of_arrays(d, N, cv::CellValues)
147-
return CellValues(
148-
as_structure_of_arrays(d, N, cv.fun_values),
149-
as_structure_of_arrays(d, N, cv.geo_mapping),
150-
adapt(d, cv.qr),
151-
KA.zeros(d, eltype(cv.detJdV), N, length(cv.detJdV)),
152-
)
153-
end
154-
155-
function adapt(d, fv::Ferrite.FunctionValues)
156-
= adapt(d, fv.Nξ)
157-
return Ferrite.FunctionValues(
158-
adapt(d, fv.ip),
159-
fv.=== fv.Nx ?: adapt(fv.Nx), # Ensure proper aliasing
160-
Nξ,
161-
adapt(d, fv.dNdx),
162-
adapt(d, fv.dNdξ),
163-
fv.d2Ndx2 === nothing ? nothing : as_shared_array(d, N, fv.d2Ndx2),
164-
fv.d2Ndξ2 === nothing ? nothing : adapt(d, collect(fv.d2Ndξ2)),
165-
)
166-
end
167-
168-
function as_structure_of_arrays(d, N, fv::Ferrite.FunctionValues)
169-
= adapt(d, fv.Nξ)
170-
return Ferrite.FunctionValues(
171-
adapt(d, fv.ip),
172-
fv.=== fv.Nx ?: KA.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)), # Ensure proper aliasing,
173-
Nξ,
174-
fv.dNdx === nothing ? nothing : KA.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
175-
adapt(d, fv.dNdξ),
176-
fv.d2Ndx2 === nothing ? nothing : KA.zeros(d, eltype(fv.d2Ndx2), N, size(fv.d2Ndx2, 1), size(fv.d2Ndx2, 2)),
177-
fv.d2Ndξ2 === nothing ? nothing : adapt(d, fv.d2Ndξ2),
178-
)
179-
end
180-
181-
function adapt(d, fv::Ferrite.GeometryMapping)
182-
return Ferrite.GeometryMapping(
183-
adapt(d, fv.ip),
184-
adapt(d, fv.M),
185-
fv.dMdξ === nothing ? nothing : adapt(d, fv.dMdξ),
186-
fv.d2Mdξ2 === nothing ? nothing : adapt(d, fv.d2Mdξ2),
187-
)
188-
end
189-
190-
function as_structure_of_arrays(d, N, fv::Ferrite.GeometryMapping)
191-
return Ferrite.GeometryMapping(
192-
adapt(d, fv.ip),
193-
KA.zeros(d, eltype(fv.M), N, size(fv.M, 1), size(fv.M, 2)),
194-
fv.dMdξ === nothing ? nothing : adapt(d, fv.dMdξ),
195-
fv.d2Mdξ2 === nothing ? nothing : adapt(d, fv.d2Mdξ2),
196-
)
197-
end
198-
199-
function adapt(to, qr::QuadratureRule{shape}) where {shape}
200-
return QuadratureRule{shape}(adapt(to, qr.weights), adapt(to, qr.points))
201-
end
202-
203-
# Adapt.@adapt_structure QuadratureRule does not work here due to the type parameter ctor.
204-
function adapt_structure(to, qr::QuadratureRule{shape}) where {shape}
205-
return QuadratureRule{shape}(adapt_structure(to, qr.weights), adapt_structure(to, qr.points))
206-
end
207-
208-
# -------------------- iterator ----------------------
209-
210-
# NOTE CellCache is mutable and hence inherently incompatible with GPU. So here is the
211-
# immutable variant. Making the CellCache immutable is considered breaking due to the reinit! API integration.
212-
struct ImmutableCellCache{G <: AbstractGrid, SDH, IVT, VX}
213-
flags::UpdateFlags
214-
grid::G
215-
cellid::Int
216-
nodes::IVT
217-
coords::VX
218-
sdh::SDH
219-
dofs::IVT
220-
end
221-
(cc::ImmutableCellCache)(cellid::Int) = ImmutableCellCache(cc.flags, cc.grid, cellid, cc.nodes, cc.coords, cc.sdh, cc.dofs)
222-
Adapt.@adapt_structure ImmutableCellCache
223-
function adapt_structure(to, ccc::CellCacheContainer)
224-
inner_values = adapt(to, ccc.values)
225-
return CellCacheContainer{typeof(get_substruct(1, inner_values, -1)), typeof(inner_values)}(inner_values)
226-
end
227-
228-
Ferrite.celldofs(cc::ImmutableCellCache) = cc.dofs
229-
230-
function as_structure_of_arrays(backend, outer_dim, ::Type{CellCache}, dh::HostDofHandler, flags::UpdateFlags = UpdateFlags())
231-
@assert length(dh.subdofhandlers) == 1 "ImmutableCellCache only works on HostDofHandler's with a single subdomain. Please call the ImmutableCellCache adaptation on the DeviceSubDofHandler."
232-
return as_structure_of_arrays(backend, outer_dim, CellCache, first(dh.subdofhandlers), flags)
233-
end
234-
235-
function as_structure_of_arrays(backend, outer_dim, ::Type{CellCache}, sdh::DeviceSubDofHandler{dim}, flags::UpdateFlags = UpdateFlags()) where {dim}
236-
grid = get_grid(sdh)
237-
begin
238-
n = Ferrite.ndofs_per_cell(sdh)
239-
N = Ferrite.nnodes_per_cell(sdh)
240-
nodes = KA.zeros(backend, Int, outer_dim, N)
241-
coords = KA.zeros(backend, Vec{dim, get_coordinate_eltype(grid)}, outer_dim, N)
242-
dofs = KA.zeros(backend, Int, outer_dim, n)
243-
end
244-
return ImmutableCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
245-
end
246-
247-
function Ferrite.CellCache(backend, dh::HostDofHandler{dim}, flags::UpdateFlags = UpdateFlags()) where {dim}
248-
@assert length(dh.subdofhandlers) == 1 "ImmutableCellCache only works on HostDofHandler's with a single subdomain. Please call the ImmutableCellCache adaptation on the DeviceSubDofHandler."
249-
return CellCache(backend, first(dh.subdofhandlers), flags)
250-
end
251-
252-
function Ferrite.CellCache(backend, sdh::DeviceSubDofHandler{dim}, flags::UpdateFlags = UpdateFlags()) where {dim}
253-
grid = get_grid(sdh)
254-
N = Ferrite.nnodes_per_cell(grid, first(sdh.cellset))
255-
nodes = KA.zeros(backend, Int, N)
256-
coords = KA.zeros(backend, Vec{dim, get_coordinate_eltype(grid)}, N)
257-
258-
n = Ferrite.ndofs_per_cell(sdh)
259-
dofs = KA.zeros(backend, Int, n)
260-
return ImmutableCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
261-
end
262-
263-
function adapt(backend, cc::ImmutableCellCache)
264-
return ImmutableCellCache(
265-
cc.flags,
266-
adapt(backend, cc.grid),
267-
-1,
268-
adapt(backend, cc.nodes),
269-
adapt(backend, cc.coords),
270-
adapt(backend, cc.sdh),
271-
adapt(backend, cc.dofs),
272-
)
273-
end
274-
275-
function get_substruct(i, cc::ImmutableCellCache, cellid)
276-
return ImmutableCellCache(
277-
cc.flags, cc.grid, cellid,
278-
view(cc.nodes, i, :), view(cc.coords, i, :), cc.sdh, view(cc.dofs, i, :)
279-
)
280-
end
281-
282-
function Ferrite.reinit!(cc_i::ImmutableCellCache, cellid::Integer)
283-
cc_i.flags.nodes && Ferrite.cellnodes!(cc_i.nodes, cc_i.grid, cellid)
284-
cc_i.flags.coords && Ferrite.getcoordinates!(cc_i.coords, cc_i.grid, cellid)
285-
cc_i.sdh !== nothing && cc_i.flags.dofs && Ferrite.celldofs!(cc_i.dofs, cc_i.sdh, cellid)
286-
return nothing
26+
@device_override @noinline function Ferrite.throw_incompatible_coord_length(length_x, n_base_funcs)
27+
@gputhrow("ArgumentError", "the number of (geometric) base functions does not match the number of coordinates in the vector. Perhaps you forgot to use an appropriate geometric interpolation when creating FE values? See https://github.com/Ferrite-FEM/Ferrite.jl/issues/265 for more details. Please check the values on CPU.")
28728
end
28829

28930
# -------------------- assembler ----------------------
@@ -382,11 +123,15 @@ function meandiag_kernel!(diag, colptr, rowval, nzval, n)
382123
return nothing
383124
end
384125

126+
# TODO The code below must be refactored a bit to use KernelAbstractions directly.
127+
# The limiting factor right now is not having a enough sparse matrix base types
128+
# and related to this no interfaces to query the fields.
129+
385130
function meandiag(K::CuSparseMatrixCSC{T}) where {T}
386131
n = size(K, 1)
387132
backend = get_backend(nonzeros(K))
388133
diag = KA.zeros(backend, T, n)
389-
threads = min(256, n)
134+
threads = min(256, n, CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
390135
@cuda threads = threads blocks = cld(n, threads) meandiag_kernel!(
391136
diag, SparseArrays.getcolptr(K), rowvals(K), nonzeros(K), n
392137
)
@@ -442,16 +187,17 @@ function Ferrite.apply!(K::CuSparseMatrixCSC{T}, f::CuVector{T}, ch::GPUConstrai
442187
nz = nonzeros(K)
443188
nnz = length(nz)
444189
m = meandiag(K)
445-
threads_p = min(256, n_prescribed)
190+
threads_p = min(256, n_prescribed, CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
446191
blocks_p = cld(n_prescribed, threads_p)
447192

193+
# TODO fuze into one kernel
448194
# Step 1+2: f -= K[:,d]*v, then zero prescribed columns
449195
@cuda threads = threads_p blocks = blocks_p apply_inhom_zero_cols_kernel!(
450196
f, colptr, rv, nz, ch.prescribed_dofs, ch.inhomogeneities, n_prescribed, applyzero
451197
)
452198

453199
# Step 3: zero prescribed rows
454-
threads_z = min(256, nnz)
200+
threads_z = min(256, nnz, CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
455201
@cuda threads = threads_z blocks = cld(nnz, threads_z) apply_zero_rows_kernel!(
456202
rv, nz, ch.is_prescribed, nnz
457203
)

ext/FerriteKAExt.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
module FerriteKAExt
2+
3+
using Ferrite, SparseArrays
4+
5+
import Adapt: Adapt, adapt, adapt_structure
6+
7+
import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype
8+
import Ferrite: as_structure_of_arrays, get_substruct
9+
import Ferrite: meandiag, nnodes_per_cell
10+
import Ferrite: CellCacheContainer, CellValuesContainer, CellCache
11+
12+
import KernelAbstractions as KA
13+
import KernelAbstractions: get_backend
14+
15+
include("FerriteKAExt/adapt_core.jl")
16+
include("FerriteKAExt/soa_core.jl")
17+
18+
include("FerriteKAExt/device_grid.jl")
19+
20+
include("FerriteKAExt/dof_handler.jl")
21+
22+
include("FerriteKAExt/iterator.jl")
23+
24+
end

0 commit comments

Comments
 (0)