+# TODOs
+# * `update!` for ConstraintHandler on GPU.
+
 module FerriteCudaExt

 using Ferrite, CUDA, SparseArrays

 import Adapt: Adapt, adapt, adapt_structure

-import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, as_structure_of_arrays, get_substruct, materialize, meandiag
+import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, as_structure_of_arrays, get_substruct, meandiag

 import CUDA: CUDA.CUSPARSE.CuSparseMatrixCSC, CUDA.CUSPARSE.CuSparseMatrixCSR, @allowscalar
 import KernelAbstractions as KA
@@ -74,11 +77,7 @@ function Ferrite.dof_range(sdh::GPUSubDofHandler, field_name::Symbol)
 end

 function Ferrite.celldofs!(global_dofs::AbstractVector, sdh::GPUSubDofHandler, i::Integer)
-    offset = sdh.cell_dofs_offset[i] - 1
-    n = sdh.ndofs_per_cell
-    @inbounds for j in 1:n
-        global_dofs[j] = sdh.cell_dofs[offset + j]
-    end
+    copyto!(global_dofs, 1, sdh.cell_dofs, sdh.cell_dofs_offset[i], length(global_dofs))
     return global_dofs
 end

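The `copyto!` one-liner above uses the five-argument method `copyto!(dest, dstart, src, sstart, n)`, which copies `n` elements starting at `src[sstart]` into `dest` starting at `dest[dstart]`, so the explicit offset arithmetic of the removed loop is no longer needed. A minimal host-side sketch with made-up dof data (not taken from this PR) showing the equivalence:

```julia
# Hypothetical flat dof storage: cell 1 owns dofs 11:13, cell 2 owns dofs 21:23.
cell_dofs        = [11, 12, 13, 21, 22, 23]
cell_dofs_offset = [1, 4]        # 1-based index of each cell's first dof

global_dofs = zeros(Int, 3)
i = 2                            # query the second cell
copyto!(global_dofs, 1, cell_dofs, cell_dofs_offset[i], length(global_dofs))
@assert global_dofs == [21, 22, 23]
```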
@@ -118,34 +117,12 @@ Adapt.@adapt_structure Ferrite.FunctionValues

 adapt(to, ip::Ferrite.Interpolation) = ip

-get_number_of_device_threads(_) = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
-
-# The design below outmaneuvers the standard Ferrite setup logic. Here we start with the setup of a *defective* FEValues object.
-# On the device we do not access this object directly; instead we have an accessor function which queries the shared
-# memory variant of this object (e.g. CuStaticSharedArray). The main reason is that we cannot just allocate device
-# shared memory on the host, but using global memory is not optimal either.
-struct DeviceResidentSharedMemoryDescriptor{Tv, N, Dims, M} <: AbstractArray{Tv, M}
-end
-
-as_shared_array(d, N, a) = adapt(d, a)
-as_shared_array(d, N, a::Array{T, M}) where {T, M} = DeviceResidentSharedMemoryDescriptor{T, N, size(a), M}()
-# FIXME adjust the number of threads on kernel launch
-# as_shared_array(d::CUDA.KernelAdaptor, N, a::DeviceResidentSharedMemoryDescriptor) = DeviceResidentSharedMemoryDescriptor{eltype(T), N, size(a)}()
-
-function materialize(::DeviceResidentSharedMemoryDescriptor{Tv, N, Dims}) where {Tv, N, Dims}
-    # TODO KernelAbstractions v0.10
-    # return KA.KernelIntrinsics.localmemory(Tv, (N, Dims...))
-    return CuStaticSharedArray(Tv, N, Dims...)
-    # return KA.SharedMemory(Tv, (N, Dims...), KA.gensym("static_shmem"))
-end
-
 function adapt(d, cv::CellValues)
-    N = get_number_of_device_threads(d)
     return CellValues(
         adapt(d, cv.fun_values),
         adapt(d, cv.geo_mapping),
         adapt(d, cv.qr),
-        as_shared_array(d, N, cv.detJdV),
+        adapt(d, cv.detJdV),
     )
 end

@@ -164,35 +141,35 @@ function as_structure_of_arrays(d, N, cv::CellValues)
 end

 function adapt(d, fv::Ferrite.FunctionValues)
-    N = get_number_of_device_threads(d)
+    Nξ = adapt(d, fv.Nξ)
     return Ferrite.FunctionValues(
         adapt(d, fv.ip),
-        as_shared_array(d, N, fv.Nx),
-        adapt(d, fv.Nξ),
-        as_shared_array(d, N, fv.dNdx),
+        fv.Nξ === fv.Nx ? Nξ : adapt(d, fv.Nx), # Ensure proper aliasing
+        Nξ,
+        adapt(d, fv.dNdx),
         adapt(d, fv.dNdξ),
-        fv.d2Ndx2 === nothing ? nothing : as_shared_array(d, N, fv.d2Ndx2),
+        fv.d2Ndx2 === nothing ? nothing : adapt(d, fv.d2Ndx2),
         fv.d2Ndξ2 === nothing ? nothing : adapt(d, collect(fv.d2Ndξ2)),
     )
 end

 function as_structure_of_arrays(d, N, fv::Ferrite.FunctionValues)
+    Nξ = adapt(d, fv.Nξ)
     return Ferrite.FunctionValues(
         adapt(d, fv.ip),
-        KA.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)),
-        adapt(d, fv.Nξ),
-        KA.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
+        fv.Nξ === fv.Nx ? Nξ : KA.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)), # Ensure proper aliasing
+        Nξ,
+        fv.dNdx === nothing ? nothing : KA.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
         adapt(d, fv.dNdξ),
         fv.d2Ndx2 === nothing ? nothing : KA.zeros(d, eltype(fv.d2Ndx2), N, size(fv.d2Ndx2, 1), size(fv.d2Ndx2, 2)),
         fv.d2Ndξ2 === nothing ? nothing : adapt(d, fv.d2Ndξ2),
     )
 end

 function adapt(d, fv::Ferrite.GeometryMapping)
-    N = get_number_of_device_threads(d)
     return Ferrite.GeometryMapping(
         adapt(d, fv.ip),
-        as_shared_array(d, N, fv.M),
+        adapt(d, fv.M),
         fv.dMdξ === nothing ? nothing : adapt(d, fv.dMdξ),
         fv.d2Mdξ2 === nothing ? nothing : adapt(d, fv.d2Mdξ2),
     )
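The ternary `fv.Nξ === fv.Nx ? Nξ : ...` used in both `FunctionValues` methods above keeps the device-side fields aliased whenever they alias on the host (e.g. for identity mappings, where shape values in reference and spatial coordinates share one array). A self-contained sketch with a simplified stand-in struct (not Ferrite's actual `FunctionValues`) of why adapting each field independently would lose that invariant:

```julia
# Simplified stand-in holding only the two potentially aliasing fields.
struct Vals{A}
    Nx::A
    Nξ::A
end

shared = rand(3, 4)
v = Vals(shared, shared)            # host object: Nx and Nξ alias one array

Nξ′ = copy(v.Nξ)                    # stand-in for adapt(d, fv.Nξ)
Nx′ = v.Nξ === v.Nx ? Nξ′ : copy(v.Nx)
@assert Nx′ === Nξ′                 # aliasing survives the "transfer"

naive_Nx′ = copy(v.Nx)              # adapting each field separately instead...
@assert naive_Nx′ !== Nξ′           # ...silently duplicates the storage
```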
@@ -216,34 +193,6 @@ function adapt_structure(to, qr::QuadratureRule{shape}) where {shape}
     return QuadratureRule{shape}(adapt_structure(to, qr.weights), adapt_structure(to, qr.points))
 end

-function materialize(cv::CellValues)
-    return CellValues(
-        materialize(cv.fun_values),
-        materialize(cv.geo_mapping),
-        materialize(cv.qr),
-        materialize(cv.detJdV),
-    )
-end
-function materialize(fv::Ferrite.FunctionValues)
-    return Ferrite.FunctionValues(
-        materialize(fv.ip),
-        materialize(fv.Nx),
-        materialize(fv.Nξ),
-        materialize(fv.dNdx),
-        materialize(fv.dNdξ),
-        materialize(fv.d2Ndx2),
-        materialize(fv.d2Ndξ2),
-    )
-end
-function materialize(fv::Ferrite.GeometryMapping)
-    return Ferrite.GeometryMapping(
-        materialize(fv.ip),
-        materialize(fv.M),
-        materialize(fv.dMdξ),
-        materialize(fv.d2Mdξ2),
-    )
-end
-
 # -------------------- iterator ----------------------

 # NOTE CellCache is mutable and hence inherently incompatible with GPU. So here is the
@@ -294,40 +243,27 @@ function Ferrite.CellCache(d, dh::GPUDofHandler{dim}, flags::UpdateFlags = Updat
 end

 function Ferrite.CellCache(d, sdh::GPUSubDofHandler{dim}, flags::UpdateFlags = UpdateFlags()) where {dim}
-    outer_dim = get_number_of_device_threads(d)
     grid = get_grid(sdh)
     @allowscalar begin
-        n = Ferrite.ndofs_per_cell(sdh)
         N = Ferrite.nnodes_per_cell(grid, first(sdh.cellset))
-        nodes = as_shared_array(d, outer_dim, zeros(Int, N))
-        coords = as_shared_array(d, outer_dim, zeros(Vec{dim, get_coordinate_eltype(grid)}, N))
-        dofs = as_shared_array(d, outer_dim, zeros(Int, N))
+        nodes = KA.zeros(d, Int, N)
+        coords = KA.zeros(d, Vec{dim, get_coordinate_eltype(grid)}, N)
+
+        n = Ferrite.ndofs_per_cell(sdh)
+        dofs = KA.zeros(d, Int, n)
     end
     return GPUCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
 end

 function adapt(d, cc::GPUCellCache)
-    outer_dim = get_number_of_device_threads(d)
     return GPUCellCache(
         cc.flags,
         adapt(d, cc.grid),
         -1,
-        as_shared_array(d, outer_dim, cc.nodes),
-        as_shared_array(d, outer_dim, cc.coords),
+        adapt(d, cc.nodes),
+        adapt(d, cc.coords),
         adapt(d, cc.sdh),
-        as_shared_array(d, outer_dim, cc.dofs),
-    )
-end
-
-function materialize(cc::GPUCellCache)
-    return GPUCellCache(
-        materialize(cc.flags),
-        materialize(cc.grid),
-        materialize(cc.cellid),
-        materialize(cc.nodes),
-        materialize(cc.coords),
-        materialize(cc.sdh),
-        materialize(cc.dofs),
+        adapt(d, cc.dofs),
     )
 end

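With the shared-memory descriptors gone, the cache buffers above are plain zero-initialized device arrays obtained from `KernelAbstractions.zeros(backend, T, dims...)`. A sketch of the allocation pattern, assuming `d` is a `CUDABackend` and using made-up per-cell sizes:

```julia
using CUDA
import KernelAbstractions as KA

backend = CUDABackend()
N, n = 4, 12                          # hypothetical nodes / dofs per cell
nodes = KA.zeros(backend, Int, N)     # zero-initialized CuArray{Int}
dofs  = KA.zeros(backend, Int, n)
@assert nodes isa CuArray && all(==(0), Array(nodes))
```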
@@ -347,18 +283,33 @@ end
347283
348284# -------------------- assembler ----------------------
349285
350- struct GPUCSCAssembler{KType}
286+ struct GPUCSCAssembler{KType, FType }
351287 K:: KType
288+ f:: FType
352289end
353290Adapt. @adapt_structure GPUCSCAssembler
354291
355292# FIXME buffer
356- function Ferrite. start_assemble (K:: CuSparseMatrixCSC ; fillzero:: Bool = true )
293+ function Ferrite. start_assemble (K:: CuSparseMatrixCSC , f :: Union{CuVector, Nothing} = nothing ; fillzero:: Bool = true )
357294 fillzero && fill! (nonzeros (K), zero (eltype (K)))
358- return GPUCSCAssembler (K)
295+ f != = nothing && fillzero && fill! (f, zero (eltype (f)))
296+ return GPUCSCAssembler (K, f)
297+ end
298+
299+ function Ferrite. assemble! (A:: GPUCSCAssembler , dofs:: AbstractVector{<:Integer} , Ke:: AbstractMatrix , fe:: AbstractVector )
300+ Ferrite. assemble! (A, dofs, Ke)
301+ Ferrite. assemble! (A, dofs, fe)
302+ return nothing
303+ end
304+
305+ function Ferrite. assemble! (A:: GPUCSCAssembler , dofs:: AbstractVector{<:Integer} , fe:: AbstractVector )
306+ for (i, dof) in enumerate (dofs)
307+ A. f[dof] += fe[i]
308+ end
309+ return nothing
359310end
360311
-function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix, fe::Nothing = nothing)
+function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix)
     colptr = SparseArrays.getcolptr(A.K)
     rowval = rowvals(A.K)
     nzval = nonzeros(A.K)
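Hypothetical host-side usage of the extended assembler API (the body of the `Ke` scatter is truncated above); the matrix pattern and sizes are invented, and the element loop is only indicated in comments:

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays, LinearAlgebra
using Ferrite

K = CuSparseMatrixCSC(sprand(10, 10, 0.3) + I)  # device matrix with a pattern
f = CUDA.zeros(Float64, 10)

assembler = Ferrite.start_assemble(K, f)        # zeros nonzeros(K) and f
# per cell: compute Ke, fe and the cell dofs, then scatter both at once:
#     Ferrite.assemble!(assembler, dofs, Ke, fe)
```

Note the `fe` scatter uses a plain `+=` per dof, so concurrent assembly of cells sharing dofs would presumably need atomic updates; that consideration is outside this diff.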