module FerriteCudaExt

using Ferrite, CUDA, SparseArrays

import Adapt: Adapt, adapt, adapt_structure

import Ferrite: get_grid, AbstractGrid, AbstractDofHandler, get_coordinate_eltype, TaskDescriptor, get_worker_part

import CUDA: @allowscalar
import CUDA.CUSPARSE: CuSparseMatrixCSC, CuSparseMatrixCSR
import KernelAbstractions: KernelAbstractions, get_backend


# ----------------- grid --------------------

struct GPUGrid{dim, C <: Ferrite.AbstractCell, T <: Real,
        CA <: AbstractArray{C, 1}, NA <: AbstractArray{Node{dim, T}, 1}} <: Ferrite.AbstractGrid{dim}
    cells::CA
    nodes::NA
end

function Adapt.adapt_structure(to, grid::GPUGrid)
    return GPUGrid(Adapt.adapt_structure(to, grid.cells), Adapt.adapt_structure(to, grid.nodes))
end

function GPUGrid(backend, grid::AbstractGrid)
    return GPUGrid(adapt(backend, getcells(grid)), adapt(backend, getnodes(grid)))
end
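
# Usage sketch (illustrative, not part of the extension API): a CPU Ferrite grid can
# be mirrored on the device with the constructor above. CUDABackend() is assumed here
# as the device backend; any backend with Adapt rules for plain arrays works the same.
#
#     grid = generate_grid(Quadrilateral, (20, 20))
#     gpu_grid = GPUGrid(CUDABackend(), grid)   # cells/nodes now live in device arrays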


# ----------------- dofs --------------------

struct GPUSubDofHandler{CS <: AbstractVector{Int}, CD <: AbstractVector{Int},
        CO <: AbstractVector{Int}, FN, DR <: Tuple, G}
    cellset::CS
    cell_dofs::CD
    cell_dofs_offset::CO
    ndofs_per_cell::Int
    # Vector{Symbol} on the CPU, Nothing on the GPU: Symbol is not a bitstype and
    # cannot be stored in GPU memory. Use the integer-index overload of dof_range on the GPU.
    field_names::FN
    dof_ranges::DR
    grid::G
end

function Adapt.adapt_structure(to, sdh::GPUSubDofHandler)
    return GPUSubDofHandler(
        Adapt.adapt_structure(to, sdh.cellset),
        Adapt.adapt_structure(to, sdh.cell_dofs),
        Adapt.adapt_structure(to, sdh.cell_dofs_offset),
        sdh.ndofs_per_cell,
        nothing,
        sdh.dof_ranges,
        Adapt.adapt_structure(to, sdh.grid),
    )
end

Ferrite.ndofs_per_cell(sdh::GPUSubDofHandler) = sdh.ndofs_per_cell

function Ferrite.dof_range(sdh::GPUSubDofHandler, field_idx::Int)
    return sdh.dof_ranges[field_idx]
end
function Ferrite.dof_range(sdh::GPUSubDofHandler, field_name::Symbol)
    idx = findfirst(==(field_name), sdh.field_names)
    idx === nothing && error("Field $field_name not found in GPUSubDofHandler")
    return sdh.dof_ranges[idx]
end
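
# Field lookup sketch: by-name lookup only works on the host, since `field_names` is
# replaced by `nothing` when the handler is adapted to the device; device code should
# use the integer field index overload instead.
#
#     dof_range(sdh, :u)   # host only
#     dof_range(sdh, 1)    # host or device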

function Ferrite.celldofs!(global_dofs::AbstractVector, sdh::GPUSubDofHandler, i::Integer)
    offset = sdh.cell_dofs_offset[i] - 1
    n = sdh.ndofs_per_cell
    @inbounds for j in 1:n
        global_dofs[j] = sdh.cell_dofs[offset + j]
    end
    return global_dofs
end

# CPU-only container; it is not sent to the GPU itself.
struct GPUDofHandler{dim, G <: GPUGrid{dim}, SDH <: GPUSubDofHandler} <: Ferrite.AbstractDofHandler
    subdofhandlers::Vector{SDH}
    grid::G
end

function GPUDofHandler(backend, dh::DofHandler)
    gpu_grid = GPUGrid(backend, dh.grid)
    cell_dofs = adapt(backend, dh.cell_dofs)
    cell_dofs_offset = adapt(backend, dh.cell_dofs_offset)
    subdofhandlers = map(dh.subdofhandlers) do sdh
        dof_ranges = Tuple(Ferrite.dof_range(sdh, i) for i in 1:length(sdh.field_names))
        GPUSubDofHandler(adapt(backend, collect(Int, sdh.cellset)), cell_dofs, cell_dofs_offset,
            sdh.ndofs_per_cell, copy(sdh.field_names), dof_ranges, gpu_grid)
    end
    return GPUDofHandler(subdofhandlers, gpu_grid)
end

function Adapt.adapt_structure(to::TaskDescriptor, dh::DofHandler)
    return GPUDofHandler(to.device, dh)
end

Ferrite.get_grid(dh::GPUDofHandler) = dh.grid
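
# Construction sketch (hypothetical single-field setup): close the CPU DofHandler as
# usual, then mirror it on the device. CUDABackend() is assumed as the device backend.
#
#     dh = DofHandler(grid)
#     add!(dh, :u, Lagrange{RefQuadrilateral, 1}())
#     close!(dh)
#     gpu_dh = GPUDofHandler(CUDABackend(), dh)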


# ----------------- Adapt.jl parts --------------------

Adapt.@adapt_structure CellValues
Adapt.@adapt_structure Ferrite.GeometryMapping
Adapt.@adapt_structure Ferrite.FunctionValues

adapt(to, ip::Ferrite.Interpolation) = ip

function adapt(to::TaskDescriptor, fv::Ferrite.FunctionValues)
    N = to.num_workers
    d = to.device
    return Ferrite.FunctionValues(
        adapt(to, fv.ip),
        KernelAbstractions.zeros(d, eltype(fv.Nx), N, size(fv.Nx, 1), size(fv.Nx, 2)),
        adapt(d, collect(fv.Nξ)),
        KernelAbstractions.zeros(d, eltype(fv.dNdx), N, size(fv.dNdx, 1), size(fv.dNdx, 2)),
        adapt(d, collect(fv.dNdξ)),
        fv.d2Ndx2 === nothing ? nothing : KernelAbstractions.zeros(d, eltype(fv.d2Ndx2), N, size(fv.d2Ndx2, 1), size(fv.d2Ndx2, 2)),
        fv.d2Ndξ2 === nothing ? nothing : adapt(d, collect(fv.d2Ndξ2)),
    )
end

function adapt(to::TaskDescriptor, gm::Ferrite.GeometryMapping)
    N = to.num_workers
    d = to.device
    return Ferrite.GeometryMapping(
        adapt(to, gm.ip),
        KernelAbstractions.zeros(d, eltype(gm.M), N, size(gm.M, 1), size(gm.M, 2)),
        gm.dMdξ === nothing ? nothing : adapt(d, collect(gm.dMdξ)),
        gm.d2Mdξ2 === nothing ? nothing : adapt(d, collect(gm.d2Mdξ2)),
    )
end

function adapt(to::TaskDescriptor, cv::CellValues)
    N = to.num_workers
    d = to.device
    return CellValues(
        adapt(to, cv.fun_values),
        adapt(to, cv.geo_mapping),
        adapt(to, cv.qr),
        KernelAbstractions.zeros(d, eltype(cv.detJdV), N, length(cv.detJdV)),
    )
end

function adapt(to::TaskDescriptor, qr::QuadratureRule{shape}) where shape
    d = to.device
    return QuadratureRule{shape}(adapt(d, collect(qr.weights)), adapt(d, collect(qr.points)))
end

function adapt(to, qr::QuadratureRule{shape}) where shape
    return QuadratureRule{shape}(adapt(to, qr.weights), adapt(to, qr.points))
end

function adapt_structure(to, qr::QuadratureRule{shape}) where shape
    return QuadratureRule{shape}(adapt_structure(to, qr.weights), adapt_structure(to, qr.points))
end
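
# Adaption sketch: adapting CellValues through a TaskDescriptor allocates per-worker
# buffers (leading dimension = num_workers) for the mutable fields, while the reference
# values are copied once. The descriptor is assumed to come from the kernel launch setup.
#
#     qr = QuadratureRule{RefQuadrilateral}(2)
#     cv = CellValues(qr, Lagrange{RefQuadrilateral, 1}())
#     gpu_cv = adapt(descriptor, cv)   # descriptor::TaskDescriptor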


# -------------------- iterator ----------------------

struct GPUCellCache{G <: AbstractGrid, SDH, IVT, VX}
    flags::UpdateFlags
    grid::G
    cellid::Int
    nodes::IVT
    coords::VX
    sdh::SDH
    dofs::IVT
end
Adapt.@adapt_structure GPUCellCache

function GPUCellCache(sdh::GPUSubDofHandler, grid::GPUGrid{dim}, descriptor::TaskDescriptor,
        flags::UpdateFlags = UpdateFlags()) where dim
    d = descriptor.device
    n = sdh.ndofs_per_cell
    # nnodes_per_cell indexes into the (device) cell vector, hence @allowscalar.
    # Keep the allocations outside the block so the bindings stay in function scope.
    N = @allowscalar Ferrite.nnodes_per_cell(grid, 1)
    W = descriptor.num_workers
    nodes = KernelAbstractions.zeros(d, Int, W, N)
    coords = KernelAbstractions.zeros(d, Vec{dim, get_coordinate_eltype(grid)}, W, N)
    dofs = KernelAbstractions.zeros(d, Int, W, n)
    return GPUCellCache(flags, grid, -1, nodes, coords, sdh, dofs)
end

Ferrite.celldofs(cc::GPUCellCache) = cc.dofs

# TODO I do not like this design.
function Ferrite.CellCache(descriptor::TaskDescriptor, sdh::GPUSubDofHandler, flags::UpdateFlags = UpdateFlags())
    return GPUCellCache(sdh, sdh.grid, descriptor, flags)
end

function get_worker_part(i, cc::GPUCellCache, cellid)
    return GPUCellCache(cc.flags, cc.grid, cellid,
        view(cc.nodes, i, :), view(cc.coords, i, :), cc.sdh, view(cc.dofs, i, :))
end
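
# Kernel-side sketch: each worker takes its own slice of the shared cache buffers
# before filling them for a cell. The worker index `i` and `cellid` are assumed to
# come from the kernel's thread/block indexing.
#
#     my_cache = get_worker_part(i, cache, cellid)
#     celldofs!(celldofs(my_cache), my_cache.sdh, cellid)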


# -------------------- assembler ----------------------

struct GPUCSCAssembler{CP <: AbstractVector, RV <: AbstractVector, NZ <: AbstractVector}
    colptr::CP
    rowval::RV
    nzval::NZ
end
Adapt.@adapt_structure GPUCSCAssembler

function Ferrite.start_assemble(K::CuSparseMatrixCSC; fillzero::Bool = true)
    fillzero && fill!(nonzeros(K), zero(eltype(K)))
    return GPUCSCAssembler(SparseArrays.getcolptr(K), rowvals(K), nonzeros(K))
end

# NOTE: the updates below are plain (non-atomic) writes, so concurrent assembly into
# dofs shared between workers must be avoided (e.g. by iterating cells color by color),
# or the accumulation has to be made atomic.
function Ferrite.assemble!(A::GPUCSCAssembler, dofs::AbstractVector{<:Integer}, Ke::AbstractMatrix, fe::Nothing = nothing)
    ndofs = length(dofs)
    for j in 1:ndofs
        col = dofs[j]
        r1 = A.colptr[col]
        r2 = A.colptr[col + 1] - 1
        for i in 1:ndofs
            val = Ke[i, j]
            iszero(val) && continue
            row = dofs[i]
            # Linear search for the row in this column's nonzeros
            for idx in r1:r2
                if A.rowval[idx] == row
                    A.nzval[idx] += val
                    break
                end
            end
        end
    end
    return nothing
end

function Ferrite.allocate_matrix(::Type{CuSparseMatrixCSC{Tv, Ti}}, dh::DofHandler) where {Tv, Ti}
    return CuSparseMatrixCSC(allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh))
end

function Ferrite.allocate_matrix(::Type{CuSparseMatrixCSR{Tv, Ti}}, dh::DofHandler) where {Tv, Ti}
    return CuSparseMatrixCSR(allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh))
end
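
# Assembly sketch: allocate the sparse matrix directly in device memory and reuse the
# usual start_assemble/assemble! workflow. The element routine and a conflict-free cell
# iteration (e.g. colored via Ferrite's create_coloring) are assumed.
#
#     K = allocate_matrix(CuSparseMatrixCSC{Float32, Int32}, dh)
#     assembler = start_assemble(K)
#     assemble!(assembler, dofs, Ke)   # typically called from device code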

# -------------------- constraints ----------------------

struct GPUConstraintHandler{Tv, Ti, PD <: AbstractVector{Ti}, IH <: AbstractVector{Tv}, IP <: AbstractVector{Bool}}
    prescribed_dofs::PD
    inhomogeneities::IH
    is_prescribed::IP
end

function adapt(backend, ch::ConstraintHandler)
    @assert Ferrite.isclosed(ch)
    n = Ferrite.ndofs(ch.dh)
    is_prescribed = zeros(Bool, n)
    for d in ch.prescribed_dofs
        is_prescribed[d] = true
    end
    return GPUConstraintHandler(
        adapt(backend, ch.prescribed_dofs),
        adapt(backend, ch.inhomogeneities),
        adapt(backend, is_prescribed),
    )
end

function meandiag_kernel!(diag, colptr, rowval, nzval, n)
    j = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j > n && return
    for k in colptr[j]:(colptr[j + 1] - 1)
        if rowval[k] == j
            diag[j] = abs(nzval[k])
            break
        end
    end
    return nothing
end

function meandiag(K::CuSparseMatrixCSC{T}) where T
    n = size(K, 1)
    backend = get_backend(nonzeros(K))
    diag = KernelAbstractions.zeros(backend, T, n)
    threads = min(256, n)
    @cuda threads=threads blocks=cld(n, threads) meandiag_kernel!(
        diag, SparseArrays.getcolptr(K), rowvals(K), nonzeros(K), n)
    return sum(diag) / n
end

# Combined: f -= K[:, d] * inhom[d], then zero column d
function apply_inhom_zero_cols_kernel!(f, colptr, rowval, nzval, prescribed_dofs, inhomogeneities, n_prescribed, applyzero)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > n_prescribed && return
    d = prescribed_dofs[idx]
    v = inhomogeneities[idx]
    for k in colptr[d]:(colptr[d + 1] - 1)
        if !applyzero && !iszero(v)
            CUDA.@atomic f[rowval[k]] -= v * nzval[k]
        end
        nzval[k] = zero(eltype(nzval))
    end
    return nothing
end

# Zero out prescribed rows
function apply_zero_rows_kernel!(rowval, nzval, is_prescribed, nnz)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > nnz && return
    if is_prescribed[rowval[idx]]
        nzval[idx] = zero(eltype(nzval))
    end
    return nothing
end

# Set K[d,d] = m and f[d] = inhom[d] * m
function apply_set_diag_kernel!(colptr, rowval, nzval, f, prescribed_dofs, inhomogeneities, m, n_prescribed, applyzero)
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    idx > n_prescribed && return
    d = prescribed_dofs[idx]
    for k in colptr[d]:(colptr[d + 1] - 1)
        if rowval[k] == d
            nzval[k] = m
            break
        end
    end
    f[d] = (applyzero ? zero(eltype(f)) : inhomogeneities[idx]) * m
    return nothing
end

function Ferrite.apply!(K::CuSparseMatrixCSC{T}, f::CuVector{T}, ch::GPUConstraintHandler{T}, applyzero::Bool = false) where T
    n_prescribed = length(ch.prescribed_dofs)
    n_prescribed == 0 && return

    colptr = SparseArrays.getcolptr(K)
    rv = rowvals(K)
    nz = nonzeros(K)
    nnz = length(nz)
    m = meandiag(K)
    threads_p = min(256, n_prescribed)
    blocks_p = cld(n_prescribed, threads_p)

    # Step 1+2: f -= K[:,d]*v, then zero prescribed columns
    @cuda threads=threads_p blocks=blocks_p apply_inhom_zero_cols_kernel!(
        f, colptr, rv, nz, ch.prescribed_dofs, ch.inhomogeneities, n_prescribed, applyzero)

    # Step 3: zero prescribed rows
    threads_z = min(256, nnz)
    @cuda threads=threads_z blocks=cld(nnz, threads_z) apply_zero_rows_kernel!(
        rv, nz, ch.is_prescribed, nnz)

    # Step 4: set diagonal and rhs
    @cuda threads=threads_p blocks=blocks_p apply_set_diag_kernel!(
        colptr, rv, nz, f, ch.prescribed_dofs, ch.inhomogeneities, T(m), n_prescribed, applyzero)

    return
end

function Ferrite.apply_zero!(K::CuSparseMatrixCSC{T}, f::CuVector{T}, ch::GPUConstraintHandler{T}) where T
    return Ferrite.apply!(K, f, ch, true)
end
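
# Constraint sketch: adapt a closed ConstraintHandler to the device once, then condense
# K and f in place after assembly, mirroring Ferrite's CPU apply!/apply_zero!. CUDABackend()
# is assumed as the device backend.
#
#     gpu_ch = adapt(CUDABackend(), ch)
#     apply!(K, f, gpu_ch)        # inhomogeneous Dirichlet conditions
#     apply_zero!(K, f, gpu_ch)   # e.g. when solving for Newton increments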

end