Ferrite-FEM · KnutAM · Mar 6, 2026 · Mar 5, 2026 · Mar 6, 2026 · KnutAM
diff --git a/docs/Manifest.toml b/docs/Manifest.toml
@@ -1,8 +1,8 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.12.1"
+julia_version = "1.12.5"
 manifest_format = "2.0"
-project_hash = "765feb17a5ba23a96b16cd2bae96223ad0618954"
+project_hash = "6d0dc5950af997fe06d71214d77ad65f439e8f72"
 
 [[deps.ADTypes]]
 git-tree-sha1 = "27cecae79e5cc9935255f90c53bb831cc3c870d7"
@@ -231,9 +231,9 @@ uuid = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e"
 version = "1.1.0"
 
 [[deps.ChunkSplitters]]
-git-tree-sha1 = "63a3903063d035260f0f6eab00f517471c5dc784"
+git-tree-sha1 = "1c52c8e2673edc030191177ff1aee42d25149acb"
 uuid = "ae650224-84b6-46f8-82ea-d812ca08434e"
-version = "3.1.2"
+version = "3.2.0"
 
 [[deps.CloseOpenIntervals]]
 deps = ["Static", "StaticArrayInterface"]
@@ -521,7 +521,7 @@ version = "1.4.1"
 [[deps.Downloads]]
 deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
 uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-version = "1.6.0"
+version = "1.7.0"
 
 [[deps.EnumX]]
 git-tree-sha1 = "bddad79635af6aec424f53ed8aad5d7555dc6f00"
@@ -1066,7 +1066,7 @@ version = "0.6.4"
 [[deps.LibCURL_jll]]
 deps = ["Artifacts", "LibSSH2_jll", "Libdl", "OpenSSL_jll", "Zlib_jll", "nghttp2_jll"]
 uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "8.11.1+1"
+version = "8.15.0+0"
 
 [[deps.LibGit2]]
 deps = ["LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
@@ -1358,7 +1358,7 @@ version = "0.3.7"
 
 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2025.5.20"
+version = "2025.11.4"
 
 [[deps.MuladdMacro]]
 git-tree-sha1 = "cac9cc5499c25554cba55cd3c30543cff5ca4fab"
@@ -1440,7 +1440,7 @@ version = "1.6.0"
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "3.5.1+0"
+version = "3.5.4+0"
 
 [[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl"]
@@ -1533,7 +1533,7 @@ version = "0.44.2+0"
 [[deps.Pkg]]
 deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.12.0"
+version = "1.12.1"
 weakdeps = ["REPL"]
 
     [deps.Pkg.extensions]
@@ -2419,9 +2419,9 @@ uuid = "1317d2d5-d96f-522e-a858-c73665f53c3e"
 version = "2022.0.0+1"
 
 [[deps.p7zip_jll]]
-deps = ["Artifacts", "Libdl"]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.5.0+2"
+version = "17.7.0+0"
 
 [[deps.x264_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -2,6 +2,7 @@
 Bibliography = "f1be7e48-bf82-45af-a471-ae754a193061"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e"
+ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"

diff --git a/docs/src/literate-gallery/landau.jl b/docs/src/literate-gallery/landau.jl
@@ -26,6 +26,7 @@ using Ferrite
 using Optim, LineSearches
 using SparseArrays
 using Tensors
+using OhMyThreads, ChunkSplitters
 
 # ## Energy terms
 # ### 4th order Landau free energy
@@ -47,7 +48,7 @@ struct ModelParams{V, T}
 end
 
 # ### ThreadCache
-# ### ThreadCache
+# ### TaskCache
-# ### ThreadCache
+# ### TaskCache
-# This holds the values that each thread will use during the assembly.
+# This holds the values that each task will use during the assembly.
 struct ThreadCache{CV, T, DIM, F <: Function, GC <: GradientConfig, HC <: HessianConfig}
-struct ThreadCache{CV, T, DIM, F <: Function, GC <: GradientConfig, HC <: HessianConfig}
+struct TaskCache{CV, T, DIM, F <: Function, GC <: GradientConfig, HC <: HessianConfig}
-struct ThreadCache{CV, T, DIM, F <: Function, GC <: GradientConfig, HC <: HessianConfig}
+struct TaskCache{CV, T, DIM, F <: Function, GC <: GradientConfig, HC <: HessianConfig}
     cvP::CV
     element_indices::Vector{Int}
@@ -72,16 +73,17 @@ function ThreadCache(dpc::Int, nodespercell, cvP::CellValues, modelparams, elpot
 end
 
 # ## The Model
-# everything is combined into a model.
+# Everything is combined into a model. The caches are pre-allocated (one per task)
+# and indexed by chunk index during assembly.
 mutable struct LandauModel{T, DH <: DofHandler, CH <: ConstraintHandler, TC <: ThreadCache}
-mutable struct LandauModel{T, DH <: DofHandler, CH <: ConstraintHandler, TC <: ThreadCache}
+mutable struct LandauModel{T, DH <: DofHandler, CH <: ConstraintHandler, TC <: TaskCache}
-mutable struct LandauModel{T, DH <: DofHandler, CH <: ConstraintHandler, TC <: ThreadCache}
+mutable struct LandauModel{T, DH <: DofHandler, CH <: ConstraintHandler, TC <: TaskCache}
     dofs::Vector{T}
     dofhandler::DH
     boundaryconds::CH
     threadindices::Vector{Vector{Int}}
-    threadindices::Vector{Vector{Int}}
+    colors::Vector{Vector{Int}}
-    threadindices::Vector{Vector{Int}}
+    colors::Vector{Vector{Int}}
-    threadcaches::Vector{TC}
+    caches::Vector{TC}
 end
 
-function LandauModel(α, G, gridsize, left::Vec{DIM, T}, right::Vec{DIM, T}, elpotential) where {DIM, T}
+function LandauModel(α, G, gridsize, left::Vec{DIM, T}, right::Vec{DIM, T}, elpotential, ntasks) where {DIM, T}
     grid = generate_grid(Tetrahedron, gridsize, left, right)
     threadindices = Ferrite.create_coloring(grid)
-    threadindices = Ferrite.create_coloring(grid)
+    colors = create_coloring(grid)
-    threadindices = Ferrite.create_coloring(grid)
+    colors = create_coloring(grid)
 
@@ -106,7 +108,7 @@ function LandauModel(α, G, gridsize, left::Vec{DIM, T}, right::Vec{DIM, T}, elp
 
     dpc = ndofs_per_cell(dofhandler)
     cpc = length(grid.cells[1].nodes)
-    caches = [ThreadCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for t in 1:Threads.maxthreadid()]
+    caches = [ThreadCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for _ in 1:ntasks]
-    caches = [ThreadCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for _ in 1:ntasks]
+    caches = [TaskCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for _ in 1:ntasks]
-    caches = [ThreadCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for _ in 1:ntasks]
+    caches = [TaskCache(dpc, cpc, copy(cvP), ModelParams(α, G), elpotential) for _ in 1:ntasks]
     return LandauModel(dofvector, dofhandler, boundaryconds, threadindices, caches)
-    return LandauModel(dofvector, dofhandler, boundaryconds, threadindices, caches)
+    return LandauModel(dofvector, dofhandler, boundaryconds, colors, caches)
-    return LandauModel(dofvector, dofhandler, boundaryconds, threadindices, caches)
+    return LandauModel(dofvector, dofhandler, boundaryconds, colors, caches)
 end
 
@@ -119,75 +121,77 @@ function save_landau(path, model, dofs = model.dofs)
 end
 
 # ## Assembly
-# This macro defines most of the assembly step, since the structure is the same for
-# the energy, gradient and Hessian calculations.
-macro assemble!(innerbody)
-    return esc(
-        quote
-            dofhandler = model.dofhandler
-            for indices in model.threadindices
-                Threads.@threads for i in indices
-                    cache = model.threadcaches[Threads.threadid()]
-                    eldofs = cache.element_dofs
-                    nodeids = dofhandler.grid.cells[i].nodes
-                    for j in 1:length(cache.element_coords)
-                        cache.element_coords[j] = dofhandler.grid.nodes[nodeids[j]].x
-                    end
-                    reinit!(cache.cvP, cache.element_coords)
-
-                    celldofs!(cache.element_indices, dofhandler, i)
-                    for j in 1:length(cache.element_dofs)
-                        eldofs[j] = dofvector[cache.element_indices[j]]
-                    end
-                    $innerbody
-                end
-            end
-        end
-    )
+# This helper sets up the cell data in the cache for a given cell index,
+# and returns the element dof values.
+function setup_cell!(cache, dofhandler, dofvector, cellidx)
+    nodeids = dofhandler.grid.cells[cellidx].nodes
+    for j in 1:length(cache.element_coords)
+        cache.element_coords[j] = dofhandler.grid.nodes[nodeids[j]].x
+    end
+    reinit!(cache.cvP, cache.element_coords)
+    celldofs!(cache.element_indices, dofhandler, cellidx)
+    eldofs = cache.element_dofs
+    for j in 1:length(eldofs)
+        eldofs[j] = dofvector[cache.element_indices[j]]
+    end
+    return eldofs
 end
 
-# This calculates the total energy calculation of the grid
+# This calculates the total energy of the grid.
 function F(dofvector::Vector{T}, model) where {T}
-    outs = fill(zero(T), Threads.maxthreadid())
-    @assemble! begin
-        outs[Threads.threadid()] += cache.element_potential(eldofs)
+    out = zero(T)
+    for indices in model.threadindices
-    for indices in model.threadindices
+    for indices in model.colors
-    for indices in model.threadindices
+    for indices in model.colors
+        partial = OhMyThreads.@tasks for (ichunk, range) in enumerate(chunks(indices; n = length(model.caches)))
+            @set reducer = +
-            @set reducer = +
+            OhMyThreads.@set reducer = +
-            @set reducer = +
+            OhMyThreads.@set reducer = +
+            cache = model.caches[ichunk]
+            local_energy = zero(T)
+            for i in range
+                eldofs = setup_cell!(cache, model.dofhandler, dofvector, i)
+                local_energy += cache.element_potential(eldofs)
+            end
+            local_energy
+        end
+        out += partial
     end
-    return sum(outs)
+    return out
 end
 
-# The gradient calculation for each dof
+# The gradient calculation for each dof.
+# The grid coloring ensures that no two cells of the same color share dofs,
+# so concurrent tasks never write to the same entries and no locks are needed.
 function ∇F!(∇f::Vector{T}, dofvector::Vector{T}, model::LandauModel{T}) where {T}
     fill!(∇f, zero(T))
-    @assemble! begin
-        ForwardDiff.gradient!(cache.element_gradient, cache.element_potential, eldofs, cache.gradconf)
-        @inbounds assemble!(∇f, cache.element_indices, cache.element_gradient)
+    for indices in model.threadindices
-    for indices in model.threadindices
+    for indices in model.colors
-    for indices in model.threadindices
+    for indices in model.colors
+        OhMyThreads.@tasks for (ichunk, range) in enumerate(chunks(indices; n = length(model.caches)))
+            cache = model.caches[ichunk]
+            for i in range
+                eldofs = setup_cell!(cache, model.dofhandler, dofvector, i)
+                ForwardDiff.gradient!(cache.element_gradient, cache.element_potential, eldofs, cache.gradconf)
+                @inbounds assemble!(∇f, cache.element_indices, cache.element_gradient)
+            end
+        end
     end
     return
 end
 
 # The Hessian calculation for the whole grid
 function ∇²F!(∇²f::SparseMatrixCSC, dofvector::Vector{T}, model::LandauModel{T}) where {T}
-    assemblers = [start_assemble(∇²f) for t in 1:Threads.maxthreadid()]
-    @assemble! begin
-        ForwardDiff.hessian!(cache.element_hessian, cache.element_potential, eldofs, cache.hessconf)
-        @inbounds assemble!(assemblers[Threads.threadid()], cache.element_indices, cache.element_hessian)
+    dh = model.dofhandler
+    ntasks = length(model.caches)
+    assemblers = [start_assemble(∇²f; fillzero = (i == 1)) for i in 1:ntasks]
+    for indices in model.threadindices
-    for indices in model.threadindices
+    for indices in model.colors
-    for indices in model.threadindices
+    for indices in model.colors
+        OhMyThreads.@tasks for (ichunk, range) in enumerate(chunks(indices; n = ntasks))
+            cache = model.caches[ichunk]
+            for i in range
+                eldofs = setup_cell!(cache, dh, dofvector, i)
+                ForwardDiff.hessian!(cache.element_hessian, cache.element_potential, eldofs, cache.hessconf)
+                @inbounds assemble!(assemblers[ichunk], cache.element_indices, cache.element_hessian)
+            end
+        end
     end
     return
 end
 
-# We can also calculate all things in one go!
-function calcall(∇²f::SparseMatrixCSC, ∇f::Vector{T}, dofvector::Vector{T}, model::LandauModel{T}) where {T}
-    outs = fill(zero(T), Threads.maxthreadid())
-    fill!(∇f, zero(T))
-    assemblers = [start_assemble(∇²f, ∇f) for t in 1:Threads.maxthreadid()]
-    @assemble! begin
-        outs[Threads.threadid()] += cache.element_potential(eldofs)
-        ForwardDiff.hessian!(cache.element_hessian, cache.element_potential, eldofs, cache.hessconf)
-        ForwardDiff.gradient!(cache.element_gradient, cache.element_potential, eldofs, cache.gradconf)
-        @inbounds assemble!(assemblers[Threads.threadid()], cache.element_indices, cache.element_gradient, cache.element_hessian)
-    end
-    return sum(outs)
-end
 
 # ## Minimization
 # Now everything can be combined to minimize the energy, and find the equilibrium
@@ -255,7 +259,7 @@ G = V2T(1.0e2, 0.0, 1.0e2)
 α = Vec{3}((-1.0, 1.0, 1.0))
 left = Vec{3}((-75.0, -25.0, -2.0))
 right = Vec{3}((75.0, 25.0, 2.0))
-model = LandauModel(α, G, (50, 50, 2), left, right, element_potential)
+model = LandauModel(α, G, (50, 50, 2), left, right, element_potential, Threads.nthreads())
 
-
+dh = model.dofhandler               #hide
+ddf = allocate_matrix(dh)           #hide
+df = zeros(ndofs(dh))               #hide
+a = collect(range(0, 1, ndofs(dh))) #hide
+@test F(a, model) ≈ ??              #hide
+∇F!(df, a, model)                   #hide
+@test norm(df) ≈ ??                 #hide
+∇²F!(ddf, a, model)                 #hide
+@test norm(ddf) ≈ ??                #hide
-
+dh = model.dofhandler               #hide
+ddf = allocate_matrix(dh)           #hide
+df = zeros(ndofs(dh))               #hide
+a = collect(range(0, 1, ndofs(dh))) #hide
+@test F(a, model) ≈ ??              #hide
+∇F!(df, a, model)                   #hide
+@test norm(df) ≈ ??                 #hide
+∇²F!(ddf, a, model)                 #hide
+@test norm(ddf) ≈ ??                #hide
 save_landau("landauorig", model)
 @time minimize!(model)