Add memory barrier semantics to sync_workgroup (#783)

vchuravy · gbaraldi · vchuravy · commit fa8a1015b7bd · 2025-06-25T08:03:46.000+02:00
Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
Co-authored-by: Gabriel Baraldi <baraldigabriel@gmail.com>
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
@@ -82,6 +82,10 @@ include("memory.jl")
 
 Base.Experimental.@MethodTable(method_table)
 
+# Must be defined before Device.jl is included: device synchronization code uses these scopes.
+const syncscope_agent = UnsafeAtomics.Internal.LLVMSyncScope{:agent}()
+const syncscope_workgroup = UnsafeAtomics.Internal.LLVMSyncScope{:workgroup}()
+
 # Device sources must load _before_ the compiler infrastructure,
 # because of generated functions.
 include("device/Device.jl")
@@ -128,12 +132,11 @@ include("random.jl")
 
 # Enable hardware FP atomics for +/- ops.
 const ROCIndexableRef{Indexable <: ROCDeviceArray} = Atomix.IndexableRef{Indexable}
-const agent = UnsafeAtomics.Internal.LLVMSyncScope{:agent}()
 function Atomix.modify!(ref::ROCIndexableRef, op::OP, x, ord) where OP <: Union{typeof(+), typeof(-)}
     x = Atomix.asstorable(ref, x)
     ptr = Atomix.pointer(ref)
     root = Atomix.gcroot(ref)
-    GC.@preserve root UnsafeAtomics.modify!(ptr, op, x, ord, agent)
+    GC.@preserve root UnsafeAtomics.modify!(ptr, op, x, ord, syncscope_agent) # agent sync scope
 end
 
 include("ROCKernels.jl")
diff --git a/src/device/Device.jl b/src/device/Device.jl
@@ -14,6 +14,7 @@ import ..Runtime
 import ..Mem
 import ..AMDGPU
 import .AMDGPU: method_table
+import ..UnsafeAtomics
 
 include("addrspaces.jl")
 include("globals.jl")
diff --git a/src/device/gcn/synchronization.jl b/src/device/gcn/synchronization.jl
@@ -1,10 +1,33 @@
+# Define `UnsafeAtomics.fence` for each ordering at agent/workgroup scope via an inline LLVM `fence`.
+for ord in UnsafeAtomics.Internal.orderings # NOTE(review): LLVM `fence` needs acquire or stronger; confirm
+    for sync in (AMDGPU.syncscope_agent, AMDGPU.syncscope_workgroup)
+        @eval function UnsafeAtomics.fence(::$(typeof(ord)), ::$(typeof(sync)))
+            Base.llvmcall(
+                    $("""
+                    define void @fence() #0 {
+                    entry:
+                        fence $sync $ord
+                        ret void
+                    }
+                    attributes #0 = { alwaysinline }
+                    """, "fence"), Nothing, Tuple{})
+        end
+    end
+end
+
 """
     sync_workgroup()
 
-Waits until all wavefronts in a workgroup have reached this call.
+Waits until all wavefronts in a workgroup have reached this call and that their memory accesses are visible to other threads in the workgroup.
 """
-@inline sync_workgroup() =
+@inline function sync_workgroup()
+    # This needs a fence as well since the barrier has no memory semantics.
+    # It's unclear which fence/how many fences are needed. To be safest we follow HIP's
+    # https://github.com/ROCm/llvm-project/blob/69549e0c2a54a526f90783f43b969871d9d3e41c/amd/device-libs/opencl/src/workgroup/wgbarrier.cl#L8-L38
+    UnsafeAtomics.fence(UnsafeAtomics.seq_cst, AMDGPU.syncscope_workgroup)
     ccall("llvm.amdgcn.s.barrier", llvmcall, Cvoid, ())
+    UnsafeAtomics.fence(UnsafeAtomics.seq_cst, AMDGPU.syncscope_workgroup)
+end
 
 """
     sync_workgroup_count(predicate::Cint)::Cint