Skip to content

Commit fa8a101

Browse files
vchuravy and gbaraldi committed
Add memory barrier semantics to sync_workgroup (#783)
Co-authored-by: Valentin Churavy <v.churavy@gmail.com> Co-authored-by: Gabriel Baraldi <baraldigabriel@gmail.com>
1 parent 4de89f6 commit fa8a101

3 files changed

Lines changed: 31 additions & 4 deletions

File tree

src/AMDGPU.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ include("memory.jl")
8282

8383
Base.Experimental.@MethodTable(method_table)
8484

85+
# These must be defined before including Device, since the synchronization code uses them.
86+
const syncscope_agent = UnsafeAtomics.Internal.LLVMSyncScope{:agent}()
87+
const syncscope_workgroup = UnsafeAtomics.Internal.LLVMSyncScope{:workgroup}()
88+
8589
# Device sources must load _before_ the compiler infrastructure,
8690
# because of generated functions.
8791
include("device/Device.jl")
@@ -128,12 +132,11 @@ include("random.jl")
128132

129133
# Enable hardware FP atomics for +/- ops.
130134
const ROCIndexableRef{Indexable <: ROCDeviceArray} = Atomix.IndexableRef{Indexable}
131-
const agent = UnsafeAtomics.Internal.LLVMSyncScope{:agent}()
132135
function Atomix.modify!(ref::ROCIndexableRef, op::OP, x, ord) where OP <: Union{typeof(+), typeof(-)}
133136
x = Atomix.asstorable(ref, x)
134137
ptr = Atomix.pointer(ref)
135138
root = Atomix.gcroot(ref)
136-
GC.@preserve root UnsafeAtomics.modify!(ptr, op, x, ord, agent)
139+
GC.@preserve root UnsafeAtomics.modify!(ptr, op, x, ord, syncscope_agent)
137140
end
138141

139142
include("ROCKernels.jl")

src/device/Device.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import ..Runtime
1414
import ..Mem
1515
import ..AMDGPU
1616
import .AMDGPU: method_table
17+
import ..UnsafeAtomics
1718

1819
include("addrspaces.jl")
1920
include("globals.jl")

src/device/gcn/synchronization.jl

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,33 @@
1+
2+
for ord in UnsafeAtomics.Internal.orderings
3+
for sync in (AMDGPU.syncscope_agent, AMDGPU.syncscope_workgroup)
4+
@eval function UnsafeAtomics.fence(::$(typeof(ord)), ::$(typeof(sync)))
5+
Base.llvmcall(
6+
$("""
7+
define void @fence() #0 {
8+
entry:
9+
fence $sync $ord
10+
ret void
11+
}
12+
attributes #0 = { alwaysinline }
13+
""", "fence"), Nothing, Tuple{})
14+
end
15+
end
16+
end
17+
118
"""
219
sync_workgroup()
320
4-
Waits until all wavefronts in a workgroup have reached this call.
21+
Waits until all wavefronts in a workgroup have reached this call, and ensures that their memory accesses are visible to other threads in the workgroup.
522
"""
6-
@inline sync_workgroup() =
23+
@inline function sync_workgroup()
24+
# This needs a fence as well since the barrier has no memory semantics.
25+
# It's unclear which fences (or how many) are needed. To be safe, we follow what HIP does:
26+
# https://github.com/ROCm/llvm-project/blob/69549e0c2a54a526f90783f43b969871d9d3e41c/amd/device-libs/opencl/src/workgroup/wgbarrier.cl#L8-L38
27+
UnsafeAtomics.fence(UnsafeAtomics.seq_cst, AMDGPU.syncscope_workgroup)
728
ccall("llvm.amdgcn.s.barrier", llvmcall, Cvoid, ())
29+
UnsafeAtomics.fence(UnsafeAtomics.seq_cst, AMDGPU.syncscope_workgroup)
30+
end
831

932
"""
1033
sync_workgroup_count(predicate::Cint)::Cint

0 commit comments

Comments
 (0)