diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -26,8 +26,15 @@
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
+enum MemScopeTy {
+  all,    // All threads on all devices
+  device, // All threads on the device
+  cgroup  // All threads in the contention group, e.g. the team
+};
+
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
+             MemScopeTy MemScope = MemScopeTy::all);
 
 /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
 /// result is stored in \p *Addr;
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -223,12 +223,15 @@
     } else
       lgredFct(GlobalBuffer, ModBockId, reduce_data);
 
+#ifndef __AMDGCN__
     fence::system(atomic::seq_cst);
+#endif
 
     // Increment team counter.
     // This counter is incremented by all teams in the current
     // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);
+    ChunkTeamCount =
+        atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst, atomic::device);
   }
   // Synchronize
   if (mapping::isSPMDMode())
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -29,8 +29,8 @@
 ///
 ///{
 /// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 
 template <typename Ty>
 Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
@@ -116,7 +116,8 @@
 ///}
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 void namedBarrierInit();
 void namedBarrier();
 void fenceTeam(atomic::OrderingTy Ordering);
@@ -138,22 +139,90 @@
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
+uint32_t atomicIncRelaxed(uint32_t *A, uint32_t V,
+                          atomic::MemScopeTy MemScope) {
+  switch (MemScope) {
+  default:
+    __builtin_unreachable();
+  case atomic::all:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
+  case atomic::device:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "agent");
+  case atomic::cgroup:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "workgroup");
+  }
+}
+
+uint32_t atomicIncAquire(uint32_t *A, uint32_t V, atomic::MemScopeTy MemScope) {
+  switch (MemScope) {
+  default:
+    __builtin_unreachable();
+  case atomic::all:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
+  case atomic::device:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "agent");
+  case atomic::cgroup:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "workgroup");
+  }
+}
+
+uint32_t atomicIncRelease(uint32_t *A, uint32_t V,
+                          atomic::MemScopeTy MemScope) {
+  switch (MemScope) {
+  default:
+    __builtin_unreachable();
+  case atomic::all:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
+  case atomic::device:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "agent");
+  case atomic::cgroup:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "workgroup");
+  }
+}
+
+uint32_t atomicIncAcqRel(uint32_t *A, uint32_t V, atomic::MemScopeTy MemScope) {
+  switch (MemScope) {
+  default:
+    __builtin_unreachable();
+  case atomic::all:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
+  case atomic::device:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "agent");
+  case atomic::cgroup:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "workgroup");
+  }
+}
+
+uint32_t atomicIncSeqCst(uint32_t *A, uint32_t V, atomic::MemScopeTy MemScope) {
+  switch (MemScope) {
+  default:
+    __builtin_unreachable();
+  case atomic::all:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
+  case atomic::device:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "agent");
+  case atomic::cgroup:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "workgroup");
+  }
+}
+
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   // builtin_amdgcn_atomic_inc32 should expand to this switch when
   // passed a runtime value, but does not do so yet. Workaround here.
   switch (Ordering) {
   default:
     __builtin_unreachable();
   case atomic::relaxed:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
+    return atomicIncRelaxed(A, V, MemScope);
   case atomic::aquire:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
+    return atomicIncAquire(A, V, MemScope);
   case atomic::release:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
+    return atomicIncRelease(A, V, MemScope);
   case atomic::acq_rel:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
+    return atomicIncAcqRel(A, V, MemScope);
   case atomic::seq_cst:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
+    return atomicIncSeqCst(A, V, MemScope);
   }
 }
 
@@ -308,8 +377,8 @@
     device = {arch(nvptx, nvptx64)},                                           \
     implementation = {extension(match_any)})
 
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
 
@@ -480,8 +549,9 @@
 #undef ATOMIC_INT_OP
 #undef ATOMIC_FP_OP
 
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicInc(Addr, V, Ordering);
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
+                     atomic::MemScopeTy MemScope) {
+  return impl::atomicInc(Addr, V, Ordering, MemScope);
 }
 
 void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
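For reference, the header comment above documents atomic::inc as an increment that wraps at V, which matches the semantics of the AMDGCN and NVPTX inc builtins it lowers to: the old value of *Addr is returned, and 0 is stored when the old value was >= V, otherwise old + 1. The snippet below is a minimal host-side sketch of that behaviour using std::atomic; incWrap and NumRecords are illustrative names and not part of the patch, and the new memory-scope parameter has no host analogue, so it is omitted.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Host-side model of the wrap-at-V increment the device-side atomic::inc
// provides: atomically return the old counter value and store 0 if that old
// value was >= V, otherwise old + 1.
static uint32_t incWrap(std::atomic<uint32_t> &Counter, uint32_t V) {
  uint32_t Old = Counter.load(std::memory_order_relaxed);
  uint32_t New;
  do {
    New = (Old >= V) ? 0u : Old + 1u;
  } while (!Counter.compare_exchange_weak(Old, New, std::memory_order_seq_cst,
                                          std::memory_order_relaxed));
  return Old;
}

int main() {
  std::atomic<uint32_t> Cnt{0};
  const uint32_t NumRecords = 4;
  // Mimic the Reduction.cpp call shape: with a wrap bound of NumRecords - 1,
  // the counter cycles through 0, 1, 2, 3, 0, 1, ...
  for (int I = 0; I < 6; ++I)
    std::printf("%u ", incWrap(Cnt, NumRecords - 1u));
  std::printf("\n"); // prints: 0 1 2 3 0 1
}

Because the wrap bound in Reduction.cpp is num_of_records - 1u, Cnt cycles through [0, num_of_records), so a team that observes the value num_of_records - 1 can identify itself as the last team of the current chunk.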