diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -26,8 +26,15 @@
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
+enum MemScopeTy {
+  all,    // All threads on all devices
+  device, // All threads on the device
+  cgroup  // All threads in the contention group, e.g. the team
+};
+
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
+             MemScopeTy MemScope = MemScopeTy::all);
 
 /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
 /// result is stored in \p *Addr;
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -228,7 +228,8 @@
       // Increment team counter.
       // This counter is incremented by all teams in the current
       // BUFFER_SIZE chunk.
-      ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);
+      ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
+                                   atomic::MemScopeTy::device);
     }
     // Synchronize
     if (mapping::isSPMDMode())
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -29,8 +29,8 @@
 ///
 ///{
 /// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 
 template <typename Ty>
 Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
@@ -116,7 +116,8 @@
 ///}
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 void namedBarrierInit();
 void namedBarrier();
 void fenceTeam(atomic::OrderingTy Ordering);
@@ -138,22 +139,35 @@
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   // builtin_amdgcn_atomic_inc32 should expand to this switch when
   // passed a runtime value, but does not do so yet. Workaround here.
+
+#define ScopeSwitch(ORDER)                                                     \
+  switch (MemScope) {                                                          \
+  case atomic::MemScopeTy::all:                                                \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
+  case atomic::MemScopeTy::device:                                             \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
+  case atomic::MemScopeTy::cgroup:                                             \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
+  }
+
+#define Case(ORDER)                                                            \
+  case ORDER:                                                                  \
+    ScopeSwitch(ORDER)
+
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case atomic::relaxed:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
-  case atomic::aquire:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
-  case atomic::release:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
-  case atomic::acq_rel:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
-  case atomic::seq_cst:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
+    Case(atomic::relaxed);
+    Case(atomic::aquire);
+    Case(atomic::release);
+    Case(atomic::acq_rel);
+    Case(atomic::seq_cst);
+#undef Case
+#undef ScopeSwitch
   }
 }
 
@@ -308,8 +322,8 @@
                  device = {arch(nvptx, nvptx64)},                              \
                  implementation = {extension(match_any)})
 
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
 
@@ -480,8 +494,9 @@
 #undef ATOMIC_INT_OP
 #undef ATOMIC_FP_OP
 
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicInc(Addr, V, Ordering);
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
+                     atomic::MemScopeTy MemScope) {
+  return impl::atomicInc(Addr, V, Ordering, MemScope);
 }
 
 void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
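
Note on the interface change: the new MemScope parameter of atomic::inc defaults to MemScopeTy::all, so every existing caller keeps compiling with the old system-wide scope, and only the team-reduction counter in Reduction.cpp opts into the narrower device scope (its Cnt is only ever contended by teams on one device). A minimal host-side sketch of that call-site effect; the body of atomic::inc below is a hypothetical mock (the real one forwards to impl::atomicInc) that only models the inc-and-wrap-at-V value semantics:

#include <cstdint>

namespace atomic {
enum OrderingTy { relaxed, aquire, release, acq_rel, seq_cst };
enum MemScopeTy { all, device, cgroup };

// Hypothetical mock: the real atomic::inc forwards to impl::atomicInc; this
// body only models the value semantics so the sketch runs on a host compiler.
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
             MemScopeTy MemScope = MemScopeTy::all) {
  (void)Ordering;
  (void)MemScope;
  uint32_t Old = *Addr;
  *Addr = (Old >= V) ? 0 : Old + 1; // increment, wrapping at V
  return Old;
}
} // namespace atomic

int main() {
  uint32_t Cnt = 0;
  const uint32_t num_of_records = 4;

  // Pre-existing caller: compiles unchanged, MemScope defaults to all.
  atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);

  // Reduction.cpp after this patch: Cnt is only contended by teams on the
  // same device, so the narrower device scope suffices.
  atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
              atomic::MemScopeTy::device);
  return 0;
}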
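
On the AMDGCN path, __builtin_amdgcn_atomic_inc32 needs compile-time-constant ordering and scope arguments, so the runtime (Ordering, MemScope) pair is lowered through two nested switches; the ScopeSwitch/Case macros stamp out the full ordering-by-scope cross product, mapping all -> "" (system), device -> "agent", cgroup -> "workgroup". A self-contained sketch of the same two-level dispatch, with a hypothetical demoInc standing in for the builtin so it runs on a host compiler:

#include <cstdint>
#include <cstdio>

enum OrderingTy { relaxed, aquire, release, acq_rel, seq_cst };
enum MemScopeTy { all, device, cgroup };

// Hypothetical stand-in for __builtin_amdgcn_atomic_inc32: records which
// constants the dispatch selected and models increment-with-wrap-at-V.
static uint32_t demoInc(uint32_t *A, uint32_t V, int Order,
                        const char *Scope) {
  uint32_t Old = *A;
  *A = (Old >= V) ? 0 : Old + 1;
  std::printf("inc(order=%d, scope=\"%s\")\n", Order, Scope);
  return Old;
}

// Same shape as the patch: the inner macro switches on the runtime scope,
// the outer macro stamps one such switch into each ordering case.
#define ScopeSwitch(ORDER)                                                     \
  switch (MemScope) {                                                          \
  case all:                                                                    \
    return demoInc(A, V, ORDER, "");                                           \
  case device:                                                                 \
    return demoInc(A, V, ORDER, "agent");                                      \
  case cgroup:                                                                 \
    return demoInc(A, V, ORDER, "workgroup");                                  \
  }

#define Case(ORDER)                                                            \
  case ORDER:                                                                  \
    ScopeSwitch(ORDER)

static uint32_t atomicInc(uint32_t *A, uint32_t V, OrderingTy Ordering,
                          MemScopeTy MemScope) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
    Case(relaxed);
    Case(aquire);
    Case(release);
    Case(acq_rel);
    Case(seq_cst);
#undef Case
#undef ScopeSwitch
  }
  __builtin_unreachable(); // every (ordering, scope) pair returned above
}

int main() {
  uint32_t Cnt = 0;
  atomicInc(&Cnt, 3u, seq_cst, device); // prints: inc(order=4, scope="agent")
  return 0;
}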