diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -44,38 +44,46 @@
 } // namespace synchronize
 
-namespace fence {
-
-/// Memory fence with \p Ordering semantics for the team.
-void team(int Ordering);
-
-/// Memory fence with \p Ordering semantics for the contention group.
-void kernel(int Ordering);
-
-/// Memory fence with \p Ordering semantics for the system.
-void system(int Ordering);
-
-} // namespace fence
-
 namespace atomic {
 
+enum OrderingTy {
+  relaxed = __ATOMIC_RELAXED,
+  acquire = __ATOMIC_ACQUIRE,
+  release = __ATOMIC_RELEASE,
+  acq_rel = __ATOMIC_ACQ_REL,
+  seq_cst = __ATOMIC_SEQ_CST,
+};
+
 /// Atomically load \p Addr with \p Ordering semantics.
-uint32_t load(uint32_t *Addr, int Ordering);
+uint32_t load(uint32_t *Addr, atomic::OrderingTy Ordering);
 
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-void store(uint32_t *Addr, uint32_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
 
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
 
 /// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
+uint32_t add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
 
 /// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
+uint64_t add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering);
 
 } // namespace atomic
 
+namespace fence {
+
+/// Memory fence with \p Ordering semantics for the team.
+void team(atomic::OrderingTy Ordering);
+
+/// Memory fence with \p Ordering semantics for the contention group.
+void kernel(atomic::OrderingTy Ordering);
+
+/// Memory fence with \p Ordering semantics for the system.
+void system(atomic::OrderingTy Ordering);
+
+} // namespace fence
+
 } // namespace _OMP
 
 #endif
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -211,7 +211,7 @@
     // to the number of slots in the buffer.
     bool IsMaster = (ThreadId == 0);
     while (IsMaster) {
-      Bound = atomic::load(&IterCnt, __ATOMIC_SEQ_CST);
+      Bound = atomic::load(&IterCnt, atomic::seq_cst);
       if (TeamId < Bound + num_of_records)
         break;
     }
@@ -223,12 +223,12 @@
     } else
       lgredFct(GlobalBuffer, ModBockId, reduce_data);
 
-    fence::system(__ATOMIC_SEQ_CST);
+    fence::system(atomic::seq_cst);
 
     // Increment team counter.
     // This counter is incremented by all teams in the current
     // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
+    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);
   }
   // Synchronize
   if (mapping::isSPMDMode())
@@ -304,7 +304,7 @@
   if (IsMaster && ChunkTeamCount == num_of_records - 1) {
     // Allow SIZE number of teams to proceed writing their
     // intermediate results to the global buffer.
-    atomic::add(&IterCnt, uint32_t(num_of_records), __ATOMIC_SEQ_CST);
+    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
   }
 
   return 0;
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -29,47 +29,52 @@
 ///
 ///{
 /// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering);
 
-uint32_t atomicLoad(uint32_t *Address, int Ordering) {
-  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_add(Address, 0U, Ordering);
 }
 
-void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) {
   __atomic_store_n(Address, Val, Ordering);
 }
 
-uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
-uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicMax(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_max(Address, Val, Ordering);
 }
 
-uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
+                        atomic::OrderingTy Ordering) {
   uint32_t R;
   __atomic_exchange(Address, &Val, &R, Ordering);
   return R;
 }
 uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
-                   int Ordering) {
+                   atomic::OrderingTy Ordering) {
   (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
                                   Ordering);
   return Compare;
 }
 
-uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
 ///}
 
 // Forward declarations, to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
 void namedBarrierInit();
 void namedBarrier();
-void fenceTeam(int Ordering);
-void fenceKernel(int Ordering);
-void fenceSystem(int Ordering);
+void fenceTeam(atomic::OrderingTy Ordering);
+void fenceKernel(atomic::OrderingTy Ordering);
+void fenceSystem(atomic::OrderingTy Ordering);
 void syncWarp(__kmpc_impl_lanemask_t);
 void syncThreads();
 void syncThreadsAligned() { syncThreads(); }
@@ -84,22 +89,22 @@
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) {
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
   // builtin_amdgcn_atomic_inc32 should expand to this switch when
   // passed a runtime value, but does not do so yet. Workaround here.
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_RELAXED:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, "");
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
+  case atomic::relaxed:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
+  case atomic::acquire:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
   }
 }
 
@@ -107,7 +112,7 @@
 
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+  atomic::store(&namedBarrierTracker, 0u, atomic::release);
 }
 
 void namedBarrier() {
@@ -117,7 +122,7 @@
   uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
 
-  fence::team(__ATOMIC_ACQUIRE);
+  fence::team(atomic::acquire);
 
   // named barrier implementation for amdgcn.
   // Uses two 16 bit unsigned counters. One for the number of waves to have
@@ -133,7 +138,7 @@
   // Increment the low 16 bits once, using the lowest active thread.
   if (mapping::isLeaderInWarp()) {
     uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                                __ATOMIC_RELAXED); // commutative
+                                atomic::relaxed); // commutative
 
     // Record the number of times the barrier has been passed
     uint32_t generation = load & 0xffff0000u;
@@ -145,61 +150,61 @@
       load &= 0xffff0000u; // because bits zeroed second
 
       // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
     } else {
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
       } while ((load & 0xffff0000u) == generation);
     }
   }
-  fence::team(__ATOMIC_RELEASE);
+  fence::team(atomic::release);
 }
 
 // sema checking of amdgcn_fence is aggressive. Intention is to patch clang
 // so that it is usable within a template environment and so that a runtime
 // value of the memory order is expanded to this switch within clang/llvm.
-void fenceTeam(int Ordering) {
+void fenceTeam(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+  case atomic::acquire:
+    return __builtin_amdgcn_fence(atomic::acquire, "workgroup");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "workgroup");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
   }
 }
 
-void fenceKernel(int Ordering) {
+void fenceKernel(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+  case atomic::acquire:
+    return __builtin_amdgcn_fence(atomic::acquire, "agent");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "agent");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
   }
 }
 
-void fenceSystem(int Ordering) {
+void fenceSystem(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+  case atomic::acquire:
+    return __builtin_amdgcn_fence(atomic::acquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "");
   }
 }
 
@@ -226,7 +231,8 @@
 #pragma omp begin declare variant match(                                      \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
 
@@ -268,11 +274,11 @@
 // called before it is defined
 // here the overload won't happen. Investigate later!
 void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
 }
 
 int testLock(omp_lock_t *Lock) {
-  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
 }
 
 void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
@@ -281,7 +287,7 @@
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
-  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
     int32_t start = __nvvm_read_ptx_sreg_clock();
     int32_t now;
     for (;;) {
@@ -310,29 +316,29 @@
 
 void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
 
-void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
 
-void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
 
-void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
 
-uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
   return impl::atomicLoad(Addr, Ordering);
 }
 
-void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   impl::atomicStore(Addr, V, Ordering);
 }
 
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   return impl::atomicInc(Addr, V, Ordering);
 }
 
-uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
  return impl::atomicAdd(Addr, V, Ordering);
 }
 
-uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
   return impl::atomicAdd(Addr, V, Ordering);
 }
 
@@ -389,7 +395,7 @@
 
 void __kmpc_flush(IdentTy *Loc) {
   FunctionTracingRAII();
-  fence::kernel(__ATOMIC_SEQ_CST);
+  fence::kernel(atomic::seq_cst);
 }
 
 uint64_t __kmpc_warp_active_thread_mask(void) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -329,7 +329,7 @@
   __kmpc_barrier(loc, threadId);
   if (tid == 0) {
     Cnt = 0;
-    fence::team(__ATOMIC_SEQ_CST);
+    fence::team(atomic::seq_cst);
   }
   __kmpc_barrier(loc, threadId);
 }
@@ -346,7 +346,7 @@
   unsigned int rank = utils::popc(active & lane_mask_lt);
   uint64_t warp_res = 0;
   if (rank == 0) {
-    warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
   }
   warp_res = utils::shuffle(active, warp_res, leader);
   return warp_res + rank;
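Note for readers: the sketch below is standalone host code, not part of the patch. The wrapper subset and names (add, load, main, Cnt) are illustrative, and it assumes a Clang/GCC-style compiler that predefines the __ATOMIC_* macros and provides the __atomic_* builtins. It shows the point of the new enum: the enumerators keep the macros' integer values, so they forward to the builtins unchanged, while the typed parameter rejects an arbitrary int at the call site, which the old int Ordering signatures silently accepted.

// Standalone illustration of the OrderingTy pattern (host-only, hypothetical
// wrapper subset; not part of the DeviceRTL patch above).
#include <cstdint>
#include <cstdio>

namespace atomic {
// Enumerator values mirror the compiler's __ATOMIC_* macros, so values pass
// through to the __atomic_* builtins without translation.
enum OrderingTy {
  relaxed = __ATOMIC_RELAXED,
  acquire = __ATOMIC_ACQUIRE,
  release = __ATOMIC_RELEASE,
  acq_rel = __ATOMIC_ACQ_REL,
  seq_cst = __ATOMIC_SEQ_CST,
};

// Typed wrappers mirroring the shape of the patch's atomic::add / atomic::load.
uint32_t add(uint32_t *Addr, uint32_t V, OrderingTy Ordering) {
  return __atomic_fetch_add(Addr, V, Ordering);
}

uint32_t load(uint32_t *Addr, OrderingTy Ordering) {
  return __atomic_load_n(Addr, Ordering);
}
} // namespace atomic

int main() {
  uint32_t Cnt = 0;
  atomic::add(&Cnt, 1u, atomic::seq_cst);
  // atomic::add(&Cnt, 1u, 3); // no longer compiles: no implicit int -> OrderingTy
  std::printf("Cnt = %u\n", (unsigned)atomic::load(&Cnt, atomic::relaxed));
  return 0;
}

Because the enumerator values are identical to the macro values, the switch-based workarounds above (atomicInc, fenceTeam, fenceKernel, fenceSystem) can pass atomic::seq_cst and friends straight to builtins that expect the __ATOMIC_* constants.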