diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -54,20 +54,60 @@ seq_cst = __ATOMIC_SEQ_CST, }; -/// Atomically load \p Addr with \p Ordering semantics. -uint32_t load(uint32_t *Addr, atomic::OrderingTy Ordering); - -/// Atomically store \p V to \p Addr with \p Ordering semantics. -void store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering); - /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. -uint32_t inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering); +uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering); + +/// Atomically perform on \p V and \p *Addr with \p Ordering semantics. The +/// result is stored in \p *Addr; +/// { + +#define ATOMIC_COMMON_OP(TY) \ + TY add(TY *Addr, TY V, OrderingTy Ordering); \ + TY mul(TY *Addr, TY V, OrderingTy Ordering); \ + TY load(TY *Addr, OrderingTy Ordering); \ + void store(TY *Addr, TY V, OrderingTy Ordering); \ + bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc, \ + OrderingTy OrderingFail); + +#define ATOMIC_FP_ONLY_OP(TY) \ + TY min(TY *Addr, TY V, OrderingTy Ordering); \ + TY max(TY *Addr, TY V, OrderingTy Ordering); + +#define ATOMIC_INT_ONLY_OP(TY) \ + TY min(TY *Addr, TY V, OrderingTy Ordering); \ + TY max(TY *Addr, TY V, OrderingTy Ordering); \ + TY bit_or(TY *Addr, TY V, OrderingTy Ordering); \ + TY bit_and(TY *Addr, TY V, OrderingTy Ordering); \ + TY bit_xor(TY *Addr, TY V, OrderingTy Ordering); + +#define ATOMIC_FP_OP(TY) \ + ATOMIC_FP_ONLY_OP(TY) \ + ATOMIC_COMMON_OP(TY) + +#define ATOMIC_INT_OP(TY) \ + ATOMIC_INT_ONLY_OP(TY) \ + ATOMIC_COMMON_OP(TY) + +// This needs to be kept in sync with the header. Also the reason we don't use +// templates here. 
+ATOMIC_INT_OP(int8_t) +ATOMIC_INT_OP(int16_t) +ATOMIC_INT_OP(int32_t) +ATOMIC_INT_OP(int64_t) +ATOMIC_INT_OP(uint8_t) +ATOMIC_INT_OP(uint16_t) +ATOMIC_INT_OP(uint32_t) +ATOMIC_INT_OP(uint64_t) +ATOMIC_FP_OP(float) +ATOMIC_FP_OP(double) + +#undef ATOMIC_INT_ONLY_OP +#undef ATOMIC_FP_ONLY_OP +#undef ATOMIC_COMMON_OP +#undef ATOMIC_INT_OP +#undef ATOMIC_FP_OP -/// Atomically add \p V to \p *Addr with \p Ordering semantics. -uint32_t add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering); - -/// Atomically add \p V to \p *Addr with \p Ordering semantics. -uint64_t add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering); +///} } // namespace atomic diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h --- a/openmp/libomptarget/DeviceRTL/include/Utils.h +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -77,6 +77,11 @@ /// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). bool isSharedMemPtr(void *Ptr); +/// Return \p V typed punned as \p DstTy. +template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) { + return *((DstTy *)(&V)); +} + /// A pointer variable that has by design an `undef` value. Use with care. 
__attribute__((loader_uninitialized)) static void *const UndefPtr; diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -32,40 +32,87 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering); -uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) { - return __atomic_fetch_add(Address, 0U, Ordering); +template <typename Ty> +Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + return __atomic_fetch_add(Address, Val, Ordering); +} + +template <typename Ty> +Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) { + Ty TypedCurrentVal, TypedNewVal; + bool Success; + do { + TypedCurrentVal = atomic::load(Address, Ordering); + TypedNewVal = TypedCurrentVal * V; + Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, + atomic::relaxed); + } while (!Success); + return TypedCurrentVal; +} + +template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) { + return atomicAdd(Address, Ty(0), Ordering); } -void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) { +template <typename Ty> +void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { __atomic_store_n(Address, Val, Ordering); } -uint32_t atomicAdd(uint32_t *Address, uint32_t Val, - atomic::OrderingTy Ordering) { - return __atomic_fetch_add(Address, Val, Ordering); +template <typename Ty> +bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV, + atomic::OrderingTy OrderingSucc, + atomic::OrderingTy OrderingFail) { + return __atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, + OrderingSucc, OrderingFail); } -uint32_t atomicMax(uint32_t *Address, uint32_t Val, - atomic::OrderingTy Ordering) { + +template <typename Ty> +Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + return __atomic_fetch_min(Address, Val, Ordering); +} + +template <typename Ty> +Ty 
atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { return __atomic_fetch_max(Address, Val, Ordering); } +// TODO: Implement this with __atomic_fetch_max and remove the duplication. +template <typename Ty, typename STy, typename UTy> +Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + if (Val >= 0) + return atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering); + return atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering); +} + +template <typename Ty, typename STy, typename UTy> +Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + if (Val >= 0) + return atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering); + return atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering); +} + +template <typename Ty> +Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + return __atomic_fetch_or(Address, Val, Ordering); +} + +template <typename Ty> +Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + return __atomic_fetch_and(Address, Val, Ordering); +} + +template <typename Ty> +Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) { + return __atomic_fetch_xor(Address, Val, Ordering); +} + uint32_t atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) { uint32_t R; __atomic_exchange(Address, &Val, &R, Ordering); return R; } -uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val, - atomic::OrderingTy Ordering) { - (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering, - Ordering); - return Compare; -} - -uint64_t atomicAdd(uint64_t *Address, uint64_t Val, - atomic::OrderingTy Ordering) { - return __atomic_fetch_add(Address, Val, Ordering); -} ///} // Forward declarations defined to be defined for AMDGCN and NVPTX. @@ -287,7 +334,8 @@ void setLock(omp_lock_t *Lock) { // TODO: not sure spinning is a good idea here.. 
- while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) { + while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, + atomic::seq_cst)) { int32_t start = __nvvm_read_ptx_sreg_clock(); int32_t now; for (;;) { @@ -322,24 +370,84 @@ void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } -uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) { - return impl::atomicLoad(Addr, Ordering); -} +#define ATOMIC_COMMON_OP(TY) \ + TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicAdd(Addr, V, Ordering); \ + } \ + TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicMul(Addr, V, Ordering); \ + } \ + TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) { \ + return impl::atomicLoad(Addr, Ordering); \ + } \ + bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV, \ + atomic::OrderingTy OrderingSucc, \ + atomic::OrderingTy OrderingFail) { \ + return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc, \ + OrderingFail); \ + } -void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) { - impl::atomicStore(Addr, V, Ordering); -} +#define ATOMIC_FP_ONLY_OP(TY, STY, UTY) \ + TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering); \ + } \ + TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering); \ + } \ + void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + impl::atomicStore(reinterpret_cast<UTY *>(Addr), \ + utils::convertViaPun<UTY>(V), Ordering); \ + } -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) { - return impl::atomicInc(Addr, V, Ordering); -} +#define ATOMIC_INT_ONLY_OP(TY) \ + TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicMin(Addr, V, Ordering); \ + } \ + TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return 
impl::atomicMax(Addr, V, Ordering); \ + } \ + TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicOr(Addr, V, Ordering); \ + } \ + TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicAnd(Addr, V, Ordering); \ + } \ + TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + return impl::atomicXOr(Addr, V, Ordering); \ + } \ + void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) { \ + impl::atomicStore(Addr, V, Ordering); \ + } -uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) { - return impl::atomicAdd(Addr, V, Ordering); -} +#define ATOMIC_FP_OP(TY, STY, UTY) \ + ATOMIC_FP_ONLY_OP(TY, STY, UTY) \ + ATOMIC_COMMON_OP(TY) + +#define ATOMIC_INT_OP(TY) \ + ATOMIC_INT_ONLY_OP(TY) \ + ATOMIC_COMMON_OP(TY) + +// This needs to be kept in sync with the header. Also the reason we don't use +// templates here. +ATOMIC_INT_OP(int8_t) +ATOMIC_INT_OP(int16_t) +ATOMIC_INT_OP(int32_t) +ATOMIC_INT_OP(int64_t) +ATOMIC_INT_OP(uint8_t) +ATOMIC_INT_OP(uint16_t) +ATOMIC_INT_OP(uint32_t) +ATOMIC_INT_OP(uint64_t) +ATOMIC_FP_OP(float, int32_t, uint32_t) +ATOMIC_FP_OP(double, int64_t, uint64_t) + +#undef ATOMIC_INT_ONLY_OP +#undef ATOMIC_FP_ONLY_OP +#undef ATOMIC_COMMON_OP +#undef ATOMIC_INT_OP +#undef ATOMIC_FP_OP -uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) { - return impl::atomicAdd(Addr, V, Ordering); +uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) { + return impl::atomicInc(Addr, V, Ordering); } extern "C" {