diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -54,20 +54,54 @@
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
-/// Atomically load \p Addr with \p Ordering semantics.
-uint32_t load(uint32_t *Addr, atomic::OrderingTy Ordering);
-
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-void store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
+void store(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
 
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
-
-/// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint32_t add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+
+/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics.
+/// The result is stored in \p *Addr;
+/// {
+
+#define ATOMIC_FP_AND_INT_OP(TY)                                               \
+  TY add(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY mul(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY load(TY *Addr, OrderingTy Ordering);                                      \
+  bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc,       \
+           OrderingTy OrderingFail);
+
+#define ATOMIC_FP_ONLY_OP(TY)                                                  \
+  TY min(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY max(TY *Addr, TY V, OrderingTy Ordering);
+
+#define ATOMIC_FP_OP(TY)                                                       \
+  ATOMIC_FP_AND_INT_OP(TY)                                                     \
+  ATOMIC_FP_ONLY_OP(TY)
+
+#define ATOMIC_OP(TY)                                                          \
+  ATOMIC_FP_AND_INT_OP(TY)                                                     \
+  TY bit_or(TY *Addr, TY V, OrderingTy Ordering);                              \
+  TY bit_and(TY *Addr, TY V, OrderingTy Ordering);                             \
+  TY bit_xor(TY *Addr, TY V, OrderingTy Ordering);
+
+ATOMIC_OP(int8_t)
+ATOMIC_OP(int16_t)
+ATOMIC_OP(int32_t)
+ATOMIC_OP(int64_t)
+ATOMIC_OP(uint8_t)
+ATOMIC_OP(uint16_t)
+ATOMIC_OP(uint32_t)
+ATOMIC_OP(uint64_t)
+ATOMIC_FP_OP(float)
+ATOMIC_FP_OP(double)
+
+#undef ATOMIC_FP_AND_INT_OP
+#undef ATOMIC_FP_ONLY_OP
+#undef ATOMIC_FP_OP
+#undef ATOMIC_OP
 
-/// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint64_t add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering);
+///}
 
 } // namespace atomic
diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h
--- a/openmp/libomptarget/DeviceRTL/include/Utils.h
+++ b/openmp/libomptarget/DeviceRTL/include/Utils.h
@@ -74,6 +74,11 @@
   return V - V % Align;
 }
 
+/// Return \p V typed punned as \p DstTy.
+template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
+  return *((DstTy *)(&V));
+}
+
 /// A pointer variable that has by design an `undef` value. Use with care.
__attribute__((loader_uninitialized)) static void *const UndefPtr; diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -32,22 +32,88 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering); -uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) { - return __atomic_fetch_add(Address, 0U, Ordering); -} - void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) { __atomic_store_n(Address, Val, Ordering); } -uint32_t atomicAdd(uint32_t *Address, uint32_t Val, - atomic::OrderingTy Ordering) { - return __atomic_fetch_add(Address, Val, Ordering); -} -uint32_t atomicMax(uint32_t *Address, uint32_t Val, - atomic::OrderingTy Ordering) { - return __atomic_fetch_max(Address, Val, Ordering); -} +#define ATOMIC_FP_AND_INT_OP(TY) \ + TY atomicAdd(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_add(Address, Val, Ordering); \ + } \ + TY atomicMul(TY *Address, TY V, atomic::OrderingTy Ordering) { \ + TY TypedCurrentVal, TypedResultVal, TypedNewVal; \ + bool Success; \ + do { \ + TypedCurrentVal = atomic::load(Address, Ordering); \ + TypedNewVal = TypedCurrentVal * V; \ + Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, \ + atomic::relaxed); \ + } while (!Success); \ + return TypedResultVal; \ + } \ + TY atomicLoad(TY *Address, atomic::OrderingTy Ordering) { \ + return atomicAdd(Address, TY(0), Ordering); \ + } \ + bool atomicCAS(TY *Address, TY ExpectedV, TY DesiredV, \ + atomic::OrderingTy OrderingSucc, \ + atomic::OrderingTy OrderingFail) { \ + return __atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, \ + OrderingSucc, OrderingFail); \ + } + +#define ATOMIC_FP_ONLY_OP(TY, SINT_TY, UINT_TY) \ + TY atomicMin(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + 
if (Val >= 0) \ + return atomicMin((SINT_TY *)Address, utils::convertViaPun(Val), \ + Ordering); \ + return atomicMax((UINT_TY *)Address, utils::convertViaPun(Val), \ + Ordering); \ + } \ + TY atomicMax(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + if (Val >= 0) \ + return atomicMax((SINT_TY *)Address, utils::convertViaPun(Val), \ + Ordering); \ + return atomicMin((UINT_TY *)Address, utils::convertViaPun(Val), \ + Ordering); \ + } + +#define ATOMIC_FP_OP(TY, SINT_TY, UINT_TY) \ + ATOMIC_FP_AND_INT_OP(TY) \ + ATOMIC_FP_ONLY_OP(TY, SINT_TY, UINT_TY) + +#define ATOMIC_OP(TY) \ + ATOMIC_FP_AND_INT_OP(TY) \ + TY atomicOr(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_or(Address, Val, Ordering); \ + } \ + TY atomicAnd(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_and(Address, Val, Ordering); \ + } \ + TY atomicXOr(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_xor(Address, Val, Ordering); \ + } \ + TY atomicMin(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_min(Address, Val, Ordering); \ + } \ + TY atomicMax(TY *Address, TY Val, atomic::OrderingTy Ordering) { \ + return __atomic_fetch_max(Address, Val, Ordering); \ + } + +ATOMIC_OP(int8_t) +ATOMIC_OP(int16_t) +ATOMIC_OP(int32_t) +ATOMIC_OP(int64_t) +ATOMIC_OP(uint8_t) +ATOMIC_OP(uint16_t) +ATOMIC_OP(uint32_t) +ATOMIC_OP(uint64_t) +ATOMIC_FP_OP(float, int32_t, uint32_t) +ATOMIC_FP_OP(double, int64_t, uint64_t) + +#undef ATOMIC_FP_AND_INT_OP +#undef ATOMIC_FP_ONLY_OP +#undef ATOMIC_FP_OP +#undef ATOMIC_OP uint32_t atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) { @@ -55,17 +121,6 @@ __atomic_exchange(Address, &Val, &R, Ordering); return R; } -uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val, - atomic::OrderingTy Ordering) { - (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering, - Ordering); - return Compare; -} - 
-uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
-                   atomic::OrderingTy Ordering) {
-  return __atomic_fetch_add(Address, Val, Ordering);
-}
 ///}
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
@@ -287,7 +342,8 @@
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
-  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
+  // atomicCAS now returns bool (true == exchange succeeded); spin until it
+  // does.
+  while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
+                    atomic::seq_cst)) {
     int32_t start = __nvvm_read_ptx_sreg_clock();
     int32_t now;
     for (;;) {
@@ -322,9 +378,62 @@
 void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
 
-uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
-  return impl::atomicLoad(Addr, Ordering);
-}
+#define ATOMIC_FP_AND_INT_OP(TY)                                               \
+  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicAdd(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMul(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
+    return impl::atomicLoad(Addr, Ordering);                                   \
+  }                                                                            \
+  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
+                   atomic::OrderingTy OrderingSucc,                            \
+                   atomic::OrderingTy OrderingFail) {                          \
+    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
+                           OrderingFail);                                      \
+  }
+
+#define ATOMIC_FP_ONLY_OP(TY)                                                  \
+  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMin(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMax(Addr, V, Ordering);                                 \
+  }
+
+#define ATOMIC_FP_OP(TY)                                                       \
+  ATOMIC_FP_AND_INT_OP(TY)                                                     \
+  ATOMIC_FP_ONLY_OP(TY)
+
+#define ATOMIC_OP(TY)                                                          \
+  ATOMIC_FP_AND_INT_OP(TY)                                                     \
+  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
+    return impl::atomicOr(Addr, V, Ordering);                                  \
+  }                                                                            \
+  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    return impl::atomicAnd(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    return impl::atomicXOr(Addr, V, Ordering);                                 \
+  }
+
+ATOMIC_OP(int8_t)
+ATOMIC_OP(int16_t)
+ATOMIC_OP(int32_t)
+ATOMIC_OP(int64_t)
+ATOMIC_OP(uint8_t)
+ATOMIC_OP(uint16_t)
+ATOMIC_OP(uint32_t)
+ATOMIC_OP(uint64_t)
+ATOMIC_FP_OP(float)
+ATOMIC_FP_OP(double)
+
+#undef ATOMIC_FP_AND_INT_OP
+#undef ATOMIC_FP_ONLY_OP
+#undef ATOMIC_FP_OP
+#undef ATOMIC_OP
 
 void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   impl::atomicStore(Addr, V, Ordering);
@@ -334,14 +443,6 @@
   return impl::atomicInc(Addr, V, Ordering);
 }
 
-uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicAdd(Addr, V, Ordering);
-}
-
-uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicAdd(Addr, V, Ordering);
-}
-
 extern "C" {
 void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }