diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -44,14 +44,11 @@ namespace atomic { -/// Atomically read \p Addr with \p Ordering semantics. -uint32_t read(uint32_t *Addr, int Ordering); +/// Atomically load \p Addr with \p Ordering semantics. +uint32_t load(uint32_t *Addr, int Ordering); /// Atomically store \p V to \p Addr with \p Ordering semantics. -uint32_t store(uint32_t *Addr, uint32_t V, int Ordering); - -/// Atomically store \p V to \p Addr with \p Ordering semantics. -uint64_t store(uint64_t *Addr, uint64_t V, int Ordering); +void store(uint32_t *Addr, uint32_t V, int Ordering); /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering); diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -29,7 +29,7 @@ #pragma omp begin declare variant match(device = {arch(amdgcn)}) constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::AMDGPUGridValues; + return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); } uint32_t getGridDim(uint32_t n, uint16_t d) { diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp --- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -209,7 +209,7 @@ // to the number of slots in the buffer. bool IsMaster = (ThreadId == 0); while (IsMaster) { - Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); + Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); if (TeamId < Bound + num_of_records) break; } diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -31,10 +31,14 @@ /// NOTE: This function needs to be implemented by every target. uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering); -uint32_t atomicRead(uint32_t *Address, int Ordering) { +uint32_t atomicLoad(uint32_t *Address, int Ordering) { return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST); } +void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) { + __atomic_store_n(Address, Val, Ordering); +} + uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) { return __atomic_fetch_add(Address, Val, Ordering); } @@ -68,7 +72,7 @@ return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, ""); } -uint32_t SHARD(namedBarrierTracker); +uint32_t SHARED(namedBarrierTracker); void namedBarrierInit() { // Don't have global ctors, and shared memory is not zero init @@ -79,7 +83,7 @@ uint32_t NumThreads = omp_get_num_threads(); // assert(NumThreads % 32 == 0); - uint32_t WarpSize = maping::getWarpSize(); + uint32_t WarpSize = mapping::getWarpSize(); uint32_t NumWaves = NumThreads / WarpSize; fence::team(__ATOMIC_ACQUIRE); @@ -115,7 +119,7 @@ // more waves still to go, spin until generation counter changes do { __builtin_amdgcn_s_sleep(0); - load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED); + load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED); } while ((load & 0xffff0000u) == generation); } } @@ -192,7 +196,7 @@ void initLock(omp_lock_t *Lock) { unsetLock(Lock); } -void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); } +void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } void setLock(omp_lock_t *Lock) { // TODO: not sure spinning is a good idea here.. @@ -229,8 +233,12 @@ void fence::system(int Ordering) { impl::fenceSystem(Ordering); } -uint32_t atomic::read(uint32_t *Addr, int Ordering) { - return impl::atomicRead(Addr, Ordering); +uint32_t atomic::load(uint32_t *Addr, int Ordering) { + return impl::atomicLoad(Addr, Ordering); +} + +void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) { + impl::atomicStore(Addr, V, Ordering); } uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) { @@ -300,7 +308,7 @@ void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } -void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); } +void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -35,8 +35,9 @@ #pragma omp begin declare variant match(device = {arch(amdgcn)}) void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { - *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF)); - *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32); + static_assert(sizeof(unsigned long) == 8, ""); + *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL); + *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32); } uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { @@ -75,7 +76,7 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { int Width = mapping::getWarpSize(); - int Self = mapping::getgetThreadIdInWarp(); + int Self = mapping::getThreadIdInWarp(); int Index = SrcLane + (Self & ~(Width - 1)); return __builtin_amdgcn_ds_bpermute(Index << 2, Var); }