Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -130,6 +130,8 @@ void initLock(omp_lock_t *); void destroyLock(omp_lock_t *); void setLock(omp_lock_t *); +void unsetCriticalLock(omp_lock_t *); +void setCriticalLock(omp_lock_t *); /// AMDGCN Implementation /// @@ -269,6 +271,25 @@ void destroyLock(omp_lock_t *) { __builtin_trap(); } void setLock(omp_lock_t *) { __builtin_trap(); } +constexpr uint32_t UNSET = 0; +constexpr uint32_t SET = 1; + +void unsetCriticalLock(omp_lock_t *Lock) { + (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); +} + +void setCriticalLock(omp_lock_t *Lock) { + uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; + if (mapping::getThreadIdInWarp() == LowestActiveThread) { + fenceKernel(atomic::release); + while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::relaxed, + atomic::relaxed)) { + __builtin_amdgcn_s_sleep(32); + } + fenceKernel(atomic::aquire); + } +} + #pragma omp end declare variant ///} @@ -450,6 +471,14 @@ return impl::atomicInc(Addr, V, Ordering); } +void unsetCriticalLock(omp_lock_t *Lock) { + impl::unsetLock(Lock); +} + +void setCriticalLock(omp_lock_t *Lock) { + impl::setLock(Lock); +} + extern "C" { void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); } @@ -518,12 +547,12 @@ void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { FunctionTracingRAII(); - omp_set_lock(reinterpret_cast(Name)); + impl::setCriticalLock(reinterpret_cast(Name)); } void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { FunctionTracingRAII(); - omp_unset_lock(reinterpret_cast(Name)); + impl::unsetCriticalLock(reinterpret_cast(Name)); } void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } Index: openmp/libomptarget/test/offloading/target_critical_region.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/test/offloading/target_critical_region.cpp @@ -0,0 +1,36 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +#define N 1000000 + +int A[N]; +int main() { + for (int i = 0; i < N; i++) + A[i] = 1; + + int sum[1]; + sum[0] = 0; + +#pragma omp target teams distribute parallel for num_teams(256) \ + schedule(static, 1) map(to \ + : A[:N]) map(tofrom \ + : sum[:1]) + { + for (int i = 0; i < N; i++) { +#pragma omp critical + { sum[0] += A[i]; } + } + } + + // CHECK: SUM = 1000000 + printf("SUM = %d\n", sum[0]); + + return 0; +}