diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -262,12 +262,34 @@ void syncThreads() { __builtin_amdgcn_s_barrier(); } void syncThreadsAligned() { syncThreads(); } -// TODO: Don't have wavefront lane locks. Possibly can't have them. -void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } +constexpr uint32_t UNSET = 0; +constexpr uint32_t SET = 1; + +void unsetLock(omp_lock_t *Lock) { + (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); +} + +int testLock(omp_lock_t *Lock) { + if (atomicAdd((uint32_t *)Lock, 0u, atomic::relaxed) != UNSET) + return false; + setLock(Lock); + return true; +} + +void initLock(omp_lock_t *Lock) { unsetLock(Lock); } +void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } + +void setLock(omp_lock_t *Lock) { + uint64_t lowestActiveThread = utils::ffs(mapping::activemask()) - 1; + if (mapping::getThreadIdInWarp() == lowestActiveThread) { + fenceKernel(atomic::release); + while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::relaxed, + atomic::relaxed) != UNSET) { + __builtin_amdgcn_s_sleep(32); + } + fenceKernel(atomic::aquire); + } +} #pragma omp end declare variant ///} diff --git a/openmp/libomptarget/test/offloading/target_critical_region.cpp b/openmp/libomptarget/test/offloading/target_critical_region.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/target_critical_region.cpp @@ -0,0 +1,36 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +#define N 1000000 + +int A[N]; +int main() { + for (int i = 0; i < N; i++) + A[i] = 1; + + int sum[1]; + sum[0] = 0; + +#pragma omp target teams distribute parallel for num_teams(256) \ + schedule(static, 1) map(to \ + : A[:N]) map(tofrom \ + : sum[:1]) + { + for (int i = 0; i < N; i++) { +#pragma omp critical + { sum[0] += A[i]; } + } + } + + // CHECK: SUM = 1000000 + printf("SUM = %d\n", sum[0]); + + return 0; +}