diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -27,6 +27,21 @@ /// Synchronize all threads in a block. void threads(); +/// Synchronizing threads is allowed even if they all hit different instances of +/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more +/// restrictive in that it requires all threads to hit the same instance. The +/// noinline is removed by the openmp-opt pass and helps to preserve the +/// information till then. +///{ +#pragma omp begin assumes ext_aligned_barrier + +/// Synchronize all threads in a block, they are are reaching the same +/// instruction (hence all threads in the block are "aligned"). +__attribute__((noinline)) void threadsAligned(); + +#pragma omp end assumes +///} + } // namespace synchronize namespace fence { diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -69,7 +69,7 @@ const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; if (IsSPMD) { inititializeRuntime(/* IsSPMD */ true); - synchronize::threads(); + synchronize::threadsAligned(); } else { inititializeRuntime(/* IsSPMD */ false); // No need to wait since only the main threads will execute user diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -94,18 +94,36 @@ uint32_t NumThreads = determineNumberOfThreads(num_threads); if (mapping::isSPMDMode()) { - synchronize::threads(); + // Avoid the race between the read of the `icv::Level` above and the write + // below by synchronizing all threads here. + synchronize::threadsAligned(); { + // Note that the order here is important. `icv::Level` has to be updated + // last or the other updates will cause a thread specific state to be + // created. state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, 1u, TId == 0); state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0); state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0); - synchronize::threads(); + + // Synchronize all threads after the main thread (TId == 0) set up the + // team state properly. + synchronize::threadsAligned(); + + ASSERT(state::ParallelTeamSize == NumThreads); + ASSERT(icv::ActiveLevel == 1u); + ASSERT(icv::Level == 1u); if (TId < NumThreads) invokeMicrotask(TId, 0, fn, args, nargs); - synchronize::threads(); + + // Synchronize all threads at the end of a parallel region. + synchronize::threadsAligned(); } + + ASSERT(state::ParallelTeamSize == 1u); + ASSERT(icv::ActiveLevel == 0u); + ASSERT(icv::Level == 0u); return; } @@ -131,6 +149,9 @@ } { + // Note that the order here is important. `icv::Level` has to be updated + // last or the other updates will cause a thread specific state to be + // created. state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, 1u, true); state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -41,8 +41,8 @@ ///{ extern "C" { -void *malloc(uint64_t Size); -void free(void *Ptr); +__attribute__((leaf)) void *malloc(uint64_t Size); +__attribute__((leaf)) void free(void *Ptr); } ///} diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -132,6 +132,8 @@ void syncThreads() { __builtin_amdgcn_s_barrier(); } +void syncThreadsAligned() { syncThreads(); } + void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); } void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); } @@ -179,6 +181,11 @@ asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory"); } +void syncThreadsAligned() { + constexpr int BarrierNo = 9; + asm volatile("barrier.sync.aligned %0;" : : "r"(BarrierNo) : "memory"); +} + constexpr uint32_t OMP_SPIN = 1000; constexpr uint32_t UNSET = 0; constexpr uint32_t SET = 1; @@ -227,6 +234,8 @@ void synchronize::threads() { impl::syncThreads(); } +void synchronize::threadsAligned() { impl::syncThreadsAligned(); } + void fence::team(int Ordering) { impl::fenceTeam(Ordering); } void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); } @@ -275,7 +284,7 @@ __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { - synchronize::threads(); + synchronize::threadsAligned(); } int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {