diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -16,34 +16,6 @@
 
 namespace ompx {
 
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block.
-void threads();
-
-/// Synchronizing threads is allowed even if they all hit different instances
-/// of `synchronize::threads()`. However, `synchronize::threadsAligned()` is
-/// more restrictive in that it requires all threads to hit the same instance.
-/// The noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are are reaching the same
-/// instruction (hence all threads in the block are "aligned").
-__attribute__((noinline)) void threadsAligned();
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
 namespace atomic {
 
 enum OrderingTy {
@@ -111,6 +83,38 @@
 
 } // namespace atomic
 
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block and perform a fence before and after the
+/// barrier according to \p Ordering. Note that the fence might be part of the
+/// barrier.
+void threads(atomic::OrderingTy Ordering);
+
+/// Synchronizing threads is allowed even if they all hit different instances
+/// of `synchronize::threads()`. However, `synchronize::threadsAligned()` is
+/// more restrictive in that it requires all threads to hit the same instance.
+/// The noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same instruction
+/// (hence all threads in the block are "aligned"). Also perform a fence before
+/// and after the barrier according to \p Ordering. Note that the fence might
+/// be part of the barrier if the target offers this.
+__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
+
+#pragma omp end assumes
+///}
+
+} // namespace synchronize
+
 namespace fence {
 
 /// Memory fence with \p Ordering semantics for the team.
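
A sketch of how call sites pick an ordering under the new interface. This is illustrative only: it assumes the DeviceRTL Synchronization.h declarations above, and publishAndRead is a hypothetical device-side helper, not part of the patch.

    #include "Synchronization.h"

    using namespace ompx;

    // Hypothetical helper: thread 0 publishes a value, all threads read it.
    void publishAndRead(int *Shared, int TId) {
      if (TId == 0)
        *Shared = 42;
      // acq_rel: a release fence before the barrier publishes the store and
      // an acquire fence after it makes the store visible to every thread.
      synchronize::threads(atomic::acq_rel);
      int V = *Shared; // each thread now reads 42
      (void)V;
      // relaxed: rendezvous only, no memory semantics; use when the barrier
      // merely aligns control flow.
      synchronize::threads(atomic::relaxed);
    }
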
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -40,7 +40,7 @@
     ParallelRegionFnTy WorkFn = nullptr;
 
     // Wait for the signal that we have a new work function.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
 
     // Retrieve the work function from the runtime.
     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@
       __kmpc_kernel_end_parallel();
     }
 
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
   } while (true);
 }
@@ -74,7 +74,7 @@
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::relaxed);
   } else {
     inititializeRuntime(/* IsSPMD */ false);
     // No need to wait since only the main threads will execute user
@@ -83,6 +83,10 @@
 
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
+
+    // Synchronize to ensure the assertions above are in an aligned region.
+    // The barrier is eliminated later.
+    synchronize::threadsAligned(atomic::relaxed);
     return -1;
   }
@@ -132,7 +136,11 @@
   FunctionTracingRAII();
   const bool IsSPMD =
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+
+  synchronize::threadsAligned(atomic::acq_rel);
   state::assumeInitialState(IsSPMD);
+  synchronize::threadsAligned(atomic::relaxed);
+
   if (IsSPMD)
     return;
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -113,7 +113,7 @@
   if (mapping::isSPMDMode()) {
     // Avoid the race between the read of the `icv::Level` above and the write
     // below by synchronizing all threads here.
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::seq_cst);
     {
       // Note that the order here is important. `icv::Level` has to be updated
       // last or the other updates will cause a thread specific state to be
@@ -128,28 +128,36 @@
 
       // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
-      synchronize::threadsAligned();
+      synchronize::threadsAligned(atomic::acq_rel);
 
       state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                         /* ForceTeamState */ true);
       icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
       icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
 
+      // Ensure we synchronize before we run user code to avoid invalidating
+      // the assumptions above.
+      synchronize::threadsAligned(atomic::relaxed);
+
       if (TId < NumThreads)
         invokeMicrotask(TId, 0, fn, args, nargs);
 
       // Synchronize all threads at the end of a parallel region.
-      synchronize::threadsAligned();
+      synchronize::threadsAligned(atomic::seq_cst);
     }
 
     // Synchronize all threads to make sure every thread exits the scope above;
     // otherwise the following assertions and the assumption in
     // __kmpc_target_deinit may not hold.
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::acq_rel);
 
     state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
     icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
     icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
+
+    // Ensure we synchronize to create an aligned region around the
+    // assumptions.
+    synchronize::threadsAligned(atomic::relaxed);
+
     return;
   }
@@ -243,9 +251,9 @@
                                  /* ForceTeamState */ true);
 
     // Master signals work to activate workers.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
 
     // Master waits for workers to signal.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
   }
 
   if (nargs)
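
The Kernel.cpp and Parallelism.cpp changes repeat one pattern worth calling out: a synchronizing barrier (acq_rel or seq_cst) before the state assertions so the asserted team state is actually visible to every thread, and a trailing relaxed barrier after them so the assertions sit inside an aligned region. Condensed from the hunks above (no new code, just the shape):

    // Publish and observe the team state set up by the main thread.
    synchronize::threadsAligned(atomic::acq_rel);

    // Assertions on team state are now sound on every thread.
    state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                      /* ForceTeamState */ true);

    // Close the aligned region before user code runs. This barrier carries
    // no memory semantics and is expected to be folded away by openmp-opt.
    synchronize::threadsAligned(atomic::relaxed);
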
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -123,8 +123,8 @@
 void fenceKernel(atomic::OrderingTy Ordering);
 void fenceSystem(atomic::OrderingTy Ordering);
 void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads();
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering);
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
 void unsetLock(omp_lock_t *);
 int testLock(omp_lock_t *);
 void initLock(omp_lock_t *);
@@ -261,8 +261,16 @@
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-void syncThreads() { __builtin_amdgcn_s_barrier(); }
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering) {
+  if (Ordering != atomic::relaxed)
+    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
+
+  __builtin_amdgcn_s_barrier();
+
+  if (Ordering != atomic::relaxed)
+    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
+}
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
 
 // TODO: Don't have wavefront lane locks. Possibly can't have them.
 void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -327,12 +335,12 @@
 
 void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
 
-void syncThreads() {
+void syncThreads(atomic::OrderingTy Ordering) {
   constexpr int BarrierNo = 8;
   asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
 }
 
-void syncThreadsAligned() { __syncthreads(); }
+void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
 
 constexpr uint32_t OMP_SPIN = 1000;
 constexpr uint32_t UNSET = 0;
@@ -381,9 +389,13 @@
 
 void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
 
-void synchronize::threads() { impl::syncThreads(); }
+void synchronize::threads(atomic::OrderingTy Ordering) {
+  impl::syncThreads(Ordering);
+}
 
-void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
+  impl::syncThreadsAligned(Ordering);
+}
 
 void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
@@ -504,13 +516,13 @@
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threadsAligned();
+  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
 }
 
 __attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                              int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threads();
+  synchronize::threads(atomic::OrderingTy::seq_cst);
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
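
A host-side C++ analogue of the AMDGCN lowering above may help when reasoning about the fence placement. It is a sketch, not part of the patch, under the assumption that the hardware barrier only rendezvous and carries no memory semantics of its own; a hand-rolled RelaxedBarrier stands in for __builtin_amdgcn_s_barrier().

    #include <atomic>

    // Rendezvous with deliberately relaxed atomics: threads meet here, but
    // the barrier itself orders no memory accesses (like a bare s_barrier).
    struct RelaxedBarrier {
      std::atomic<int> Count{0}, Phase{0};
      const int NumThreads;
      explicit RelaxedBarrier(int NT) : NumThreads(NT) {}
      void wait() {
        int MyPhase = Phase.load(std::memory_order_relaxed);
        if (Count.fetch_add(1, std::memory_order_relaxed) == NumThreads - 1) {
          Count.store(0, std::memory_order_relaxed);
          Phase.store(MyPhase + 1, std::memory_order_relaxed);
        } else {
          while (Phase.load(std::memory_order_relaxed) == MyPhase) {
          }
        }
      }
    };

    // Mirror of the AMDGCN syncThreads above: a release fence before the
    // rendezvous and an acquire fence after it give the barrier acq_rel
    // semantics; relaxed skips both fences. The fences pair through the
    // relaxed atomics inside wait(), per [atomics.fences].
    void syncThreads(RelaxedBarrier &B, std::memory_order Ordering) {
      if (Ordering != std::memory_order_relaxed)
        std::atomic_thread_fence(Ordering == std::memory_order_acq_rel
                                     ? std::memory_order_release
                                     : std::memory_order_seq_cst);
      B.wait();
      if (Ordering != std::memory_order_relaxed)
        std::atomic_thread_fence(Ordering == std::memory_order_acq_rel
                                     ? std::memory_order_acquire
                                     : std::memory_order_seq_cst);
    }
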
diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/barrier_fence.c
@@ -0,0 +1,75 @@
+// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
+// RUN: %libomptarget-run-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+struct IdentTy;
+void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
+void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
+
+#pragma omp begin declare target device_type(nohost)
+static int A[512] __attribute__((address_space(3), loader_uninitialized));
+static int B[512 * 32] __attribute__((loader_uninitialized));
+#pragma omp end declare target
+
+int main() {
+  printf("Testing simple spmd barrier\n");
+  for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+    for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+      {
+        int TId = omp_get_thread_num();
+        int TeamId = omp_get_team_num();
+        int NT = omp_get_num_threads();
+        // Sequential
+        for (int i = 0; i < NT; ++i) {
+          // Test shared memory globals
+          if (TId == i)
+            A[i] = i + j;
+          __kmpc_barrier_simple_spmd(0, TId);
+          if (A[i] != i + j)
+            __builtin_trap();
+          __kmpc_barrier_simple_spmd(0, TId);
+          // Test generic globals
+          if (TId == i)
+            B[TeamId] = i;
+          __kmpc_barrier_simple_spmd(0, TId);
+          if (B[TeamId] != i)
+            __builtin_trap();
+          __kmpc_barrier_simple_spmd(0, TId);
+        }
+      }
+    }
+  }
+
+  printf("Testing simple generic barrier\n");
+  for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+    for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+      {
+        int TId = omp_get_thread_num();
+        int TeamId = omp_get_team_num();
+        int NT = omp_get_num_threads();
+        // Sequential
+        for (int i = 0; i < NT; ++i) {
+          if (TId == i)
+            A[i] = i + j;
+          __kmpc_barrier_simple_generic(0, TId);
+          if (A[i] != i + j)
+            __builtin_trap();
+          __kmpc_barrier_simple_generic(0, TId);
+          if (TId == i)
+            B[TeamId] = i;
+          __kmpc_barrier_simple_generic(0, TId);
+          if (B[TeamId] != i)
+            __builtin_trap();
+          __kmpc_barrier_simple_generic(0, TId);
+        }
+      }
+    }
+  }
+
+  return 0;
+}
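
For intuition about what the new test exercises, here is a host-side miniature of its check, written against std::barrier (C++20), whose arrive_and_wait already synchronizes like the seq_cst barriers the test relies on. With a fence-free barrier the load of A[I] could legally observe a stale value and the trap would be reachable; with a synchronizing barrier it never is. A sketch for illustration, not part of the patch.

    #include <barrier>
    #include <cstdlib>
    #include <thread>
    #include <vector>

    int main() {
      constexpr int NumThreads = 8;
      static int A[NumThreads];
      std::barrier Bar(NumThreads);
      std::vector<std::thread> Threads;
      for (int TId = 0; TId < NumThreads; ++TId)
        Threads.emplace_back([&Bar, TId] {
          for (int I = 0; I < NumThreads; ++I) {
            if (TId == I)
              A[I] = I;              // one writer per round
            Bar.arrive_and_wait();   // fully synchronizing barrier
            if (A[I] != I)
              std::abort();          // unreachable: the write was published
            Bar.arrive_and_wait();   // keep rounds from overlapping
          }
        });
      for (auto &T : Threads)
        T.join();
      return 0;
    }
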