diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -51,7 +51,7 @@
       src/omptarget-nvptx.cu
       src/parallel.cu
       src/reduction.cu
-      src/sync.cu
+      src/sync.cpp
       src/task.cu
   )
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cpp b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cpp
@@ -0,0 +1,92 @@
+//===--- sync.cpp --- OpenMP synchronization operations ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Generic implementation for synchronization primitives.
+//
+//===----------------------------------------------------------------------===//
+
+#include "debug.h"
+#include "target_impl.h"
+
+/// Perform a cancellable barrier; cancellation is never reported (returns 0).
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *Loc, int32_t TID) {
+  __kmpc_impl_barrier</* IsCancellable */ true, /* IsSimple */ false,
+                      /* IsSPMD */ false>(Loc, TID);
+  return /* should be cancelled */ false;
+}
+
+/// Perform a barrier operation; the execution mode is determined at runtime.
+EXTERN void __kmpc_barrier(kmp_Ident *Loc, int32_t TID) {
+  __kmpc_impl_barrier</* IsCancellable */ false, /* IsSimple */ false,
+                      /* IsSPMD */ false>(Loc, TID);
+}
+
+/// Perform a simple barrier operation in SPMD-mode.
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *Loc, int32_t TID) {
+  __kmpc_impl_barrier</* IsCancellable */ false, /* IsSimple */ true,
+                      /* IsSPMD */ true>(Loc, TID);
+}
+
+/// Perform a simple barrier operation in non-SPMD-mode.
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *Loc, int32_t TID) {
+  __kmpc_impl_barrier</* IsCancellable */ false, /* IsSimple */ true,
+                      /* IsSPMD */ false>(Loc, TID);
+}
+
+/// Function to be called at the beginning of an "ordered" region.
+EXTERN void __kmpc_ordered(kmp_Ident *, int32_t) {
+  PRINT0(LD_IO, "call kmpc_ordered\n");
+}
+
+/// Function to be called at the end of an "ordered" region.
+EXTERN void __kmpc_end_ordered(kmp_Ident *, int32_t) {
+  PRINT0(LD_IO, "call kmpc_end_ordered\n");
+}
+
+/// Create two functions, one to be called before entering a region, which
+/// returns a non-zero value if the region should be entered, and one to be
+/// called after the region was executed. The functions are named __kmpc_NAME
+/// and __kmpc_end_NAME. The predicate under which the region is entered is
+/// provided as ENTERING_PREDICATE and is re-checked (ASSERT0) on region end.
+#define REGION_DELIMITERS(NAME, ENTERING_PREDICATE)                            \
+                                                                               \
+  EXTERN int32_t __kmpc_##NAME(kmp_Ident *, int32_t GlobalTID) {               \
+    PRINT0(LD_IO, "call kmpc_" #NAME "\n");                                    \
+    return ENTERING_PREDICATE(GlobalTID);                                      \
+  }                                                                            \
+                                                                               \
+  EXTERN void __kmpc_end_##NAME(kmp_Ident *, int32_t GlobalTID) {              \
+    PRINT0(LD_IO, "call kmpc_end_" #NAME "\n");                                \
+    ASSERT0(LT_FUSSY, ENTERING_PREDICATE(GlobalTID),                           \
+            "Region end function executed by thread which should not have "    \
+            "entered");                                                        \
+  }
+
+/// Region delimiter functions for "master".
+///{
+REGION_DELIMITERS(master, IsTeamMaster)
+///}
+
+/// Region delimiter functions for "single", implemented the same as "master".
+///{
+REGION_DELIMITERS(single, IsTeamMaster)
+///}
+
+/// Perform a "flush" operation.
+EXTERN void __kmpc_flush(kmp_Ident *Loc) {
+  PRINT0(LD_IO, "call kmpc_flush\n");
+  __kmpc_impl_flush(Loc);
+}
+
+/// Return the bit-mask of active threads in the warp.
+///
+/// FIXME: Warps are a detail we should get rid of here.
+EXTERN int32_t __kmpc_warp_active_thread_mask() {
+  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
+  return __kmpc_impl_active_thread_mask<int32_t>();
+}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
deleted file mode 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "omptarget-nvptx.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP Ordered calls
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
-  PRINT0(LD_IO, "call kmpc_ordered\n");
-}
-
-EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
-  PRINT0(LD_IO, "call kmpc_end_ordered\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP Barriers
-////////////////////////////////////////////////////////////////////////////////
-
-// a team is a block: we can use CUDA native synchronization mechanism
-// FIXME: what if not all threads (warps) participate to the barrier?
-// We may need to implement it differently
-
-EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
-  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
-  __kmpc_barrier(loc_ref, tid);
-  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
-  return 0;
-}
-
-EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
-  if (checkRuntimeUninitialized(loc_ref)) {
-    ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref),
-            "Expected SPMD mode with uninitialized runtime.");
-    __kmpc_barrier_simple_spmd(loc_ref, tid);
-  } else {
-    tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(checkSPMDMode(loc_ref));
-    if (numberOfActiveOMPThreads > 1) {
-      if (checkSPMDMode(loc_ref)) {
-        __kmpc_barrier_simple_spmd(loc_ref, tid);
-      } else {
-        // The #threads parameter must be rounded up to the WARPSIZE.
-        int threads =
-            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
-        PRINT(LD_SYNC,
-              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
-              (int)numberOfActiveOMPThreads, (int)threads);
-        // Barrier #1 is for synchronization among active threads.
-        named_sync(L1_BARRIER, threads);
-      }
-    } // numberOfActiveOMPThreads > 1
-    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
-  }
-}
-
-// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
-// parallel region and that all worker threads participate.
-EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
-  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
-  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-  __SYNCTHREADS();
-  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
-}
-
-// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
-// parallel region and that all worker threads participate.
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
-  int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
-  // The #threads parameter must be rounded up to the WARPSIZE.
-  int threads =
-      WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
-  PRINT(LD_SYNC,
-        "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
-        "%d\n",
-        (int)numberOfActiveOMPThreads, (int)threads);
-  // Barrier #1 is for synchronization among active threads.
-  named_sync(L1_BARRIER, threads);
-  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP MASTER
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
-  PRINT0(LD_IO, "call kmpc_master\n");
-  return IsTeamMaster(global_tid);
-}
-
-EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
-  PRINT0(LD_IO, "call kmpc_end_master\n");
-  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP SINGLE
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
-  PRINT0(LD_IO, "call kmpc_single\n");
-  // decide to implement single with master; master get the single
-  return IsTeamMaster(global_tid);
-}
-
-EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
-  PRINT0(LD_IO, "call kmpc_end_single\n");
-  // decide to implement single with master: master get the single
-  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
-  // sync barrier is explicitely called... so that is not a problem
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Flush
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_flush(kmp_Ident *loc) {
-  PRINT0(LD_IO, "call kmpc_flush\n");
-  __threadfence();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Vote
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN int32_t __kmpc_warp_active_thread_mask() {
-  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
-  return __ACTIVEMASK();
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -14,6 +14,8 @@
 #ifndef TARGET_IMPL_H
 #define TARGET_IMPL_H
 
+#include "omptarget-nvptx.h"
+
 /// Atomically increment the pointee of \p Ptr by \p Val and return the original
 /// value of the pointee.
 template <typename T> T __kmpc_impl_atomic_add(T *Ptr, T Val) {
@@ -26,4 +28,53 @@
   return atomicExch(Ptr, Val);
 }
 
+/// Return the bit-mask representing active threads, converted to type \p T.
+template <typename T> T __kmpc_impl_active_thread_mask() {
+  return __ACTIVEMASK();
+}
+
+/// Perform an "omp flush" operation (inline: defined in a shared header).
+inline void __kmpc_impl_flush(kmp_Ident *) {
+  __threadfence();
+}
+
+/// Perform an "omp barrier" operation for various modes described as
+/// combinations of "(non-)cancellable", "(non-)simple", and "(non-)SPMD".
+///
+/// Note: A team is a block: we can use CUDA native synchronization mechanism.
+///
+/// FIXME: What if not all threads (warps) participate to the barrier? We may
+/// need to implement it differently.
+template <bool IsCancellable, bool IsSimple, bool IsSPMD>
+void __kmpc_impl_barrier(kmp_Ident *Loc, int32_t TID) {
+  // Try to justify SPMD mode first as it allows a simple barrier
+  // implementation.
+  bool InSPMD = IsSPMD || checkRuntimeUninitialized(Loc) || checkSPMDMode(Loc);
+
+  if (InSPMD) {
+    PRINT(LD_SYNC, "call kmpc%s_barrier%s_spmd\n",
+          IsCancellable ? "_cancel" : "", IsSimple ? "_simple" : "");
+    // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+    __SYNCTHREADS();
+  } else {
+    int NumberOfActiveOMPThreads = GetNumberOfOmpThreads(InSPMD);
+    if (NumberOfActiveOMPThreads > 1) {
+      // The #threads parameter must be rounded up to the WARPSIZE.
+      int NumThreads =
+          WARPSIZE * ((NumberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+      PRINT(LD_SYNC,
+            "call kmpc%s_barrier%s with %d omp threads, sync parameter %d\n",
+            IsCancellable ? "_cancel" : "", IsSimple ? "_simple" : "",
+            NumberOfActiveOMPThreads, NumThreads);
+
+      // Barrier #1 is for synchronization among active threads.
+      named_sync(L1_BARRIER, NumThreads);
+    }
+  }
+  PRINT(LD_SYNC, "completed kmpc%s_barrier%s%s\n",
+        IsCancellable ? "_cancel" : "", IsSimple ? "_simple" : "",
+        InSPMD ? "_spmd" : "");
+}
+
 #endif // TARGET_IMPL_H