diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -281,6 +281,12 @@
 /// TODO
 void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
 
+/// TODO
+bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
+
+/// TODO
+void __kmpc_kernel_end_simd();
+
 /// TODO
 void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
 
diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
--- a/openmp/libomptarget/DeviceRTL/include/Mapping.h
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -25,7 +25,7 @@
 #pragma omp end declare target
 
 /// Initialize the mapping machinery.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// Return true if the kernel is executed in SPMD mode.
 bool isSPMDMode();
@@ -33,6 +33,9 @@
 /// Return true if the kernel is executed in generic mode.
 bool isGenericMode();
 
+/// Return true if the kernel is executed in SIMD mode.
+bool isSIMDMode();
+
 /// Return true if the executing thread is the main thread in generic mode.
 bool isMainThreadInGenericMode();
 
@@ -55,6 +58,12 @@
 /// Return the thread Id in the block, in [0, getBlockSize()).
 uint32_t getThreadIdInBlock();
 
+/// Return the logical thread id, which depends on how we map an OpenMP thread
+/// to the target device. In non-SIMD mode, we map an OpenMP thread to a device
+/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in
+/// the warp is a SIMD lane.
+uint32_t getLogicThreadId();
+
 /// Return the warp id in the block.
 uint32_t getWarpId();
 
@@ -79,6 +88,19 @@
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();
 
+namespace utils {
+/// Return true if \p Mode indicates SPMD mode.
+inline bool isSPMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SPMD; }
+
+/// Return true if \p Mode indicates generic mode.
+inline bool isGenericMode(int8_t Mode) {
+  return Mode & OMP_TGT_EXEC_MODE_GENERIC;
+}
+
+/// Return true if \p Mode indicates SIMD mode.
+inline bool isSIMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SIMD; }
+} // namespace utils
+
 } // namespace mapping
 
 } // namespace _OMP
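For illustration only, not part of the patch: a small host-side sketch of the thread mapping that the getLogicThreadId() comment above describes. It assumes the usual CUDA-style decomposition where the warp id is the block-local thread id divided by the warp size, and a warp size of 32; both are assumptions, not guarantees of the runtime.

// Host-side model of the logical thread id; nothing here is device code.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t WarpSize = 32; // assumed 32-lane warps

  for (uint32_t ThreadIdInBlock : {0u, 31u, 32u, 63u, 127u}) {
    // Non-SIMD modes: one OpenMP thread per device thread.
    uint32_t NonSIMDId = ThreadIdInBlock;
    // SIMD mode: one OpenMP thread per warp; device threads become SIMD lanes.
    uint32_t SIMDThreadId = ThreadIdInBlock / WarpSize; // like getWarpId()
    uint32_t LaneId = ThreadIdInBlock % WarpSize;       // like getThreadIdInWarp()
    std::printf("device thread %3u -> non-SIMD id %3u, SIMD id %u (lane %2u)\n",
                ThreadIdInBlock, NonSIMDId, SIMDThreadId, LaneId);
  }
  return 0;
}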
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -24,7 +24,7 @@
 inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
 
 /// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// TODO
 enum ValueKind {
@@ -37,6 +37,10 @@
   VK_RunSchedChunk,
   VK_ParallelRegionFn,
   VK_ParallelTeamSize,
+  // SIMD
+  VK_SIMDLevel,
+  VK_SIMDRegionFn,
+  VK_SIMDLaneWidth,
 };
 
 /// TODO
@@ -145,10 +149,20 @@
 inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
     ParallelRegionFn;
 
+/// TODO
+inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
+
+/// TODO
+inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
+
 void runAndCheckState(void(Func(void)));
 
 void assumeInitialState(bool IsSPMD);
 
+/// Propagate the thread state from the leader in the warp to the rest of the
+/// SIMD workers. This function should only be called in SIMD mode.
+void propagateThreadState(unsigned SIMDLen);
+
 } // namespace state
 
 namespace icv {
@@ -171,6 +185,9 @@
 /// TODO
 inline state::Value<uint32_t, state::VK_RunSched> RunSched;
 
+/// TODO
+inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
+
 } // namespace icv
 
 namespace memory {
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -19,7 +19,7 @@
 namespace synchronize {
 
 /// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// Synchronize all threads in a warp identified by \p Mask.
 void warp(LaneMaskTy Mask);
diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h
--- a/openmp/libomptarget/DeviceRTL/include/Types.h
+++ b/openmp/libomptarget/DeviceRTL/include/Types.h
@@ -150,6 +150,8 @@
 
 using ParallelRegionFnTy = void *;
 
+using SIMDRegionFnTy = void *;
+
 using CriticalNameTy = int32_t[8];
 
 struct omp_lock_t {
@@ -181,6 +183,7 @@
 enum OMPTgtExecModeFlags : int8_t {
   OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
   OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
+  OMP_TGT_EXEC_MODE_SIMD = 1 << 2,
 };
 
 #define __PRAGMA(STR) _Pragma(#STR)
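A quick standalone check, not part of the patch, of why OMPTgtExecModeFlags is treated as a bitmask after this change: a kernel can carry more than one mode bit (for example generic and SIMD), so mode queries become bit tests rather than equality comparisons, as in mapping::utils above and in the CUDA plugin hunk further down.

#include <cassert>
#include <cstdint>

enum OMPTgtExecModeFlags : int8_t {
  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
  OMP_TGT_EXEC_MODE_SIMD = 1 << 2,
};

int main() {
  int8_t Mode = OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SIMD;
  // Bit tests see both properties of the combined mode...
  assert(Mode & OMP_TGT_EXEC_MODE_GENERIC);
  assert(Mode & OMP_TGT_EXEC_MODE_SIMD);
  assert(!(Mode & OMP_TGT_EXEC_MODE_SPMD));
  // ...while an equality comparison would miss the combined value.
  assert(Mode != OMP_TGT_EXEC_MODE_GENERIC);
  return 0;
}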
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -21,17 +21,17 @@
 
 #pragma omp declare target
 
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(int Mode) {
   // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD);
+  synchronize::init(Mode);
+  mapping::init(Mode);
+  state::init(Mode);
 }
 
 /// Simple generic state machine for worker threads.
 static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
 
   do {
     ParallelRegionFnTy WorkFn = 0;
 
@@ -58,6 +58,31 @@
   } while (true);
 }
 
+namespace {
+void runSIMDStateMachine(IdentTy *Ident) {
+  uint32_t LaneId = mapping::getThreadIdInWarp();
+  do {
+    SIMDRegionFnTy WorkFn = nullptr;
+
+    // Wait for the signal that we have a new work function.
+    synchronize::warp(mapping::activemask());
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_simd(&WorkFn);
+
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      ((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
+      __kmpc_kernel_end_simd();
+    }
+
+    synchronize::warp(mapping::activemask());
+  } while (true);
+}
+} // namespace
+
 extern "C" {
 
 /// Initialization
@@ -66,16 +91,21 @@
 ///
 int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
                            bool UseGenericStateMachine, bool) {
-  const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
-  if (IsSPMD) {
-    inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
-  } else {
-    inititializeRuntime(/* IsSPMD */ false);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
+  inititializeRuntime(Mode);
+
+  // For all SIMD workers, start the SIMD state machine.
+  if (mapping::utils::isSIMDMode(Mode)) {
+    uint32_t LaneId = mapping::getThreadIdInWarp();
+    if (LaneId) {
+      runSIMDStateMachine(Ident);
+      return LaneId;
+    }
   }
 
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
+  if (IsSPMD)
+    synchronize::threads();
+
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
     return -1;
@@ -98,7 +128,8 @@
 /// \param Ident Source location identification, can be NULL.
 ///
 void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
-  const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
+
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
     return;
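A CPU-threads analogue, not part of the patch and only a rough model, of the handshake between __kmpc_simd_51 and runSIMDStateMachine: the warp leader publishes a work function, the warp synchronizes, the non-leader lanes fetch and run it, and the warp synchronizes again; a null work function is the termination signal (mirroring the early return in __kmpc_kernel_simd). std::barrier stands in for synchronize::warp, and the lane count of 4 is arbitrary.

#include <atomic>
#include <barrier>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

using SIMDRegionFnTy = void (*)(uint32_t, uint32_t);

// Stand-in for state::SIMDRegionFn.
std::atomic<SIMDRegionFnTy> SIMDRegionFn{nullptr};

void SIMDBody(uint32_t, uint32_t LaneId) {
  std::printf("lane %u executes the simd body\n", LaneId);
}

int main() {
  constexpr uint32_t NumLanes = 4; // stand-in for the warp size
  std::barrier WarpSync(NumLanes); // stand-in for synchronize::warp

  std::vector<std::jthread> Lanes;
  for (uint32_t LaneId = 1; LaneId < NumLanes; ++LaneId)
    Lanes.emplace_back([&WarpSync, LaneId] {
      // Mirrors runSIMDStateMachine.
      while (true) {
        WarpSync.arrive_and_wait();           // wait for new work
        SIMDRegionFnTy WorkFn = SIMDRegionFn; // __kmpc_kernel_simd
        if (!WorkFn)
          return;                             // termination signal
        WorkFn(0, LaneId);                    // run the region
        WarpSync.arrive_and_wait();           // signal completion
      }
    });

  // Leader (lane 0), mirroring __kmpc_simd_51: publish one region...
  SIMDRegionFn = SIMDBody;
  WarpSync.arrive_and_wait();
  // (in the patch, the leader executing the region itself is still a TODO)
  WarpSync.arrive_and_wait();

  // ...then send the termination signal.
  SIMDRegionFn = nullptr;
  WarpSync.arrive_and_wait();
  return 0;
}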
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -178,7 +178,7 @@
 bool mapping::isLeaderInWarp() {
   __kmpc_impl_lanemask_t Active = mapping::activemask();
   __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
+  return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
 }
 
 LaneMaskTy mapping::activemask() { return impl::activemask(); }
@@ -191,6 +191,13 @@
 
 uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
 
+uint32_t mapping::getLogicThreadId() {
+  if (mapping::isSIMDMode())
+    return mapping::getWarpId();
+
+  return mapping::getThreadIdInBlock();
+}
+
 uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
 
 uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
@@ -214,16 +221,20 @@
 /// Execution mode
 ///
 ///{
-static int SHARED(IsSPMDMode);
+static int8_t SHARED(ExecutionMode);
 
-void mapping::init(bool IsSPMD) {
+void mapping::init(int8_t Mode) {
   if (!mapping::getThreadIdInBlock())
-    IsSPMDMode = IsSPMD;
+    ExecutionMode = Mode;
 }
 
-bool mapping::isSPMDMode() { return IsSPMDMode; }
+bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
 
-bool mapping::isGenericMode() { return !isSPMDMode(); }
+bool mapping::isGenericMode() {
+  return mapping::utils::isGenericMode(ExecutionMode);
+}
+
+bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); }
 ///}
 
 extern "C" {
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -49,20 +49,43 @@
 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   uint32_t NThreadsICV =
       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getBlockSize();
+
+  const bool IsSIMDMode = mapping::isSIMDMode();
+
+  uint32_t NumThreads =
+      IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
 
   if (NThreadsICV != 0 && NThreadsICV < NumThreads)
     NumThreads = NThreadsICV;
 
   // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
+  // We don't need this in SIMD mode because an OpenMP thread is mapped to a
+  // warp, so the thread count need not be a multiple of the warp size.
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  if (!IsSIMDMode) {
+    if (NumThreads < mapping::getWarpSize())
+      NumThreads = 1;
+    else
+      NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  }
 
   return NumThreads;
 }
 
+uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
+  ASSERT(mapping::isSIMDMode());
+
+  // TODO: This is probably not right if the schedule is different.
+  if (SafeLen < SIMDLen)
+    SIMDLen = SafeLen;
+
+  // We currently map an OpenMP thread to a warp in SIMD mode. If the simdlen
+  // is larger than the warp size, we cap it at the warp size.
+  if (SIMDLen > mapping::getWarpSize())
+    SIMDLen = mapping::getWarpSize();
+
+  return SIMDLen;
+}
+
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                      void **args, int64_t nargs) {
@@ -78,11 +101,57 @@
 
 extern "C" {
 
+void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
+                    int32_t simdlen, int order, void *fn, void *wrapper_fn,
+                    void **args, int64_t nargs) {
+  // Handle the non-SIMD case first, which applies when:
+  // - the if clause evaluates to false
+  // - simdlen or safelen is set to 1
+  // - we are already inside a simd region
+  const uint32_t LogicThreadId = mapping::getLogicThreadId();
+  if (OMP_UNLIKELY(!if_expr || simdlen == 1 || safelen == 1 ||
+                   icv::SIMDLevel)) {
+    invokeMicrotask(LogicThreadId, 0, fn, args, nargs);
+    return;
+  }
+
+  // Only the leader of each warp can execute the following code.
+  ASSERT(mapping::isLeaderInWarp());
+
+  const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen);
+
+  if (LogicThreadId == 0)
+    state::SIMDLaneWidth = SIMDLen;
+
+  // Propagate the thread state from the leader to all SIMD workers.
+  state::propagateThreadState(SIMDLen);
+
+  // Synchronize all threads (leaders).
+  synchronize::threads();
+
+  {
+    state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn,
+                                      (void *)nullptr, true);
+    state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true);
+
+    // Signal the SIMD workers.
+    synchronize::warp(mapping::activemask());
+
+    // TODO: The leader of the warp also has to execute the SIMD region.
+    // What we need:
+    // - A work-sharing function that can take both the thread id and the
+    //   lane id into consideration.
+
+    // Synchronize after execution of the SIMD region.
+    synchronize::warp(mapping::activemask());
+  }
+}
+
 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                         int32_t num_threads, int proc_bind, void *fn,
                         void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
 
   // Handle the serialized case first, same for SPMD/non-SPMD.
   if (OMP_UNLIKELY(!if_expr || icv::Level)) {
     __kmpc_serialized_parallel(ident, TId);
@@ -156,7 +225,7 @@
     return false;
 
   // Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
   bool ThreadIsActive = TId < state::ParallelTeamSize;
   return ThreadIsActive;
 }
@@ -170,6 +239,24 @@
   ASSERT(!mapping::isSPMDMode());
 }
 
+__attribute__((noinline)) bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
+  // Work function and arguments for L1 SIMD region.
+  *WorkFn = state::SIMDRegionFn;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn)
+    return false;
+
+  // Set to true for lanes participating in the SIMD region.
+  uint32_t LaneId = mapping::getThreadIdInWarp();
+  bool LaneActive = LaneId < state::SIMDLaneWidth;
+  return LaneActive;
+}
+
+__attribute__((noinline)) void __kmpc_kernel_end_simd() {
+  // TODO: Some clean-up of SIMD execution.
+}
+
 void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
   state::enterDataEnvironment();
   ++icv::Level;
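A worked example, not part of the patch, of the clamping that determineSIMDLen performs: the requested simdlen is bounded by safelen and then capped at the warp size, since one OpenMP thread owns exactly one warp in SIMD mode. The warp size of 32 below is an assumption.

#include <algorithm>
#include <cassert>

// Host-side model of determineSIMDLen: simdlen bounded by safelen, then capped
// at the warp size (one OpenMP thread == one warp).
static int determineSIMDLenModel(int SIMDLen, int SafeLen, int WarpSize) {
  SIMDLen = std::min(SIMDLen, SafeLen);
  SIMDLen = std::min(SIMDLen, WarpSize);
  return SIMDLen;
}

int main() {
  assert(determineSIMDLenModel(/*simdlen=*/64, /*safelen=*/128, 32) == 32);
  assert(determineSIMDLenModel(/*simdlen=*/8, /*safelen=*/4, 32) == 4);
  assert(determineSIMDLenModel(/*simdlen=*/16, /*safelen=*/64, 32) == 16);
  return 0;
}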
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -203,7 +203,7 @@
 
 struct TeamStateTy {
   /// TODO: provide a proper init function.
-  void init(bool IsSPMD);
+  void init(int Mode);
 
   bool operator==(const TeamStateTy &) const;
 
@@ -224,8 +224,13 @@
 
 TeamStateTy SHARED(TeamState);
 
-void TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = mapping::getBlockSize();
+void TeamStateTy::init(int Mode) {
+  // In SIMD mode, we map an OpenMP thread to a warp.
+  if (mapping::utils::isSIMDMode(Mode))
+    ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
+  else
+    ICVState.NThreadsVar = mapping::getBlockSize();
+
   ICVState.LevelVar = 0;
   ICVState.ActiveLevelVar = 0;
   ICVState.MaxActiveLevelsVar = 1;
@@ -357,7 +362,8 @@
   __builtin_unreachable();
 }
 
-void state::init(bool IsSPMD) {
+void state::init(int8_t Mode) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
   SharedMemorySmartStack.init(IsSPMD);
   if (!mapping::getThreadIdInBlock())
     TeamState.init(IsSPMD);
@@ -404,6 +410,15 @@
   ASSERT(mapping::isSPMDMode() == IsSPMD);
 }
 
+void state::propagateThreadState(unsigned SIMDLen) {
+  ASSERT(mapping::isSIMDMode());
+  ASSERT(mapping::isLeaderInWarp());
+
+  const uint32_t TId = mapping::getThreadIdInBlock();
+  for (int I = 1; I < SIMDLen; ++I)
+    ThreadStates[I + TId] = ThreadStates[TId];
+}
+
 extern "C" {
 
 void omp_set_dynamic(int V) {}
@@ -434,7 +449,7 @@
 }
 
 int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
 }
 
 int omp_get_thread_num(void) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -214,8 +214,8 @@
 
 } // namespace impl
 
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD)
+void synchronize::init(int8_t Mode) {
+  if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
     impl::namedBarrierInit();
 }
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -210,7 +210,7 @@
   static void dispatch_init(IdentTy *loc, int32_t threadId,
                             kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                             DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
+    int tid = mapping::getLogicThreadId();
     T tnum = omp_get_num_threads();
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
     ASSERT0(LT_FUSSY, threadId < tnum,
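A small numeric check, not part of the patch, for the CUDA plugin change below: in SIMD mode every OpenMP thread occupies a full warp, so a thread_limit of N translates into N * WarpSize CUDA threads per block. The warp size of 32 is an assumption; the plugin reads the real value from DeviceData[DeviceId].WarpSize.

#include <cassert>

int main() {
  const int WarpSize = 32;   // assumed warp size
  const int ThreadLimit = 8; // e.g. thread_limit(8) on the target construct

  // SIMD mode: one warp per OpenMP thread.
  int SIMDThreadsPerBlock = ThreadLimit * WarpSize;
  // Non-SIMD SPMD mode: one CUDA thread per OpenMP thread.
  int SPMDThreadsPerBlock = ThreadLimit;

  assert(SIMDThreadsPerBlock == 256);
  assert(SPMDThreadsPerBlock == 8);
  return 0;
}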
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -1084,20 +1084,31 @@
     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
     const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+        KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
     const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+        !IsSPMDGenericMode &&
+        (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SPMD);
     const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+        !IsSPMDGenericMode &&
+        (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_GENERIC);
+    const bool IsSIMDMode =
+        KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SIMD;
 
     int CudaThreadsPerBlock;
     if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      if (IsSIMDMode) {
+        DP("Setting CUDA threads per block to requested %d\n",
+           ThreadLimit * DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
+      } else {
+        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+        CudaThreadsPerBlock = ThreadLimit;
+        // Add master warp if necessary
+        if (IsGenericMode) {
+          DP("Adding master warp: +%d threads\n",
+             DeviceData[DeviceId].WarpSize);
+          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+        }
       }
     } else {
       DP("Setting CUDA threads per block to default %d\n",