Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3456,7 +3456,11 @@ // Create all the blocks: // // InitCB = __kmpc_target_init(...) - // bool IsWorker = InitCB >= 0; + // BlockHwSize = + // __kmpc_get_hardware_num_threads_in_block(); + // WarpSize = __kmpc_get_warp_size(); + // BlockSize = BlockHwSize - WarpSize; + // bool IsWorker = InitCB >= 0 && InitCB < BlockSize; // if (IsWorker) { // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; @@ -3509,16 +3513,38 @@ const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); - InitBB->getTerminator()->eraseFromParent(); - Instruction *IsWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", InitBB); - IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); Module &M = *Kernel->getParent(); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + FunctionCallee BlockHwSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_num_threads_in_block); + FunctionCallee WarpSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_warp_size); + Instruction *BlockHwSize = + CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + BlockHwSize->setDebugLoc(DLoc); + Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + WarpSize->setDebugLoc(DLoc); + Instruction *BlockSize = + BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + BlockSize->setDebugLoc(DLoc); + Instruction *InBlock = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_ULT, KernelInitCB, + BlockSize, "thread.in_block", InitBB); + InBlock->setDebugLoc(DLoc); + + Instruction *NotMaster = 
+ ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SGE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), 0), + "thread.not_master", InitBB); + NotMaster->setDebugLoc(DLoc); + Instruction *IsWorker = BinaryOperator::CreateAnd( + NotMaster, InBlock, "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); // Create local storage for the work function pointer. const DataLayout &DL = M.getDataLayout(); @@ -3528,7 +3554,6 @@ "worker.work_fn.addr", &Kernel->getEntryBlock().front()); WorkFnAI->setDebugLoc(DLoc); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); OMPInfoCache.OMPBuilder.updateToLocation( OpenMPIRBuilder::LocationDescription( IRBuilder<>::InsertPoint(StateMachineBeginBB, Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -86,7 +86,21 @@ if (mapping::isInitialThreadInLevel0(IsSPMD)) return -1; - if (UseGenericStateMachine) + // Enter the generic state machine if enabled and if this thread can possibly + // be an active worker thread. + // + // The latter check is important for NVIDIA Pascal (but not Volta) and AMD + // GPU. In those cases, a single thread can apparently satisfy a barrier on + // behalf of all threads in the same warp. Thus, it would not be safe for + // other threads in the master thread's warp to reach the first + // synchronize::threads call in genericStateMachine before the master thread + // reaches its corresponding synchronize::threads call: that would permit all + // active worker threads to proceed before the master thread has actually set + // state::ParallelRegionFn, and then they would immediately quit without + // doing any work. mapping::getBlockSize() does not include any of the master + // thread's warp, so none of its threads can ever be active worker threads. 
+ if (UseGenericStateMachine && + mapping::getThreadIdInBlock() < mapping::getBlockSize()) genericStateMachine(Ident); return mapping::getThreadIdInBlock(); Index: openmp/libomptarget/deviceRTLs/common/src/omptarget.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -179,8 +179,8 @@ EXTERN bool __kmpc_kernel_parallel(void**WorkFn); static void __kmpc_target_region_state_machine(ident_t *Ident) { - int TId = __kmpc_get_hardware_thread_id_in_block(); + do { void* WorkFn = 0; @@ -225,7 +225,22 @@ if (TId == GetMasterThreadID()) return -1; - if (UseGenericStateMachine) + // Enter the generic state machine if enabled and if this thread can possibly + // be an active worker thread. + // + // The latter check is important for NVIDIA Pascal (but not Volta) and AMD + // GPU. In those cases, a single thread can apparently satisfy a barrier on + // behalf of all threads in the same warp. Thus, it would not be safe for + // other threads in the master thread's warp to reach the first + // __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine + // before the master thread reaches its corresponding + // __kmpc_barrier_simple_spmd call: that would permit all active worker + // threads to proceed before the master thread has actually set + // omptarget_nvptx_workFn, and then they would immediately quit without + // doing any work. GetNumberOfWorkersInTeam() does not include any of the + // master thread's warp, so none of its threads can ever be active worker + // threads. 
+ if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam()) __kmpc_target_region_state_machine(Ident); return TId; Index: openmp/libomptarget/deviceRTLs/common/src/support.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/support.cu +++ openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -231,6 +231,7 @@ __attribute__((used, weak, optnone)) void keepAlive() { __kmpc_get_hardware_thread_id_in_block(); __kmpc_get_hardware_num_threads_in_block(); + __kmpc_get_warp_size(); __kmpc_barrier_simple_spmd(nullptr, 0); __kmpc_barrier_simple_generic(nullptr, 0); } Index: openmp/libomptarget/deviceRTLs/target_interface.h =================================================================== --- openmp/libomptarget/deviceRTLs/target_interface.h +++ openmp/libomptarget/deviceRTLs/target_interface.h @@ -23,6 +23,7 @@ EXTERN int GetBlockIdInKernel(); EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block(); +EXTERN unsigned __kmpc_get_warp_size(); EXTERN unsigned GetWarpId(); EXTERN unsigned GetWarpSize(); EXTERN unsigned GetLaneId(); Index: openmp/libomptarget/test/offloading/bug51781.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/offloading/bug51781.c @@ -0,0 +1,38 @@ +// Use the generic state machine. On some architectures, other threads in the +// master thread's warp must avoid barrier instructions. +// +// RUN: %libomptarget-compile-run-and-check-generic + +// SPMDize. There is no master thread, so there's no issue. +// +// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1 +// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd +// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +// +// SPMD: Transformed generic-mode kernel to SPMD-mode. 
+ +// Use the custom state machine, which must avoid the same barrier problem as +// the generic state machine. +// +// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \ +// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 +// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom +// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +// +// CUSTOM: Rewriting generic-mode kernel with a customized state machine. + +#include <stdio.h> +int main() { + int x = 0, y = 1; + #pragma omp target teams num_teams(1) map(tofrom:x, y) + { + x = 5; + #pragma omp parallel + y = 6; + } + // CHECK: 5, 6 + printf("%d, %d\n", x, y); + return 0; +}