diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -38,19 +38,7 @@ llvm::SmallVector Work; struct EntryFunctionState { - llvm::BasicBlock *ExitBB = nullptr; - }; - - class WorkerFunctionState { - public: - llvm::Function *WorkerFn; - const CGFunctionInfo &CGFI; SourceLocation Loc; - - WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc); - - private: - void createWorkerFunction(CodeGenModule &CGM); }; ExecutionMode getExecutionMode() const; @@ -60,20 +48,13 @@ /// Get barrier to synchronize all threads in a block. void syncCTAThreads(CodeGenFunction &CGF); - /// Emit the worker function for the current target region. - void emitWorkerFunction(WorkerFunctionState &WST); + /// Helper for target directive initialization. + void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST, + bool IsSPMD); - /// Helper for worker function. Emit body of worker loop. - void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST); - - /// Helper for non-SPMD target entry function. Guide the master and - /// worker threads to their respective locations. - void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, - WorkerFunctionState &WST); - - /// Signal termination of OMP execution for non-SPMD target entry - /// function. - void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// Helper for target directive finalization. + void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST, + bool IsSPMD); /// Helper for generic variables globalization prolog. void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc, @@ -82,13 +63,6 @@ /// Helper for generic variables globalization epilog. void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false); - /// Helper for SPMD mode target directive's entry function. - void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, - const OMPExecutableDirective &D); - - /// Signal termination of SPMD mode execution. - void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); - // // Base class overrides. // diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -553,63 +553,6 @@ "nvptx_lane_id"); } -/// Get the value of the thread_limit clause in the teams directive. -/// For the 'generic' execution mode, the runtime encodes thread_limit in -/// the launch parameters, always starting thread_limit+warpSize threads per -/// CTA. The threads in the last warp are reserved for master execution. -/// For the 'spmd' execution mode, all threads in a CTA are part of the team. -static llvm::Value *getThreadLimit(CodeGenFunction &CGF, - bool IsInSPMDExecutionMode = false) { - CGBuilderTy &Bld = CGF.Builder; - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - llvm::Value *ThreadLimit = nullptr; - if (IsInSPMDExecutionMode) - ThreadLimit = RT.getGPUNumThreads(CGF); - else { - llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF); - llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF); - ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit"); - } - assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit"); - return ThreadLimit; -} - -/// Get the thread id of the OMP master thread. -/// The master thread id is the first thread (lane) of the last warp in the -/// GPU block. Warp size is assumed to be some power of 2. -/// Thread id is 0 indexed. -/// E.g: If NumThreads is 33, master id is 32. -/// If NumThreads is 64, master id is 32. -/// If NumThreads is 1024, master id is 992. -static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - llvm::Value *NumThreads = RT.getGPUNumThreads(CGF); - // We assume that the warp size is a power of 2. - llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1)); - - llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)); - return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid"); -} - -CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState( - CodeGenModule &CGM, SourceLocation Loc) - : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()), - Loc(Loc) { - createWorkerFunction(CGM); -} - -void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction( - CodeGenModule &CGM) { - // Create an worker function with no arguments. - - WorkerFn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - /*placeholder=*/"_worker", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI); - WorkerFn->setDoesNotRecurse(); -} - CGOpenMPRuntimeGPU::ExecutionMode CGOpenMPRuntimeGPU::getExecutionMode() const { return CurrentExecutionMode; @@ -1073,23 +1016,19 @@ const RegionCodeGenTy &CodeGen) { ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); EntryFunctionState EST; - WorkerFunctionState WST(CGM, D.getBeginLoc()); - Work.clear(); WrapperFunctionsMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeGPU::EntryFunctionState &EST; - CGOpenMPRuntimeGPU::WorkerFunctionState &WST; public: - NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, - CGOpenMPRuntimeGPU::WorkerFunctionState &WST) - : EST(EST), WST(WST) {} + NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST) + : EST(EST) {} void Enter(CodeGenFunction &CGF) override { auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - RT.emitNonSPMDEntryHeader(CGF, EST, WST); + RT.emitKernelInit(CGF, EST, /* IsSPMD */ false); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } @@ -1097,93 +1036,33 @@ auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); RT.clearLocThreadIdInsertPt(CGF); - RT.emitNonSPMDEntryFooter(CGF, EST); + RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); } - } Action(EST, WST); + } Action(EST); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker")); - - // Create the worker function - emitWorkerFunction(WST); } -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { +void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF, + EntryFunctionState &EST, bool IsSPMD) { CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF); - llvm::Value *ThreadLimit = getThreadLimit(CGF); - llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); - - CGF.EmitBlock(WorkerBB); - emitCall(CGF, WST.Loc, WST.WorkerFn); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(MasterCheckBB); - GPUThreadID = RT.getGPUThreadID(CGF); - llvm::Value *MasterThreadID = getMasterThreadID(CGF); - llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID); - Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); - - CGF.EmitBlock(MasterBB); - IsInTargetMasterThreadRegion = true; - // SEQUENTIAL (MASTER) REGION START - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {getThreadLimit(CGF), - Bld.getInt16(/*RequiresOMPRuntime=*/1)}; - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_kernel_init), - Args); - - emitGenericVarsProlog(CGF, WST.Loc); + Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime())); + IsInTargetMasterThreadRegion = IsSPMD; + if (!IsSPMD) + emitGenericVarsProlog(CGF, EST.Loc); } -void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - emitGenericVarsEpilog(CGF); - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); +void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, + EntryFunctionState &EST, + bool IsSPMD) { + if (!IsSPMD) + emitGenericVarsEpilog(CGF); - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_kernel_deinit), - Args); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; + CGBuilderTy &Bld = CGF.Builder; + OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime()); } void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, @@ -1202,23 +1081,21 @@ class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeGPU &RT; CGOpenMPRuntimeGPU::EntryFunctionState &EST; - const OMPExecutableDirective &D; public: NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, - CGOpenMPRuntimeGPU::EntryFunctionState &EST, - const OMPExecutableDirective &D) - : RT(RT), EST(EST), D(D) {} + CGOpenMPRuntimeGPU::EntryFunctionState &EST) + : RT(RT), EST(EST) {} void Enter(CodeGenFunction &CGF) override { - RT.emitSPMDEntryHeader(CGF, EST, D); + RT.emitKernelInit(CGF, EST, /* IsSPMD */ true); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { RT.clearLocThreadIdInsertPt(CGF); - RT.emitSPMDEntryFooter(CGF, EST); + RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); } - } Action(*this, EST, D); + } Action(*this, EST); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, @@ -1226,54 +1103,6 @@ IsInTTDRegion = false; } -void CGOpenMPRuntimeGPU::emitSPMDEntryHeader( - CodeGenFunction &CGF, EntryFunctionState &EST, - const OMPExecutableDirective &D) { - CGBuilderTy &Bld = CGF.Builder; - - // Setup BBs in entry function. - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true), - /*RequiresOMPRuntime=*/ - Bld.getInt16(RequiresFullRuntime ? 1 : 0)}; - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init), - Args); - - CGF.EmitBranch(ExecuteBB); - - CGF.EmitBlock(ExecuteBB); - - IsInTargetMasterThreadRegion = true; -} - -void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); - CGF.EmitBranch(OMPDeInitBB); - - CGF.EmitBlock(OMPDeInitBB); - // DeInitialize the OMP state in the runtime; called by all active threads. - llvm::Value *Args[] = {/*RequiresOMPRuntime=*/ - CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)}; - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2), - Args); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - // Create a unique global variable to indicate the execution mode of this target // region. The execution mode is either 'generic', or 'spmd' depending on the // target directive. This variable is picked up by the offload library to setup @@ -1290,137 +1119,6 @@ CGM.addCompilerUsedGlobal(GVMode); } -void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) { - ASTContext &Ctx = CGM.getContext(); - - CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); - CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, - WST.Loc, WST.Loc); - emitWorkerLoop(CGF, WST); - CGF.FinishFunction(); -} - -void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF, - WorkerFunctionState &WST) { - // - // The workers enter this loop and wait for parallel work from the master. - // When the master encounters a parallel region it sets up the work + variable - // arguments, and wakes up the workers. The workers first check to see if - // they are required for the parallel region, i.e., within the # of requested - // parallel threads. The activated workers load the variable arguments and - // execute the parallel work. - // - - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); - llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); - llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - - CGF.EmitBranch(AwaitBB); - - // Workers wait for work from master. - CGF.EmitBlock(AwaitBB); - // Wait for parallel work - syncCTAThreads(CGF); - - Address WorkFn = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); - Address ExecStatus = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); - CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); - CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {WorkFn.getPointer()}; - llvm::Value *Ret = - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_kernel_parallel), - Args); - Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); - - // On termination condition (workid == 0), exit loop. - llvm::Value *WorkID = Bld.CreateLoad(WorkFn); - llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); - Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); - - // Activate requested workers. - CGF.EmitBlock(SelectWorkersBB); - llvm::Value *IsActive = - Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); - Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); - - // Signal start of parallel region. - CGF.EmitBlock(ExecuteBB); - // Skip initialization. - setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - - // Process work items: outlined parallel functions. - for (llvm::Function *W : Work) { - // Try to match this outlined function. - llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); - - llvm::Value *WorkFnMatch = - Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); - - llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); - llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); - Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); - - // Execute this outlined function. - CGF.EmitBlock(ExecuteFNBB); - - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, W, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(CheckNextBB); - } - // Default case: call to outlined function through pointer if the target - // region makes a declare target call that may contain an orphaned parallel - // directive. - auto *ParallelFnTy = - llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, - /*isVarArg=*/false); - llvm::Value *WorkFnCast = - Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo()); - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast}, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - // Signal end of parallel region. - CGF.EmitBlock(TerminateBB); - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel), - llvm::None); - CGF.EmitBranch(BarrierBB); - - // All active and inactive workers wait at a barrier after parallel region. - CGF.EmitBlock(BarrierBB); - // Barrier after parallel region. - syncCTAThreads(CGF); - CGF.EmitBranch(AwaitBB); - - // Exit target region. - CGF.EmitBlock(ExitBB); - // Skip initialization. - clearLocThreadIdInsertPt(CGF); -} - void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t, @@ -1806,11 +1504,8 @@ CGBuilderTy &Bld = CGF.Builder; llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); - if (WFn) { + if (WFn) ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); - // Remember for post-processing in worker loop. - Work.emplace_back(WFn); - } llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); // Create a private scope that will globalize the arguments diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -779,6 +779,29 @@ llvm::ConstantInt *Size, const llvm::Twine &Name = Twine("")); + /// The `omp target` interface + /// + /// For more information about the usage of this interface, + /// \see openmp/libomptarget/deviceRTLs/common/include/target.h + /// + ///{ + + /// Create a runtime call for kmpc_target_init + /// + /// \param Loc The insert and source location description. + /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not. + /// \param RequiresFullRuntime Indicate if a full device runtime is necessary. + InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime); + + /// Create a runtime call for kmpc_target_deinit + /// + /// \param Loc The insert and source location description. + /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not. + /// \param RequiresFullRuntime Indicate if a full device runtime is necessary. + void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime); + + ///} + /// Declarations for LLVM-IR types (simple, array, function and structure) are /// generated below. Their names are defined and used in OpenMPKinds.def. Here /// we provide the declarations, the initializeTypes function will provide the diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -409,10 +409,8 @@ /* Int */ Int32, /* kmp_task_t */ VoidPtr) /// OpenMP Device runtime functions -__OMP_RTL(__kmpc_kernel_init, false, Void, Int32, Int16) -__OMP_RTL(__kmpc_kernel_deinit, false, Void, Int16) -__OMP_RTL(__kmpc_spmd_kernel_init, false, Void, Int32, Int16) -__OMP_RTL(__kmpc_spmd_kernel_deinit_v2, false, Void, Int16) +__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1) +__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -2191,6 +2192,70 @@ return Builder.CreateCall(Fn, Args); } +OpenMPIRBuilder::InsertPointTy +OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime) { + if (!updateToLocation(Loc)) + return Loc.IP; + + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + Value *Ident = getOrCreateIdent(SrcLocStr); + ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD); + ConstantInt *UseGenericStateMachine = + ConstantInt::getBool(Int32->getContext(), !IsSPMD); + ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime); + + Function *Fn = getOrCreateRuntimeFunctionPtr( + omp::RuntimeFunction::OMPRTL___kmpc_target_init); + + CallInst *ThreadKind = + Builder.CreateCall(Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal}); + + Value *ExecUserCode = Builder.CreateICmpEQ( + ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), "exec_user_code"); + + // ThreadKind = __kmpc_target_init(...) + // if (ThreadKind == -1) + // user_code + // else + // return; + + auto *UI = Builder.CreateUnreachable(); + BasicBlock *CheckBB = UI->getParent(); + BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry"); + + BasicBlock *WorkerExitBB = BasicBlock::Create( + CheckBB->getContext(), "worker.exit", CheckBB->getParent()); + Builder.SetInsertPoint(WorkerExitBB); + Builder.CreateRetVoid(); + + auto *CheckBBTI = CheckBB->getTerminator(); + Builder.SetInsertPoint(CheckBBTI); + Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB); + + CheckBBTI->eraseFromParent(); + UI->eraseFromParent(); + + // Continue in the "user_code" block, see diagram above and in + // openmp/libomptarget/deviceRTLs/common/include/target.h . + return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt()); +} + +void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, + bool IsSPMD, bool RequiresFullRuntime) { + if (!updateToLocation(Loc)) + return; + + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + Value *Ident = getOrCreateIdent(SrcLocStr); + ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD); + ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime); + + Function *Fn = getOrCreateRuntimeFunctionPtr( + omp::RuntimeFunction::OMPRTL___kmpc_target_deinit); + + Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal}); +} + std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef Parts, StringRef FirstSeparator, StringRef Separator) { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -26,9 +26,6 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -37,7 +34,6 @@ #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/CodeExtractor.h" -using namespace llvm::PatternMatch; using namespace llvm; using namespace omp; @@ -2341,10 +2337,12 @@ AllCallSitesKnown)) SingleThreadedBBs.erase(&F->getEntryBlock()); - // Check if the edge into the successor block compares a thread-id function to - // a constant zero. - // TODO: Use AAValueSimplify to simplify and propogate constants. - // TODO: Check more than a single use for thread ID's. + auto &OMPInfoCache = static_cast(A.getInfoCache()); + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + + // Check if the edge into the successor block compares the __kmpc_target_init + // result with -1. If we are in non-SPMD-mode that signals only the main + // thread will execute the edge. auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; @@ -2355,31 +2353,20 @@ if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality()) return false; - // Temporarily match the pattern generated by clang for teams regions. - // TODO: Remove this once the new runtime is in place. - ConstantInt *One, *NegOne; - CmpInst::Predicate Pred; - auto &&m_ThreadID = m_Intrinsic(); - auto &&m_WarpSize = m_Intrinsic(); - auto &&m_BlockSize = m_Intrinsic(); - if (match(Cmp, m_Cmp(Pred, m_ThreadID, - m_And(m_Sub(m_BlockSize, m_ConstantInt(One)), - m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)), - m_ConstantInt(NegOne)))))) - if (One->isOne() && NegOne->isMinusOne() && - Pred == CmpInst::Predicate::ICMP_EQ) - return true; - ConstantInt *C = dyn_cast(Cmp->getOperand(1)); - if (!C || !C->isZero()) + if (!C) return false; - if (auto *II = dyn_cast(Cmp->getOperand(0))) - if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x) - return true; - if (auto *II = dyn_cast(Cmp->getOperand(0))) - if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x) - return true; + // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) + if (C->isAllOnesValue()) { + auto *CB = dyn_cast(Cmp->getOperand(0)); + if (!CB || CB->getCalledFunction() != RFI.Declaration) + return false; + const int InitIsSPMDArgNo = 1; + auto *IsSPMDModeCI = + dyn_cast(CB->getOperand(InitIsSPMDArgNo)); + return IsSPMDModeCI && IsSPMDModeCI->isZero(); + } return false; }; @@ -2394,7 +2381,7 @@ for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB); PredBB != PredEndBB; ++PredBB) { if (!IsInitialThreadOnly(dyn_cast((*PredBB)->getTerminator()), - BB)) + BB)) IsInitialThread &= SingleThreadedBBs.contains(*PredBB); } diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -3,10 +3,16 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64" +%struct.ident_t = type { i32, i32, i32, i32, i8* } + @S = external local_unnamed_addr global i8* +@0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8 ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory +; CHECK-REMARKS-NOT: 6 bytes + ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef @@ -25,14 +31,15 @@ define void @bar() { call void @baz() call void @qux() + call void @negative_qux_spmd() ret void } ; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED_X]], i32 0, i32 0) to i8*)) define internal void @baz() { entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %cmp = icmp eq i32 %tid, 0 + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 true) + %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %master, label %exit master: %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !11 @@ -48,20 +55,30 @@ ; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*)) define internal void @qux() { entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %ntid = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - %warpsize = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - %0 = sub nuw i32 %warpsize, 1 - %1 = sub nuw i32 %ntid, 1 - %2 = xor i32 %0, -1 - %master_tid = and i32 %1, %2 - %3 = icmp eq i32 %tid, %master_tid - br i1 %3, label %master, label %exit + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) + %0 = icmp eq i32 %call, -1 + br i1 %0, label %master, label %exit master: %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12 %y_on_stack = bitcast i8* %y to [4 x i32]* - %4 = bitcast [4 x i32]* %y_on_stack to i8* - call void @use(i8* %4) + %1 = bitcast [4 x i32]* %y_on_stack to i8* + call void @use(i8* %1) + call void @__kmpc_free_shared(i8* %y) + br label %exit +exit: + ret void +} + +define internal void @negative_qux_spmd() { +entry: + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 true, i1 true, i1 true) + %0 = icmp eq i32 %call, -1 + br i1 %0, label %master, label %exit +master: + %y = call i8* @__kmpc_alloc_shared(i64 6), !dbg !12 + %y_on_stack = bitcast i8* %y to [6 x i32]* + %1 = bitcast [6 x i32]* %y_on_stack to i8* + call void @use(i8* %1) call void @__kmpc_free_shared(i8* %y) br label %exit exit: @@ -85,6 +102,7 @@ declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -3,8 +3,13 @@ ; REQUIRES: asserts ; ModuleID = 'single_threaded_exeuction.c' -define weak void @kernel() { - call void @__kmpc_kernel_init(i32 512, i16 1) +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [1 x i8] c"\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8 + +define void @kernel() { + call void @__kmpc_kernel_prepare_parallel(i8* null) call void @nvptx() call void @amdgcn() ret void @@ -19,8 +24,8 @@ ; Function Attrs: noinline define internal void @nvptx() { entry: - %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %cmp = icmp eq i32 %call, 0 + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false) + %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.end if.then: @@ -40,8 +45,8 @@ ; Function Attrs: noinline define internal void @amdgcn() { entry: - %call = call i32 @llvm.amdgcn.workitem.id.x() - %cmp = icmp eq i32 %call, 0 + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) + %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.end if.then: @@ -87,7 +92,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() -declare void @__kmpc_kernel_init(i32, i16) +declare void @__kmpc_kernel_prepare_parallel(i8*) + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) attributes #0 = { cold noinline } diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/target.h @@ -0,0 +1,94 @@ +//===-- target.h ---------- OpenMP device runtime target implementation ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target region interfaces are simple interfaces designed to allow middle-end +// (=LLVM) passes to analyze and transform the code. To achieve good performance +// it may be required to run the associated passes. However, implementations of +// this interface shall always provide a correct implementation as close to the +// user expected code as possible. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H +#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H + +#include + +extern "C" { + +/// Forward declaration of the source location identifier "ident". +typedef struct ident ident_t; + +/// The target region _kernel_ interface for GPUs +/// +/// This deliberatly simple interface provides the middle-end (=LLVM) with +/// easier means to reason about the semantic of the code and transform it as +/// well. The runtime calls are therefore also desiged to carry sufficient +/// information necessary for optimizations. +/// +/// +/// Intended usage: +/// +/// \code +/// void kernel(...) { +/// ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false, +/// /* UseGenericStateMachine */ true, +/// /* RequiresFullRuntime */ ... ); +/// if (ThreadKind == -1) { +/// // User defined kernel code. +/// } +/// __kmpc_target_deinit(...); +/// } +/// \endcode +/// +/// Which can be transformed to: +/// +/// \code +/// void kernel(...) { +/// ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false, +/// /* UseGenericStateMachine */ false, +/// /* RequiresFullRuntime */ ... ); +/// if (ThreadKind == -1) { +/// // User defined kernel code. +/// } else { +/// assume(ThreadKind == ThreadId); +/// // Custom, kernel-specific state machine code. +/// } +/// __kmpc_target_deinit(...); +/// } +/// \endcode +/// +/// +///{ + +/// Initialization +/// +/// Must be called by all threads. +/// +/// \param Ident Source location identification, can be NULL. +/// +int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, + bool UseGenericStateMachine, + bool RequiresFullRuntime); + +/// De-Initialization +/// +/// Must be called by the main thread in generic mode, can be called by all +/// threads. Must be called by all threads in SPMD mode. +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// +void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, + bool RequiresFullRuntime); + +///} +} +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -12,6 +12,7 @@ #pragma omp declare target #include "common/omptarget.h" +#include "common/support.h" #include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// @@ -26,16 +27,18 @@ // init entry points //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { +static void __kmpc_generic_kernel_init() { PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); - ASSERT0(LT_FUSSY, RequiresOMPRuntime, - "Generic always requires initialized runtime."); - setExecutionParameters(Generic, RuntimeInitialized); - for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; + + if (GetLaneId() == 0) + parallelLevel[GetWarpId()] = 0; int threadIdInBlock = GetThreadIdInBlock(); + if (threadIdInBlock != GetMasterThreadID()) + return; + + setExecutionParameters(Generic, RuntimeInitialized); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), "__kmpc_kernel_init() must be called by team master warp only!"); PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); @@ -47,7 +50,7 @@ omptarget_nvptx_device_State[slot].Dequeue(); // init thread private - int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); + int threadId = 0; omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); // init team context @@ -62,20 +65,17 @@ // set number of threads and thread limit in team to started value omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfThreadsInBlock(); - threadLimit = ThreadLimit; + nThreads = GetNumberOfWorkersInTeam(); + threadLimit = nThreads; - if (!__kmpc_is_spmd_exec_mode()) - omptarget_nvptx_globalArgs.Init(); + omptarget_nvptx_globalArgs.Init(); __kmpc_data_sharing_init_stack(); __kmpc_impl_target_init(); } -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { +static void __kmpc_generic_kernel_deinit() { PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, - "Generic always requires initialized runtime."); // Enqueue omp state object for use by another team. int slot = usedSlotIdx; omptarget_nvptx_device_State[slot].Enqueue( @@ -84,12 +84,11 @@ omptarget_nvptx_workFn = 0; } -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, - int16_t RequiresOMPRuntime) { +static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized - : RuntimeUninitialized); + setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized + : RuntimeUninitialized); int threadId = GetThreadIdInBlock(); if (threadId == 0) { usedSlotIdx = __kmpc_impl_smid() % MAX_SM; @@ -100,11 +99,8 @@ 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } __kmpc_data_sharing_init_stack(); - if (!RequiresOMPRuntime) { - // Runtime is not required - exit. - __kmpc_impl_syncthreads(); + if (!RequiresFullRuntime) return; - } // // Team Context Initialization. @@ -138,16 +134,17 @@ newTaskDescr); // init thread private from init value + int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true); PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", (int)newTaskDescr->ThreadId(), (int)ThreadLimit); } -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { +static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. - if (!RequiresOMPRuntime) + if (!RequiresFullRuntime) return; __kmpc_impl_syncthreads(); @@ -165,4 +162,68 @@ return (execution_param & ModeMask) == Spmd; } +EXTERN bool __kmpc_kernel_parallel(void**WorkFn); + +static void __kmpc_target_region_state_machine(ident_t *Ident) { + + int TId = GetThreadIdInBlock(); + do { + void* WorkFn = 0; + + // Wait for the signal that we have a new work function. + __kmpc_barrier_simple_spmd(Ident, TId); + + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + ((void(*)(uint32_t,uint32_t))WorkFn)(0, TId); + __kmpc_kernel_end_parallel(); + } + + __kmpc_barrier_simple_spmd(Ident, TId); + + } while (true); +} + +EXTERN +int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, + bool UseGenericStateMachine, + bool RequiresFullRuntime) { + int TId = GetThreadIdInBlock(); + if (IsSPMD) + __kmpc_spmd_kernel_init(RequiresFullRuntime); + else + __kmpc_generic_kernel_init(); + + if (IsSPMD) { + __kmpc_barrier_simple_spmd(Ident, TId); + return -1; + } + + if (TId == GetMasterThreadID()) + return -1; + + if (UseGenericStateMachine) + __kmpc_target_region_state_machine(Ident); + + return TId; +} + +EXTERN +void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, + bool RequiresFullRuntime) { + if (IsSPMD) + __kmpc_spmd_kernel_deinit(RequiresFullRuntime); + else + __kmpc_generic_kernel_deinit(); +} + + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -331,7 +331,7 @@ (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); // Master signals work to activate workers. - __kmpc_barrier_simple_spmd(nullptr, 0); + __kmpc_barrier_simple_spmd(ident, 0); // OpenMP [2.5, Parallel Construct, p.49] // There is an implied barrier at the end of a parallel region. After the @@ -339,7 +339,7 @@ // execution of the enclosing task region. // // The master waits at this barrier until all workers are done. - __kmpc_barrier_simple_spmd(nullptr, 0); + __kmpc_barrier_simple_spmd(ident, 0); // Decrement parallel level for non-SPMD warps. for (int I = 0; I < NumWarps; ++I) diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -416,11 +416,11 @@ int32_t cancelVal); // non standard -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, - int16_t RequiresOMPRuntime); -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); +EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, + bool UseGenericStateMachine, + bool RequiresFullRuntime); +EXTERN void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, + bool RequiresFullRuntime); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -60,7 +60,13 @@ return Mask; } -EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); } +EXTERN void __kmpc_impl_syncthreads() { + int barrier = 2; + asm volatile("barrier.sync %0;" + : + : "r"(barrier) + : "memory"); +} EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); @@ -75,7 +81,7 @@ // The named barrier for active parallel threads of a team in an L1 parallel // region to synchronize with each other. int barrier = 1; - asm volatile("bar.sync %0, %1;" + asm volatile("barrier.sync %0, %1;" : : "r"(barrier), "r"(num_threads) : "memory");