diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -70,6 +70,18 @@ #define OMP_IDENT_FLAG(Enum, ...) constexpr auto Enum = omp::IdentFlag::Enum; #include "llvm/Frontend/OpenMP/OMPKinds.def" +/// IDs for all omp runtime library tasking flag encodings (see +/// their definition in openmp/runtime/src/kmp.h). +enum class TaskingFlag { +#define OMP_TASKING_FLAG(Enum, Str, Value) Enum = Value, +#include "llvm/Frontend/OpenMP/OMPKinds.def" + LLVM_MARK_AS_BITMASK_ENUM(0x7FFFFFFF) +}; + +#define OMP_TASKING_FLAG(Enum, ...) \ + constexpr auto Enum = omp::TaskingFlag::Enum; +#include "llvm/Frontend/OpenMP/OMPKinds.def" + /// Parse \p Str and return the directive it matches or OMPD_unknown if none. Directive getOpenMPDirectiveKind(StringRef Str); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -75,6 +75,21 @@ /// NOTE: Temporary solution until Clang CG is gone. void popFinalizationCB() { FinalizationStack.pop_back(); } + /// Helper that keeps all information of a depend clause. + struct DependClauseInfo { + Value *BasePtr; + Value *Length; + + /// The values are chosen such that we can `or` them with the length to + /// create the `flags` member of a kmp_depend_info struct. 
+ enum : uint64_t { + IN = 1ULL << 61, + OUT = 1ULL << 62, + IN_OUT = IN | OUT, + MUTEX_IN_OUT_SET = 1ULL << 63, + } Type; + }; + /// Callback type for body (=inner region) code generation /// /// The callback takes code locations as arguments, each describing a @@ -168,6 +183,32 @@ Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable); + /// Generator for '#omp task' + /// + /// \param Loc The location where the task directive was encountered. + /// \param BodyGenCB Callback that will generate the region code. + /// \param PrivCB Callback to copy a given variable (think copy constructor). + /// \param FiniCB Callback to finalize variable copies. + /// \param IfCondition The evaluated 'if' clause expression, if any. + /// \param FinalCondition The evaluated 'final' clause expression, if any. + /// \param UntiedFlag Flag to indicate if this is an untied task. + /// \param MergableFlag Flag to indicate if this is a mergeable task. + /// \param DependClauseInfos Information contained in the depend clauses. + /// \param PriorityValue The priority of the task. + /// \param EventHandle The evaluated 'detach' clause expression, if any. + /// \param IsCancellable Flag to indicate a cancellable task region. + /// + /// TODO: Affinity + /// + /// \returns The insertion point after the task. 
+ InsertPointTy + CreateTask(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, + Value *IfCondition, Value *FinalCondition, bool UntiedFlag, + bool MergableFlag, + SmallVectorImpl &DependClauseInfos, + unsigned PriorityValue, Value *EventHandle, bool IsCancellable); + ///} private: @@ -214,6 +255,17 @@ omp::Directive DK, bool ForceSimpleCall, bool CheckCancelFlag); + InsertPointTy + emitOutlinedRegion(const LocationDescription &Loc, + function_ref RTLCallCB, Value *Ident, + Value *ThreadID, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, + omp::Directive DK, bool IsCancellable, Value *IfCondition, + function_ref AlternativeCB); + + Value *emitLocalDependenceInfoArray( + SmallVectorImpl &DependClauseInfos); + /// The finalization stack made up of finalize callbacks currently in-flight, /// wrapped into FinalizationInfo objects that reference also the finalization /// target block and the kind of cancellable directive. diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -116,6 +116,8 @@ __OMP_TYPE(Int32) __OMP_TYPE(Int8Ptr) __OMP_TYPE(Int32Ptr) +__OMP_TYPE(Int64) +OMP_TYPE(VoidPtr, Int8Ptr) #undef __OMP_TYPE #undef OMP_TYPE @@ -134,6 +136,7 @@ OMP_STRUCT_TYPE(VarName, "struct." 
#Name, __VA_ARGS__) __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr) +__OMP_STRUCT_TYPE(DependInfo, kmp_depend_info, Int64, Int64) #undef __OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE @@ -146,6 +149,7 @@ OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, __VA_ARGS__) __OMP_FUNCTION_TYPE(ParallelTask, true, Void, Int32Ptr, Int32Ptr) +__OMP_FUNCTION_TYPE(TaskFn, false, Void, VoidPtr) #undef __OMP_FUNCTION_TYPE #undef OMP_FUNCTION_TYPE @@ -168,8 +172,11 @@ __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32) __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr) __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr) -__OMP_RTL(__kmpc_push_num_threads, false, Void, IdentPtr, Int32, /* Int */Int32) -__OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */Int32) +__OMP_RTL(__kmpc_task, false, Int32, IdentPtr, Int32, Int32, Int32, Int32, + VoidPtr, TaskFnPtr, Int32, DependInfoPtr, Int32) +__OMP_RTL(__kmpc_push_num_threads, false, Void, IdentPtr, Int32, + /* Int */ Int32) +__OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) @@ -244,7 +251,6 @@ ///} - /// Proc bind kinds /// ///{ @@ -266,3 +272,24 @@ #undef OMP_PROC_BIND_KIND ///} + +/// KMP tasking bit flags +/// +///{ + +#ifndef OMP_TASKING_FLAG +#define OMP_TASKING_FLAG(Enum, Str, Value) +#endif + +#define __OMP_TASKING_FLAG(Name, Value) \ + OMP_TASKING_FLAG(OMP_TASKING_FLAG_##Name, #Name, Value) + +__OMP_TASKING_FLAG(TIEDNESS, 1 << 0) +__OMP_TASKING_FLAG(FINAL, 1 << 1) +__OMP_TASKING_FLAG(DESTRUCTORS_THUNK, 1 << 4) +__OMP_TASKING_FLAG(PRIORITY_SPECIFIED, 1 << 6) + +#undef __OMP_TASKING_FLAG +#undef OMP_TASKING_FLAG + +///} diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -310,34 +310,12 @@ Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); } -IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( - const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, - PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, - Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { - if (!updateToLocation(Loc)) - return Loc.IP; - - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); - Value *ThreadID = getOrCreateThreadID(Ident); - - if (NumThreads) { - // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) - Value *Args[] = { - Ident, ThreadID, - Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)}; - Builder.CreateCall( - getOrCreateRuntimeFunction(OMPRTL___kmpc_push_num_threads), Args); - } - - if (ProcBind != OMP_PROC_BIND_default) { - // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind) - Value *Args[] = { - Ident, ThreadID, - ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)}; - Builder.CreateCall(getOrCreateRuntimeFunction(OMPRTL___kmpc_push_proc_bind), - Args); - } +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitOutlinedRegion( + const LocationDescription &Loc, function_ref RTLCallCB, + Value *Ident, Value *ThreadID, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, omp::Directive DK, + bool IsCancellable, Value *IfCondition, + function_ref AlternativeCB) { BasicBlock *InsertBB = Builder.GetInsertBlock(); Function *OuterFn = InsertBB->getParent(); @@ -347,19 +325,22 @@ SmallVector ToBeDeleted; Builder.SetInsertPoint(OuterFn->getEntryBlock().getFirstNonPHI()); - AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); - AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr"); - - // If there is an if condition we actually use the TIDAddr and ZeroAddr in the - // 
program, otherwise we only need them for modeling purposes to get the - // associated arguments in the outlined function. In the former case, - // initialize the allocas properly, in the latter case, delete them later. - if (IfCondition) { - Builder.CreateStore(Constant::getNullValue(Int32), TIDAddr); - Builder.CreateStore(Constant::getNullValue(Int32), ZeroAddr); - } else { - ToBeDeleted.push_back(TIDAddr); - ToBeDeleted.push_back(ZeroAddr); + AllocaInst *TIDAddr = nullptr, *ZeroAddr = nullptr; + if (DK == OMPD_parallel) { + TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); + ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr"); + + // If there is an if condition we actually use the TIDAddr and ZeroAddr in + // the program, otherwise we only need them for modeling purposes to get the + // associated arguments in the outlined function. In the former case, + // initialize the allocas properly, in the latter case, delete them later. + if (IfCondition) { + Builder.CreateStore(Constant::getNullValue(Int32), TIDAddr); + Builder.CreateStore(Constant::getNullValue(Int32), ZeroAddr); + } else { + ToBeDeleted.push_back(TIDAddr); + ToBeDeleted.push_back(ZeroAddr); + } } // Create an artificial insertion point that will also ensure the blocks we @@ -370,14 +351,17 @@ if (IfCondition) SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI); + StringRef Suffix = (DK == OMPD_parallel ? "omp.par" : "omp.task"); + StringRef Prefix = (DK == OMPD_parallel ? "omp.par." 
: "omp.task."); + BasicBlock *ThenBB = ThenTI->getParent(); - BasicBlock *PRegEntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.par.entry"); - BasicBlock *PRegBodyBB = - PRegEntryBB->splitBasicBlock(ThenTI, "omp.par.region"); - BasicBlock *PRegPreFiniBB = - PRegBodyBB->splitBasicBlock(ThenTI, "omp.par.pre_finalize"); - BasicBlock *PRegExitBB = - PRegPreFiniBB->splitBasicBlock(ThenTI, "omp.par.exit"); + BasicBlock *ORegEntryBB = ThenBB->splitBasicBlock(ThenTI, Prefix + "entry"); + BasicBlock *ORegBodyBB = + ORegEntryBB->splitBasicBlock(ThenTI, Prefix + "region"); + BasicBlock *ORegPreFiniBB = + ORegBodyBB->splitBasicBlock(ThenTI, Prefix + "pre_finalize"); + BasicBlock *ORegExitBB = + ORegPreFiniBB->splitBasicBlock(ThenTI, Prefix + "exit"); auto FiniCBWrapper = [&](InsertPointTy IP) { // Hide "open-ended" blocks from the given FiniCB by setting the right jump @@ -385,78 +369,80 @@ if (IP.getBlock()->end() == IP.getPoint()) { IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(IP); - Instruction *I = Builder.CreateBr(PRegExitBB); + Instruction *I = Builder.CreateBr(ORegExitBB); IP = InsertPointTy(I->getParent(), I->getIterator()); } assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 && - IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB && + IP.getBlock()->getTerminator()->getSuccessor(0) == ORegExitBB && "Unexpected insertion point for finalization call!"); return FiniCB(IP); }; - FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable}); + FinalizationStack.push_back({FiniCBWrapper, DK, IsCancellable}); // Generate the privatization allocas in the block that will become the entry // of the outlined function. 
- InsertPointTy AllocaIP(PRegEntryBB, - PRegEntryBB->getTerminator()->getIterator()); + InsertPointTy AllocaIP(ORegEntryBB, + ORegEntryBB->getTerminator()->getIterator()); Builder.restoreIP(AllocaIP); - AllocaInst *PrivTIDAddr = - Builder.CreateAlloca(Int32, nullptr, "tid.addr.local"); - Instruction *PrivTID = Builder.CreateLoad(PrivTIDAddr, "tid"); - - // Add some fake uses for OpenMP provided arguments. - ToBeDeleted.push_back(Builder.CreateLoad(TIDAddr, "tid.addr.use")); - ToBeDeleted.push_back(Builder.CreateLoad(ZeroAddr, "zero.addr.use")); + AllocaInst *PrivTIDAddr = nullptr; + Instruction *PrivTID = nullptr; + if (DK == OMPD_parallel) { + PrivTIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr.local"); + PrivTID = Builder.CreateLoad(PrivTIDAddr, "tid"); + // Add some fake uses for OpenMP provided arguments. + ToBeDeleted.push_back(Builder.CreateLoad(TIDAddr, "tid.addr.use")); + ToBeDeleted.push_back(Builder.CreateLoad(ZeroAddr, "zero.addr.use")); + } // ThenBB // | // V - // PRegionEntryBB <- Privatization allocas are placed here. + // ORegEntryBB <- Privatization allocas are placed here. // | // V - // PRegionBodyBB <- BodeGen is invoked here. + // ORegBodyBB <- BodyGen is invoked here. // | // V - // PRegPreFiniBB <- The block we will start finalization from. + // ORegPreFiniBB <- The block we will start finalization from. // | // V - // PRegionExitBB <- A common exit to simplify block collection. + // ORegExitBB <- A common exit to simplify block collection. // LLVM_DEBUG(dbgs() << "Before body codegen: " << *UI->getFunction() << "\n"); // Let the caller create the body. 
assert(BodyGenCB && "Expected body generation callback!"); - InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); - BodyGenCB(AllocaIP, CodeGenIP, *PRegPreFiniBB); + InsertPointTy CodeGenIP(ORegBodyBB, ORegBodyBB->begin()); + BodyGenCB(AllocaIP, CodeGenIP, *ORegPreFiniBB); LLVM_DEBUG(dbgs() << "After body codegen: " << *UI->getFunction() << "\n"); - SmallPtrSet ParallelRegionBlockSet; - SmallVector ParallelRegionBlocks, Worklist; - ParallelRegionBlockSet.insert(PRegEntryBB); - ParallelRegionBlockSet.insert(PRegExitBB); + SmallPtrSet OutlinedRegionBlockSet; + SmallVector OutlinedRegionBlocks, Worklist; + OutlinedRegionBlockSet.insert(ORegEntryBB); + OutlinedRegionBlockSet.insert(ORegExitBB); - // Collect all blocks in-between PRegEntryBB and PRegExitBB. - Worklist.push_back(PRegEntryBB); + // Collect all blocks in-between ORegEntryBB and ORegExitBB. + Worklist.push_back(ORegEntryBB); while (!Worklist.empty()) { BasicBlock *BB = Worklist.pop_back_val(); - ParallelRegionBlocks.push_back(BB); + OutlinedRegionBlocks.push_back(BB); for (BasicBlock *SuccBB : successors(BB)) - if (ParallelRegionBlockSet.insert(SuccBB).second) + if (OutlinedRegionBlockSet.insert(SuccBB).second) Worklist.push_back(SuccBB); } CodeExtractorAnalysisCache CEAC(*OuterFn); - CodeExtractor Extractor(ParallelRegionBlocks, /* DominatorTree */ nullptr, - /* AggregateArgs */ false, + CodeExtractor Extractor(OutlinedRegionBlocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ DK != OMPD_parallel, /* BlockFrequencyInfo */ nullptr, /* BranchProbabilityInfo */ nullptr, /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, - /* Suffix */ ".omp_par"); + /* Suffix */ Suffix); // Find inputs to, outputs from the code region. 
BasicBlock *CommonExit = nullptr; @@ -476,12 +462,12 @@ SmallVector Uses; for (Use &U : V.uses()) if (auto *UserI = dyn_cast(U.getUser())) - if (ParallelRegionBlockSet.count(UserI->getParent())) + if (OutlinedRegionBlockSet.count(UserI->getParent())) Uses.push_back(&U); Value *ReplacementValue = nullptr; CallInst *CI = dyn_cast(&V); - if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) { + if (CI && PrivTID && CI->getCalledFunction() == TIDRTLFn.getCallee()) { ReplacementValue = PrivTID; } else { Builder.restoreIP( @@ -507,14 +493,20 @@ LLVM_DEBUG(dbgs() << "After privatization: " << *UI->getFunction() << "\n"); LLVM_DEBUG({ - for (auto *BB : ParallelRegionBlocks) - dbgs() << " PBR: " << BB->getName() << "\n"; + for (auto *BB : OutlinedRegionBlocks) + dbgs() << " OBR: " << BB->getName() << "\n"; }); // Add some known attributes to the outlined function. Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); - OutlinedFn->addParamAttr(0, Attribute::NoAlias); - OutlinedFn->addParamAttr(1, Attribute::NoAlias); + if (DK == OMPD_parallel) { + OutlinedFn->addParamAttr(0, Attribute::NoAlias); + OutlinedFn->addParamAttr(1, Attribute::NoAlias); + } else if (!OutlinedFn->arg_empty()) { + assert(OutlinedFn->arg_size() == 1); + assert(OutlinedFn->arg_begin()->getType()->isPointerTy()); + OutlinedFn->addParamAttr(0, Attribute::NoAlias); + } OutlinedFn->addFnAttr(Attribute::NoUnwind); OutlinedFn->addFnAttr(Attribute::NoRecurse); @@ -530,30 +522,115 @@ // made our own entry block after all. 
{ BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock(); - assert(ArtificialEntry.getUniqueSuccessor() == PRegEntryBB); - assert(PRegEntryBB->getUniquePredecessor() == &ArtificialEntry); - PRegEntryBB->moveBefore(&ArtificialEntry); - ArtificialEntry.eraseFromParent(); + assert(ArtificialEntry.getUniqueSuccessor() == ORegEntryBB); + assert(ORegEntryBB->getUniquePredecessor() == &ArtificialEntry); + ORegEntryBB->moveBefore(&ArtificialEntry); + MergeBlockIntoPredecessor(ORegEntryBB); + ORegEntryBB = &OutlinedFn->getEntryBlock(); } LLVM_DEBUG(dbgs() << "PP Outlined function: " << *OutlinedFn << "\n"); - assert(&OutlinedFn->getEntryBlock() == PRegEntryBB); + assert(&OutlinedFn->getEntryBlock() == ORegEntryBB); assert(OutlinedFn && OutlinedFn->getNumUses() == 1); - assert(OutlinedFn->arg_size() >= 2 && - "Expected at least tid and bounded tid as arguments"); - unsigned NumCapturedVars = OutlinedFn->arg_size() - /* tid & bounded tid */ 2; + if (DK == OMPD_parallel) { + assert(OutlinedFn->arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + } else if (!OutlinedFn->arg_empty()) { + assert(OutlinedFn->arg_size() == 1 && + OutlinedFn->arg_begin()->getType()->isPointerTy() && + "Expected a single struct pointer argument"); + } CallInst *CI = cast(OutlinedFn->user_back()); - CI->getParent()->setName("omp_parallel"); + CI->getParent()->setName(Prefix + "issue"); Builder.SetInsertPoint(CI); - // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn); - Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars), - Builder.CreateBitCast(OutlinedFn, ParallelTaskPtr)}; + // Let the caller create the actual runtime call. 
+ RTLCallCB(*CI); + + LLVM_DEBUG(dbgs() << "With runtime call placed: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + + InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end()); + InsertPointTy ExitIP(ORegExitBB, ORegExitBB->end()); + UI->eraseFromParent(); + + // Initialize the local TID stack location with the argument value. + if (DK == OMPD_parallel) { + Builder.SetInsertPoint(PrivTID); + Function::arg_iterator OutlinedAI = OutlinedFn->arg_begin(); + Builder.CreateStore(Builder.CreateLoad(OutlinedAI), PrivTIDAddr); + } + + // If no "if" clause was present we do not need the call created during + // outlining, otherwise we reuse it in the serialized parallel region. + if (!ElseTI) { + CI->eraseFromParent(); + } else { + + // If an "if" clause was present we are now generating the serialized + // version into the "else" branch. + Builder.SetInsertPoint(ElseTI); + + CI->removeFromParent(); + + // Let the caller create the actual alternative handling code. + AlternativeCB(*CI); + + LLVM_DEBUG(dbgs() << "With `if-clause` alternative code: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + } + + // Adjust the finalization stack, verify the adjustment, and call the + // finalize function a last time to finalize values between the pre-fini block + // and the exit block if we left the parallel "the normal way". 
+ auto FiniInfo = FinalizationStack.pop_back_val(); + (void)FiniInfo; + assert(FiniInfo.DK == DK && "Unexpected finalization stack state!"); + + Instruction *PreFiniTI = ORegPreFiniBB->getTerminator(); + assert(PreFiniTI->getNumSuccessors() == 1 && + PreFiniTI->getSuccessor(0)->size() == 1 && + isa(PreFiniTI->getSuccessor(0)->getTerminator()) && + "Unexpected CFG structure!"); + + InsertPointTy PreFiniIP(ORegPreFiniBB, PreFiniTI->getIterator()); + FiniCB(PreFiniIP); + + for (Instruction *I : ToBeDeleted) + I->eraseFromParent(); + + return AfterIP; +} + +IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( + const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { + if (!updateToLocation(Loc)) + return Loc.IP; + + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + Value *Ident = getOrCreateIdent(SrcLocStr); + Value *ThreadID = getOrCreateThreadID(Ident); - SmallVector RealArgs; - RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs)); - RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end()); + if (NumThreads) { + // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) + Value *Args[] = { + Ident, ThreadID, + Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)}; + Builder.CreateCall( + getOrCreateRuntimeFunction(OMPRTL___kmpc_push_num_threads), Args); + } + + if (ProcBind != OMP_PROC_BIND_default) { + // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind) + Value *Args[] = { + Ident, ThreadID, + ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)}; + Builder.CreateCall(getOrCreateRuntimeFunction(OMPRTL___kmpc_push_proc_bind), + Args); + } FunctionCallee RTLFn = getOrCreateRuntimeFunction(OMPRTL___kmpc_fork_call); if (auto *F = dyn_cast(RTLFn.getCallee())) { @@ -573,69 +650,165 @@ } } - Builder.CreateCall(RTLFn, RealArgs); - - 
LLVM_DEBUG(dbgs() << "With fork_call placed: " - << *Builder.GetInsertBlock()->getParent() << "\n"); - - InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end()); - InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end()); - UI->eraseFromParent(); - - // Initialize the local TID stack location with the argument value. - Builder.SetInsertPoint(PrivTID); - Function::arg_iterator OutlinedAI = OutlinedFn->arg_begin(); - Builder.CreateStore(Builder.CreateLoad(OutlinedAI), PrivTIDAddr); + auto RTLCallCB = [this, Ident, &RTLFn](CallInst &CI) { + // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn); + Function *OutlinedFn = CI.getCalledFunction(); + unsigned NumCapturedVars = + OutlinedFn->arg_size() - /* tid & bounded tid */ 2; - // If no "if" clause was present we do not need the call created during - // outlining, otherwise we reuse it in the serialized parallel region. - if (!ElseTI) { - CI->eraseFromParent(); - } else { + SmallVector Args; + Args.reserve(3 + OutlinedFn->arg_size()); + Args.push_back(Ident); + Args.push_back(Builder.getInt32(NumCapturedVars)); + Args.push_back(Builder.CreateBitCast(OutlinedFn, ParallelTaskPtr)); + Args.append(CI.arg_begin() + /* tid & bound tid */ 2, CI.arg_end()); - // If an "if" clause was present we are now generating the serialized - // version into the "else" branch. 
- Builder.SetInsertPoint(ElseTI); + Builder.CreateCall(RTLFn, Args); + }; + auto AlternativeCB = [this, Ident, ThreadID](CallInst &CI) { // Build calls __kmpc_serialized_parallel(&Ident, GTid); Value *SerializedParallelCallArgs[] = {Ident, ThreadID}; Builder.CreateCall( getOrCreateRuntimeFunction(OMPRTL___kmpc_serialized_parallel), SerializedParallelCallArgs); - // OutlinedFn(&gtid, &zero, CapturedStruct); - CI->removeFromParent(); - Builder.Insert(CI); + Builder.Insert(&CI); // __kmpc_end_serialized_parallel(&Ident, GTid); Value *EndArgs[] = {Ident, ThreadID}; Builder.CreateCall( getOrCreateRuntimeFunction(OMPRTL___kmpc_end_serialized_parallel), EndArgs); + }; - LLVM_DEBUG(dbgs() << "With serialized parallel region: " - << *Builder.GetInsertBlock()->getParent() << "\n"); + return emitOutlinedRegion(Loc, RTLCallCB, Ident, ThreadID, BodyGenCB, PrivCB, + FiniCB, OMPD_parallel, IsCancellable, IfCondition, + AlternativeCB); +} + +Value *OpenMPIRBuilder::emitLocalDependenceInfoArray( + SmallVectorImpl &DependClauseInfos) { + + // Create the array and move it to the entry block. + AllocaInst *DependAI = Builder.CreateAlloca( + DependInfo, Builder.getInt32(DependClauseInfos.size())); + DependAI->moveBefore( + &*DependAI->getFunction()->getEntryBlock().getFirstInsertionPt()); + + // Iterate over the dependence clauses and build the code that fills the + // information in the kmp_depend_info_t. 
+ SmallVector Indices; + Indices.resize(2); + + Value *Zero = Builder.getInt32(0), *One = Builder.getInt32(1); + for (unsigned u = 0, e = DependClauseInfos.size(); u < e; u++) { + Indices[0] = Builder.getInt32(u); + Indices[1] = Zero; + Value *BasePtrAddr = Builder.CreateGEP(DependAI, Indices); + Value *BasePtrVal = + Builder.CreatePtrToInt(DependClauseInfos[u].BasePtr, Int64); + Builder.CreateStore(BasePtrVal, BasePtrAddr); + + Indices[1] = One; + Value *LengthAndFlagsAddr = Builder.CreateGEP(DependAI, Indices); + Value *LengthVal = DependClauseInfos[u].Length; + Value *LengthAndFlagsVal = Builder.CreateOr( + LengthVal, Builder.getInt64(DependClauseInfos[u].Type)); + Builder.CreateStore(LengthAndFlagsVal, LengthAndFlagsAddr); } + return DependAI; +} - // Adjust the finalization stack, verify the adjustment, and call the - // finalize function a last time to finalize values between the pre-fini block - // and the exit block if we left the parallel "the normal way". - auto FiniInfo = FinalizationStack.pop_back_val(); - (void)FiniInfo; - assert(FiniInfo.DK == OMPD_parallel && - "Unexpected finalization stack state!"); +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::CreateTask( + const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *FinalCondition, bool UntiedFlag, bool MergableFlag, + SmallVectorImpl &DependClauseInfos, + unsigned PriorityValue, Value *EventHandle, bool IsCancellable) { + if (!updateToLocation(Loc)) + return Loc.IP; - Instruction *PreFiniTI = PRegPreFiniBB->getTerminator(); - assert(PreFiniTI->getNumSuccessors() == 1 && - PreFiniTI->getSuccessor(0)->size() == 1 && - isa(PreFiniTI->getSuccessor(0)->getTerminator()) && - "Unexpected CFG structure!"); + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + Value *Ident = getOrCreateIdent(SrcLocStr); + Value *ThreadID = getOrCreateThreadID(Ident); - InsertPointTy PreFiniIP(PRegPreFiniBB, PreFiniTI->getIterator()); 
- FiniCB(PreFiniIP); + FunctionCallee RTLFn = getOrCreateRuntimeFunction(OMPRTL___kmpc_task); + if (auto *F = dyn_cast(RTLFn.getCallee())) { + if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) { + llvm::LLVMContext &Ctx = F->getContext(); + MDBuilder MDB(Ctx); + // Annotate the callback behavior of the __kmpc_task: + // - The callback callee is argument number 6 (task_entry). + // - The only argument of the callback callee is argument 5. + F->addMetadata( + llvm::LLVMContext::MD_callback, + *llvm::MDNode::get( + Ctx, {MDB.createCallbackEncoding(6, {5}, + /* VarArgsArePassed */ false)})); + } + } - for (Instruction *I : ToBeDeleted) - I->eraseFromParent(); + uint32_t Flags = 0; + Flags |= UntiedFlag ? 0 : unsigned(OMP_TASKING_FLAG_TIEDNESS); + + auto RTLCallCB = [this, Ident, ThreadID, FinalCondition, IfCondition, &RTLFn, + &DependClauseInfos, Flags](CallInst &CI) { + assert(CI.getCalledFunction() && "TODO"); + // Build call __kmpc_task(ident_t *loc_ref, + // kmp_int32 gtid, + // kmp_int32 flags, + // kmp_int32 final, + // kmp_uint32 sizeof_shared_and_private_vars, + // void *shared_and_private_vars, + // kmp_task_routine_t task_entry, + // kmp_uint32 num_depend_infos, + // kmp_depend_info_t *depend_infos, + // kmp_int32 if_condition) + Function *OutlinedFn = CI.getCalledFunction(); + + unsigned SharedAndPrivateVarsSize = 0; + Value *ArgOp = Constant::getNullValue(VoidPtr); + if (CI.getNumArgOperands()) { + assert(CI.getNumArgOperands() == 1 && "TODO"); + assert(CI.getArgOperand(0)->getType()->isPointerTy() && "TODO"); + ArgOp = CI.getArgOperand(0); + Type *ArgTy = ArgOp->getType()->getPointerElementType(); + const DataLayout &DL = M.getDataLayout(); + SharedAndPrivateVarsSize = DL.getTypeAllocSize(ArgTy); + } - return AfterIP; + Value *DependenceArray = Constant::getNullValue(DependInfoPtr); + if (!DependClauseInfos.empty()) + DependenceArray = emitLocalDependenceInfoArray(DependClauseInfos); + + SmallVector Args; + Args.resize(10); + Args[0] = Ident; + Args[1] 
= ThreadID; + Args[2] = Builder.getInt32(Flags); + if (FinalCondition) + Args[3] = Builder.CreateZExtOrTrunc(FinalCondition, Int32); + else + Args[3] = Builder.getInt32(0); + Args[4] = Builder.getInt32(SharedAndPrivateVarsSize); + Args[5] = Builder.CreateBitCast(ArgOp, VoidPtr); + Args[6] = Builder.CreateBitCast(OutlinedFn, TaskFnPtr); + Args[7] = Builder.getInt32(DependClauseInfos.size()); + Args[8] = DependenceArray; + if (IfCondition) + Args[9] = Builder.CreateZExtOrTrunc(IfCondition, Int32); + else + Args[9] = Builder.getInt32(1); + + Builder.CreateCall(RTLFn, Args); + }; + + auto AlternativeCB = [](CallInst &) { + // The new __kmpc_task will handle the if-condition internally. + }; + + return emitOutlinedRegion(Loc, RTLCallCB, Ident, ThreadID, BodyGenCB, PrivCB, + FiniCB, OMPD_task, IsCancellable, + /* IfCondition */ nullptr, AlternativeCB); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -31,6 +31,7 @@ FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, /*isVarArg=*/false); F = Function::Create(FTy, Function::ExternalLinkage, "", M.get()); + F->arg_begin()->setName("func.arg"); BB = BasicBlock::Create(Ctx, "", F); DIBuilder DIB(*M); @@ -613,4 +614,183 @@ } } +TEST_F(OpenMPIRBuilderTest, TaskSimple) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + AllocaInst *PrivAI = nullptr; + + unsigned NumBodiesGenerated = 0; + unsigned NumPrivatizedVars = 0; + unsigned NumFinalizationPoints = 0; + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + ++NumBodiesGenerated; + + Builder.restoreIP(AllocaIP); + PrivAI = 
Builder.CreateAlloca(F->arg_begin()->getType()); + Builder.CreateStore(F->arg_begin(), PrivAI); + + Builder.restoreIP(CodeGenIP); + Value *PrivLoad = Builder.CreateLoad(PrivAI, "local.use"); + Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad); + Instruction *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(), + &ThenTerm, &ElseTerm); + + Builder.SetInsertPoint(ThenTerm); + Builder.CreateBr(&ContinuationIP); + ThenTerm->eraseFromParent(); + }; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &VPtr, Value *&ReplacementValue) -> InsertPointTy { + ++NumPrivatizedVars; + + if (!isa(VPtr)) { + EXPECT_EQ(&VPtr, F->arg_begin()); + ReplacementValue = &VPtr; + return CodeGenIP; + } + + // Trivial copy (=firstprivate). + Builder.restoreIP(AllocaIP); + Type *VTy = VPtr.getType()->getPointerElementType(); + Value *V = Builder.CreateLoad(VTy, &VPtr, VPtr.getName() + ".reload"); + ReplacementValue = Builder.CreateAlloca(VTy, 0, VPtr.getName() + ".copy"); + Builder.restoreIP(CodeGenIP); + Builder.CreateStore(V, ReplacementValue); + return CodeGenIP; + }; + + auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; }; + + SmallVector DependClauseInfos; + IRBuilder<>::InsertPoint AfterIP = + OMPBuilder.CreateTask(Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, + false, false, DependClauseInfos, 0, nullptr, false); + + EXPECT_EQ(NumBodiesGenerated, 1U); + EXPECT_EQ(NumPrivatizedVars, 1U); + EXPECT_EQ(NumFinalizationPoints, 1U); + + Builder.restoreIP(AfterIP); + Builder.CreateRetVoid(); + + EXPECT_NE(PrivAI, nullptr); + Function *OutlinedFn = PrivAI->getFunction(); + EXPECT_NE(F, OutlinedFn); + EXPECT_FALSE(verifyModule(*M)); + EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind)); + EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoRecurse)); + + EXPECT_EQ(OutlinedFn->arg_size(), 1U); + EXPECT_TRUE(OutlinedFn->hasParamAttribute(0, Attribute::NoAlias)); + + 
EXPECT_TRUE(OutlinedFn->hasInternalLinkage()); + + EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent()); + EXPECT_EQ(OutlinedFn->getNumUses(), 1U); + User *Usr = OutlinedFn->user_back(); + ASSERT_TRUE(isa(Usr)); + CallInst *ForkCI = dyn_cast(Usr->user_back()); + ASSERT_NE(ForkCI, nullptr); + + EXPECT_EQ(ForkCI->getCalledFunction()->getName(), "__kmpc_task"); + EXPECT_EQ(ForkCI->getNumArgOperands(), 10U); + EXPECT_TRUE(isa(ForkCI->getArgOperand(0))); + EXPECT_EQ(ForkCI->getArgOperand(2), + ConstantInt::get(Type::getInt32Ty(Ctx), 1U)); + EXPECT_EQ(ForkCI->getArgOperand(3), + ConstantInt::get(Type::getInt32Ty(Ctx), 0U)); + EXPECT_EQ(ForkCI->getArgOperand(4), + ConstantInt::get(Type::getInt32Ty(Ctx), 4U)); + EXPECT_EQ(ForkCI->getArgOperand(7), + ConstantInt::get(Type::getInt32Ty(Ctx), 0U)); + EXPECT_TRUE(isa(ForkCI->getArgOperand(8))); + EXPECT_EQ(ForkCI->getArgOperand(9), + ConstantInt::get(Type::getInt32Ty(Ctx), 1U)); +} + +TEST_F(OpenMPIRBuilderTest, TaskSimpleEmpty) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + Instruction *InOutlinedFn = nullptr; + + unsigned NumBodiesGenerated = 0; + unsigned NumPrivatizedVars = 0; + unsigned NumFinalizationPoints = 0; + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + ++NumBodiesGenerated; + InOutlinedFn = &*CodeGenIP.getPoint(); + }; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &VPtr, Value *&ReplacementValue) -> InsertPointTy { + ++NumPrivatizedVars; + return CodeGenIP; + }; + + auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; }; + + SmallVector DependClauseInfos; + IRBuilder<>::InsertPoint AfterIP = + OMPBuilder.CreateTask(Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, + false, false, DependClauseInfos, 0, 
nullptr, false); + + EXPECT_EQ(NumBodiesGenerated, 1U); + EXPECT_EQ(NumPrivatizedVars, 0U); + EXPECT_EQ(NumFinalizationPoints, 1U); + + Builder.restoreIP(AfterIP); + Builder.CreateRetVoid(); + + EXPECT_NE(InOutlinedFn, nullptr); + Function *OutlinedFn = InOutlinedFn->getFunction(); + EXPECT_NE(F, OutlinedFn); + EXPECT_FALSE(verifyModule(*M)); + EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind)); + EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoRecurse)); + + EXPECT_EQ(OutlinedFn->arg_size(), 0U); + + EXPECT_TRUE(OutlinedFn->hasInternalLinkage()); + + EXPECT_EQ(OutlinedFn->getNumUses(), 1U); + User *Usr = OutlinedFn->user_back(); + ASSERT_TRUE(isa(Usr)); + CallInst *ForkCI = dyn_cast(Usr->user_back()); + ASSERT_NE(ForkCI, nullptr); + + EXPECT_EQ(ForkCI->getCalledFunction()->getName(), "__kmpc_task"); + EXPECT_EQ(ForkCI->getNumArgOperands(), 10U); + EXPECT_TRUE(isa(ForkCI->getArgOperand(0))); + EXPECT_EQ(ForkCI->getArgOperand(2), + ConstantInt::get(Type::getInt32Ty(Ctx), 1U)); + EXPECT_EQ(ForkCI->getArgOperand(3), + ConstantInt::get(Type::getInt32Ty(Ctx), 0U)); + EXPECT_EQ(ForkCI->getArgOperand(4), + ConstantInt::get(Type::getInt32Ty(Ctx), 0U)); + EXPECT_TRUE(isa(ForkCI->getArgOperand(5))); + EXPECT_EQ(ForkCI->getArgOperand(7), + ConstantInt::get(Type::getInt32Ty(Ctx), 0U)); + EXPECT_TRUE(isa(ForkCI->getArgOperand(8))); + EXPECT_EQ(ForkCI->getArgOperand(9), + ConstantInt::get(Type::getInt32Ty(Ctx), 1U)); +} + } // namespace diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2270,7 +2270,8 @@ unsigned complete : 1; /* 1==complete, 0==not complete */ unsigned freed : 1; /* 1==freed, 0==allocateed */ unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ - unsigned reserved31 : 7; /* reserved for library use */ + unsigned ompirbuilder : 1; /* 1==omp-ir-builder task, 0==classic code gen */ + unsigned reserved31 : 6; /* reserved for library use */ } kmp_tasking_flags_t; @@ 
-3700,6 +3701,18 @@ kmp_task_t *task); #endif // TASK_UNUSED +typedef void (*kmp_task_routine_t)(void *); + +/// Create a new task and schedule it. +/// +/// Note that \p final is passed explicitly to make sure \p flags is a compile +/// time constant at the call sites. +KMP_EXPORT kmp_int32 __kmpc_task( + ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, kmp_int32 final, + kmp_uint32 sizeof_shared_and_private_vars, void *shared_and_private_vars, + kmp_task_routine_t task_entry, kmp_uint32 num_depend_infos, + kmp_depend_info_t *depend_infos, kmp_int32 if_condition); + /* ------------------------------------------------------------------------ */ KMP_EXPORT void __kmpc_taskgroup(ident_t *loc, int gtid); diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1540,12 +1540,9 @@ } #endif -#ifdef KMP_GOMP_COMPAT - if (taskdata->td_flags.native) { + if (taskdata->td_flags.ompirbuilder || taskdata->td_flags.native) { ((void (*)(void *))(*(task->routine)))(task->shareds); - } else -#endif /* KMP_GOMP_COMPAT */ - { + } else { (*(task->routine))(gtid, task); } KMP_POP_PARTITIONED_TIMER(); @@ -1734,6 +1731,56 @@ return res; } +kmp_int32 __kmpc_task(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, + kmp_int32 final, + kmp_uint32 sizeof_shared_and_private_vars, + void *shared_and_private_vars, + kmp_task_routine_t task_entry, + kmp_uint32 num_depend_infos, + kmp_depend_info_t *depend_infos, kmp_int32 if_condition) { + // Adjsut the flags to include final and indicate that this is an ompirbuilder + // task. + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; + input_flags->final = final; + input_flags->ompirbuilder = true; + + // Compute the size of the task object. 
+ size_t sizeof_shared_and_private_vars_padded = + if_condition * + __kmp_round_up_to_val(sizeof_shared_and_private_vars, sizeof(void *)); + size_t sizeof_kmp_task_t_padded = + __kmp_round_up_to_val(sizeof(kmp_task_t), CACHE_LINE); + + // Create the task object. + static_assert(sizeof(kmp_tasking_flags_t) <= sizeof(kmp_int32), + "Flags cannot be represented by an int32!"); + kmp_task_t *task = __kmpc_omp_task_alloc( + loc_ref, gtid, flags, sizeof_kmp_task_t_padded, + sizeof_shared_and_private_vars_padded, kmp_routine_entry_t(task_entry)); + + if (if_condition) { + // Copy the shared and private values into the task. + char *dest = static_cast(task->shareds); + memcpy(dest, shared_and_private_vars, sizeof_shared_and_private_vars); + + // Schedule the task. + if (num_depend_infos == 0) + return __kmpc_omp_task(loc_ref, gtid, task); + else + return __kmpc_omp_task_with_deps(loc_ref, gtid, task, num_depend_infos, + depend_infos, 0, NULL); + } + + // if(0) handling + __kmpc_omp_wait_deps(loc_ref, gtid, num_depend_infos, depend_infos, 0, NULL); + __kmpc_omp_task_begin_if0(loc_ref, gtid, task); + task_entry(shared_and_private_vars); + __kmpc_omp_task_complete_if0(loc_ref, gtid, task); + + // Indicate the task was *not* queued. + return 0; +} + // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule // a taskloop task with the correct OMPT return address //