diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h new file mode 100644 --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h @@ -0,0 +1,260 @@ +//===-- CGOpenMPRuntimeTRegion.h --- OpenMP RT TRegion interface codegen --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Code generation interface for OpenMP target offloading though the generic +// Target Region (TRegion) interface. +// +// See openmp/libomptarget/deviceRTLs/common/target_Region.h for further +// information on the interface functions and their intended use. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H +#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H + +#include "CGOpenMPRuntime.h" +#include "llvm/ADT/SmallBitVector.h" + +namespace clang { +namespace CodeGen { + +class CGOpenMPRuntimeTRegion : public CGOpenMPRuntime { + // TODO: The target region interface only covers kernel codes for now. This + // therefore codegen implicitly assumes the target region kernel + // interface is targeted. Once a second target region interface is put + // in place, e.g., specialized to many-core offloading, we might need + // to make the target interface explicit. + + /// Create an outlined function for a target kernel. + /// + /// \param D Directive to emit. + /// \param ParentName Name of the function that encloses the target region. + /// \param OutlinedFn Outlined function value to be defined by this call. + /// \param OutlinedFnID Outlined function ID value to be defined by this call. + /// \param CodeGen Object containing the target statements. + /// An outlined function may not be an entry if, e.g. the if clause always + /// evaluates to false. + void emitKernel(const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, + const RegionCodeGenTy &CodeGen); + + /// Helper for generic kernel mode, target directive's entry function. + void emitKernelHeader(CodeGenFunction &CGF, llvm::BasicBlock *&ExitBB); + + /// Signal termination of generic mode execution. + void emitKernelFooter(CodeGenFunction &CGF, llvm::BasicBlock *ExitBB); + + // + // Base class overrides. + // + + /// Creates offloading entry for the provided entry ID \a ID, + /// address \a Addr, size \a Size, and flags \a Flags. + void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, + uint64_t Size, int32_t Flags, + llvm::GlobalValue::LinkageTypes Linkage) override; + + /// Emit outlined function for 'target' directive. + /// + /// \param D Directive to emit. + /// \param ParentName Name of the function that encloses the target region. + /// \param OutlinedFn Outlined function value to be defined by this call. + /// \param OutlinedFnID Outlined function ID value to be defined by this call. + /// \param IsOffloadEntry True if the outlined function is an offload entry. + /// An outlined function may not be an entry if, e.g. the if clause always + /// evaluates to false. + void emitTargetOutlinedFunction(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) override; + +protected: + /// Get the function name of an outlined region, customized to the target. + StringRef getOutlinedHelperName() const override { return ".omp_TRegion."; } + +public: + explicit CGOpenMPRuntimeTRegion(CodeGenModule &CGM); + + /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 + /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. + virtual void emitProcBindClause(CodeGenFunction &CGF, + OpenMPProcBindClauseKind ProcBind, + SourceLocation Loc) override; + + /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 + /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads' + /// clause. + /// \param NumThreads An integer value of threads. + virtual void emitNumThreadsClause(CodeGenFunction &CGF, + llvm::Value *NumThreads, + SourceLocation Loc) override; + + /// Set the number of teams to \p NumTeams and the thread limit to + /// \p ThreadLimit. + /// + /// \param NumTeams An integer expression of teams. + /// \param ThreadLimit An integer expression of threads. + void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc) override; + + /// Emits inlined function for the specified OpenMP parallel directive. + /// + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). + /// \param D OpenMP directive. + /// \param ThreadIDVar Variable for thread id in the current OpenMP region. + /// \param InnermostKind Kind of innermost directive (for simple directives it + /// is a directive itself, for combined - its innermost directive). + /// \param CodeGen Code generation sequence for the \a D directive. + llvm::Function * + emitParallelOutlinedFunction(const OMPExecutableDirective &D, + const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, + const RegionCodeGenTy &CodeGen) override; + + /// Emits code for parallel or serial call of the \a OutlinedFn with + /// variables captured in a record which address is stored in \a + /// CapturedStruct. + /// \param OutlinedFn Outlined function to be run in parallel threads. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// \param IfCond Condition in the associated 'if' clause, if it was + /// specified, nullptr otherwise. + void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, + llvm::Function *OutlinedFn, + ArrayRef CapturedVars, + const Expr *IfCond) override; + + /// Emits a critical region. + /// \param CriticalName Name of the critical region. + /// \param CriticalOpGen Generator for the statement associated with the given + /// critical region. + /// \param Hint Value of the 'hint' clause (optional). + void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, + const RegionCodeGenTy &CriticalOpGen, + SourceLocation Loc, + const Expr *Hint = nullptr) override; + + /// Emit a code for reduction clause. + /// + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + /// \param Options List of options for reduction codegen: + /// WithNowait true if parent directive has also nowait clause, false + /// otherwise. + /// SimpleReduction Emit reduction operation only. Used for omp simd + /// directive on the host. + /// ReductionKind The kind of reduction to perform. + virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, + ArrayRef Privates, + ArrayRef LHSExprs, + ArrayRef RHSExprs, + ArrayRef ReductionOps, + ReductionOptionsTy Options) override; + + /// Emits OpenMP-specific function prolog. + /// Required for device constructs. + void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override; + + /// Cleans up references to the objects in finished function. + /// + void functionFinished(CodeGenFunction &CGF) override; + + /// Choose a default value for the dist_schedule clause. + void + getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, + const OMPLoopDirective &S, + OpenMPDistScheduleClauseKind &ScheduleKind, + llvm::Value *&Chunk) const override; + + /// Choose a default value for the schedule clause. + void getDefaultScheduleAndChunk(CodeGenFunction &CGF, + const OMPLoopDirective &S, + OpenMPScheduleClauseKind &ScheduleKind, + const Expr *&ChunkExpr) const override; + + /// Perform check on requires decl to ensure that target architecture + /// supports unified addressing + void checkArchForUnifiedAddressing(CodeGenModule &CGM, + const OMPRequiresDecl *D) const override; + + /// Emits inlined function for the specified OpenMP teams + // directive. + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). + /// \param D OpenMP directive. + /// \param ThreadIDVar Variable for thread id in the current OpenMP region. + /// \param InnermostKind Kind of innermost directive (for simple directives it + /// is a directive itself, for combined - its innermost directive). + /// \param CodeGen Code generation sequence for the \a D directive. + llvm::Function * + emitTeamsOutlinedFunction(const OMPExecutableDirective &D, + const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, + const RegionCodeGenTy &CodeGen) override; + + /// Emits code for teams call of the \a OutlinedFn with + /// variables captured in a record which address is stored in \a + /// CapturedStruct. + /// \param OutlinedFn Outlined function to be run by team masters. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// + void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, + SourceLocation Loc, llvm::Function *OutlinedFn, + ArrayRef CapturedVars) override; + +protected: + + /// Hook to allow derived classes to perform checks on the AST that justify + /// SPMD mode. + virtual bool isKnownSPMDMode() const { return false; } + + /// Hook to allow derived classes to perform checks on the AST that justify + /// execution without runtime support. + virtual bool mayNeedRuntimeSupport() const { return true; } + + /// Hook to allow derived classes to perform checks on the AST that justify + /// execution without data sharing support. + virtual bool mayPerformDataSharing() const { return true; } + +private: + /// Simple container for a wrapper of an outlined parallel function and the + /// layout of the passed variables (= captured variables, both shared and + /// firstprivate). + struct WrapperInfo { + llvm::Function *WrapperFn = nullptr; + llvm::StructType *SharedVarsStructTy = nullptr; + llvm::StructType *PrivateVarsStructTy = nullptr; + llvm::SmallBitVector CaptureIsPrivate; + }; + + /// Map an outlined function to its wrapper and shared struct type. The latter + /// defines the layout of the payload and the wrapper will unpack that payload + /// and pass the values to the outlined function. + llvm::DenseMap WrapperInfoMap; + + /// Emit function which wraps the outline parallel region + /// and controls the parameters which are passed to this function. + /// The wrapper ensures that the outlined function is called + /// with the correct arguments when data is shared. + void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn, + const OMPExecutableDirective &D); +}; + +} // namespace CodeGen +} // namespace clang + +#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEKERNEL_H diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp @@ -0,0 +1,712 @@ +//===-- CGOpenMPRuntimeTRegion.cpp - OpenMP RT TRegion interface codegen --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the code generation interface for OpenMP target offloading +// though the Target Region (TRegion) interface. +// +// See the file comment in CGOpenMPRuntimeTRegion.h for more information. +// +//===----------------------------------------------------------------------===// + +#include "CGOpenMPRuntimeTRegion.h" +#include "CodeGenFunction.h" +#include "clang/AST/StmtVisitor.h" + +using namespace clang; +using namespace CodeGen; + +namespace { + +/// Enums for all functions in the target +enum OpenMPTargetRuntimeLibraryCalls { + OMPRTL__kmpc_target_region_kernel_init, + OMPRTL__kmpc_target_region_kernel_deinit, + OMPRTL__kmpc_target_region_kernel_parallel, +}; + +/// Return the runtime function declaration specified by \p Function. +static llvm::Function *getOrCreateRuntimeFunctionDeclaration( + CGOpenMPRuntimeTRegion &CG, CodeGenModule &CGM, + OpenMPTargetRuntimeLibraryCalls Function) { + + llvm::Function *RTFn; + auto *I1Ty = llvm::IntegerType::getInt1Ty(CGM.getLLVMContext()); + switch (Function) { + case OMPRTL__kmpc_target_region_kernel_init: { + // char __kmpc_target_region_kernel_init(bool UseSPMDMode, + // bool UseStateMachine, + // bool RequiresOMPRuntime, + // bool RequiresDataSharing); + llvm::Type *TypeParams[] = {I1Ty, I1Ty, I1Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int8Ty, TypeParams, /* isVarArg */ false); + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_init") + .getCallee()); + break; + } + case OMPRTL__kmpc_target_region_kernel_deinit: { + // void __kmpc_target_region_kernel_deinit(bool UseSPMDMode, + // bool RequiredOMPRuntime); + llvm::Type *TypeParams[] = {I1Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false); + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_deinit") + .getCallee()); + break; + } + case OMPRTL__kmpc_target_region_kernel_parallel: { + // typedef void (*ParallelWorkFnTy)(void *, void *); + auto *ParWorkFnTy = + llvm::FunctionType::get(CGM.VoidTy, {CGM.VoidPtrTy, CGM.VoidPtrTy}, + /* isVarArg */ false); + // + // void __kmpc_target_region_kernel_parallel(bool UseSPMDMode, + // bool RequiredOMPRuntime, + // ParallelWorkFnTy WorkFn, + // void *SharedVars, + // uint16_t SharedVarsBytes, + // void *PrivateVars, + // uint16_t PrivateVarsBytes, + // bool SharedPointers); + llvm::Type *TypeParams[] = { + I1Ty, I1Ty, ParWorkFnTy->getPointerTo(), + CGM.VoidPtrTy, CGM.Int16Ty, CGM.VoidPtrTy, + CGM.Int16Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false); + + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_parallel") + .getCallee()); + + RTFn->addParamAttr(2, llvm::Attribute::NoCapture); + RTFn->addParamAttr(3, llvm::Attribute::NoCapture); + RTFn->addParamAttr(5, llvm::Attribute::NoCapture); + RTFn->addParamAttr(5, llvm::Attribute::ReadOnly); + + // Add the callback metadata if it is not present already. + if (!RTFn->hasMetadata(llvm::LLVMContext::MD_callback)) { + llvm::LLVMContext &Ctx = RTFn->getContext(); + llvm::MDBuilder MDB(Ctx); + // Annotate the callback behavior of __kmpc_target_region_kernel_parallel: + // - The callback callee is WorkFn, argument 2 starting with 0. + // - The first callback payload is SharedVars. + // - The second callback payload is PrivateVars. + RTFn->addMetadata( + llvm::LLVMContext::MD_callback, + *llvm::MDNode::get( + Ctx, {MDB.createCallbackEncoding(2, {3, 5}, + /* VarArgsArePassed */ false)})); + } + break; + } + } + + // TODO: Remove all globals and set this attribute. + // + // This is overwritten when the definition is linked in. + // RTFn->addFnAttr(llvm::Attribute::InaccessibleMemOrArgMemOnly); + + return RTFn; +} + +} // anonymous namespace + +void CGOpenMPRuntimeTRegion::emitKernel(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + const RegionCodeGenTy &CodeGen) { + WrapperInfoMap.clear(); + + // Emit target region as a standalone region. + class KernelPrePostActionTy : public PrePostActionTy { + CGOpenMPRuntimeTRegion &RT; + llvm::BasicBlock *ExitBB; + + public: + KernelPrePostActionTy(CGOpenMPRuntimeTRegion &RT) + : RT(RT), ExitBB(nullptr) {} + + void Enter(CodeGenFunction &CGF) override { + RT.emitKernelHeader(CGF, ExitBB); + // Skip target region initialization. + RT.setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true); + } + + void Exit(CodeGenFunction &CGF) override { + RT.clearLocThreadIdInsertPt(CGF); + RT.emitKernelFooter(CGF, ExitBB); + } + + } Action(*this); + CodeGen.setAction(Action); + + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + /* IsOffloadEntry */ true, CodeGen); +} + +void CGOpenMPRuntimeTRegion::emitKernelHeader(CodeGenFunction &CGF, + llvm::BasicBlock *&ExitBB) { + CGBuilderTy &Bld = CGF.Builder; + + // Setup BBs in entry function. + llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); + ExitBB = CGF.createBasicBlock(".exit"); + + llvm::Value *Args[] = { + /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()), + /* UseStateMachine */ Bld.getInt1(1), + /* RequiresOMPRuntime */ + Bld.getInt1(mayNeedRuntimeSupport()), + /* RequiresDataSharing */ Bld.getInt1(mayPerformDataSharing())}; + llvm::CallInst *InitCI = CGF.EmitRuntimeCall( + getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_init), + Args); + + llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(InitCI, Bld.getInt8(1)); + + Bld.CreateCondBr(ExecuteCnd, ExecuteBB, ExitBB); + CGF.EmitBlock(ExecuteBB); +} + +void CGOpenMPRuntimeTRegion::emitKernelFooter(CodeGenFunction &CGF, + llvm::BasicBlock *ExitBB) { + if (!CGF.HaveInsertPoint()) + return; + + llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); + CGF.EmitBranch(OMPDeInitBB); + + CGF.EmitBlock(OMPDeInitBB); + + CGBuilderTy &Bld = CGF.Builder; + // DeInitialize the OMP state in the runtime; called by all active threads. + llvm::Value *Args[] = {/* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()), + /* RequiredOMPRuntime */ + Bld.getInt1(mayNeedRuntimeSupport())}; + + CGF.EmitRuntimeCall(getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_deinit), + Args); + + CGF.EmitBranch(ExitBB); + CGF.EmitBlock(ExitBB); +} + +void CGOpenMPRuntimeTRegion::emitTargetOutlinedFunction( + const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { + if (!IsOffloadEntry) // Nothing to do. + return; + + assert(!ParentName.empty() && "Invalid target region parent name!"); + + emitKernel(D, ParentName, OutlinedFn, OutlinedFnID, CodeGen); + + // Create a unique global variable to indicate the execution mode of this + // target region. The execution mode is either 'non-SPMD' or 'SPMD'. Initially + // all regions are executed in non-SPMD mode. This variable is picked up by + // the offload library to setup the device appropriately before kernel launch. + auto *GVMode = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /* isConstant */ true, + llvm::GlobalValue::WeakAnyLinkage, llvm::ConstantInt::get(CGM.Int8Ty, 1), + Twine(OutlinedFn->getName(), "_exec_mode")); + CGM.addCompilerUsedGlobal(GVMode); +} + +CGOpenMPRuntimeTRegion::CGOpenMPRuntimeTRegion(CodeGenModule &CGM) + : CGOpenMPRuntime(CGM, "_", "$") { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("TRegion code generation does only handle device code!"); +} + +void CGOpenMPRuntimeTRegion::emitProcBindClause( + CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, + SourceLocation Loc) { + CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); +} + +void CGOpenMPRuntimeTRegion::emitNumThreadsClause(CodeGenFunction &CGF, + llvm::Value *NumThreads, + SourceLocation Loc) { + CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); +} + +void CGOpenMPRuntimeTRegion::emitNumTeamsClause(CodeGenFunction &CGF, + const Expr *NumTeams, + const Expr *ThreadLimit, + SourceLocation Loc) { + // Nothing to do for kernel mode, no other modes supported yet. +} + +llvm::Function *CGOpenMPRuntimeTRegion::emitParallelOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + // Emit target region as a standalone region. + llvm::Function *OutlinedFun = + cast(CGOpenMPRuntime::emitParallelOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen)); + + createParallelDataSharingWrapper(OutlinedFun, D); + + return OutlinedFun; +} + +// TODO: Check if this is actually needed. +static const ValueDecl *getUnderlyingVar(const Expr *E) { + E = E->IgnoreParens(); + if (const auto *ASE = dyn_cast(E)) { + const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + E = Base; + } else if (auto *OASE = dyn_cast(E)) { + const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempOASE = dyn_cast(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + E = Base; + } + E = E->IgnoreParenImpCasts(); + if (const auto *DE = dyn_cast(E)) + return cast(DE->getDecl()->getCanonicalDecl()); + const auto *ME = cast(E); + return cast(ME->getMemberDecl()->getCanonicalDecl()); +} + +void CGOpenMPRuntimeTRegion::createParallelDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { + ASTContext &Ctx = CGM.getContext(); + const auto &CS = *D.getCapturedStmt(OMPD_parallel); + + // Create a function that takes as argument the source thread. + FunctionArgList WrapperArgs; + ImplicitParamDecl SharedVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(), + /* Id */ nullptr, Ctx.VoidPtrTy, + ImplicitParamDecl::Other); + ImplicitParamDecl PrivateVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(), + /* Id */ nullptr, Ctx.VoidPtrTy, + ImplicitParamDecl::Other); + WrapperArgs.emplace_back(&SharedVarsArgDecl); + WrapperArgs.emplace_back(&PrivateVarsArgDecl); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); + + auto *WrapperFn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), WrapperFn, CGFI); + + OutlinedParallelFn->setLinkage(llvm::GlobalValue::InternalLinkage); + OutlinedParallelFn->setDoesNotRecurse(); + WrapperFn->setLinkage(llvm::GlobalValue::InternalLinkage); + WrapperFn->setDoesNotRecurse(); + + CodeGenFunction CGF(CGM, /* suppressNewContext */ true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WrapperFn, CGFI, WrapperArgs, + D.getBeginLoc(), D.getBeginLoc()); + + auto AI = WrapperFn->arg_begin(); + llvm::Argument &SharedVarsArg = *(AI++); + llvm::Argument &PrivateVarsArg = *(AI); + SharedVarsArg.setName("shared_vars"); + PrivateVarsArg.setName("private_vars"); + + Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( + /* DestWidth */ 32, /* Signed */ 1), + /* Name */ ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/* C */ 0)); + + setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true); + + // Create the array of arguments and fill it with boilerplate values. + SmallVector Args; + Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer()); + Args.emplace_back(ZeroAddr.getPointer()); + + CGBuilderTy &Bld = CGF.Builder; + + // Collect all variables marked as shared. + llvm::SmallPtrSet SharedVars; + for (const auto *C : D.getClausesOfKind()) + for (const Expr *E : C->getVarRefs()) + SharedVars.insert(getUnderlyingVar(E)); + + // Retrieve the shared and private variables from argument pointers and pass + // them to the outlined function. + llvm::SmallVector SharedStructMemberTypes; + llvm::SmallVector PrivateStructMemberTypes; + + WrapperInfo &WI = WrapperInfoMap[OutlinedParallelFn]; + WI.WrapperFn = WrapperFn; + + auto ArgIt = OutlinedParallelFn->arg_begin() + /* global_tid & bound_tid */ 2; + + // If we require loop bounds they are already part of the outlined function + // encoding, just after global_tid and bound_tid. + bool RequiresLoopBounds = + isOpenMPLoopBoundSharingDirective(D.getDirectiveKind()); + if (RequiresLoopBounds) { + // Register the lower bound in the wrapper info. + WI.CaptureIsPrivate.push_back(true); + PrivateStructMemberTypes.push_back((ArgIt++)->getType()); + // Register the upper bound in the wrapper info. + WI.CaptureIsPrivate.push_back(true); + PrivateStructMemberTypes.push_back((ArgIt++)->getType()); + } + + auto CIt = CS.capture_begin(); + for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CIt) { + bool IsPrivate = CIt->capturesVariableArrayType() || + CIt->capturesVariableByCopy() || + !SharedVars.count(CIt->getCapturedVar()); + WI.CaptureIsPrivate.push_back(IsPrivate); + + auto &StructMemberTypes = + IsPrivate ? PrivateStructMemberTypes : SharedStructMemberTypes; + llvm::Type *ArgTy = (ArgIt++)->getType(); + if (!IsPrivate) { + assert(ArgTy->isPointerTy()); + ArgTy = ArgTy->getPointerElementType(); + } + StructMemberTypes.push_back(ArgTy); + } + + // Verify the position of the outlined function argument iterator as a sanity + // check. + assert(ArgIt == OutlinedParallelFn->arg_end() && + "Not all arguments have been processed!"); + + llvm::Value *SharedVarsStructPtr = nullptr; + llvm::Value *PrivateVarsStructPtr = nullptr; + llvm::LLVMContext &LLVMCtx = OutlinedParallelFn->getContext(); + if (!PrivateStructMemberTypes.empty()) { + WI.PrivateVarsStructTy = llvm::StructType::create( + LLVMCtx, PrivateStructMemberTypes, "omp.private.struct"); + PrivateVarsStructPtr = Bld.CreateBitCast( + &PrivateVarsArg, WI.PrivateVarsStructTy->getPointerTo()); + } + if (!SharedStructMemberTypes.empty()) { + WI.SharedVarsStructTy = llvm::StructType::create( + LLVMCtx, SharedStructMemberTypes, "omp.shared.struct"); + SharedVarsStructPtr = Bld.CreateBitCast( + &SharedVarsArg, WI.SharedVarsStructTy->getPointerTo()); + } + + assert(WI.CaptureIsPrivate.size() + /* global_tid & bound_tid */ 2 == + OutlinedParallelFn->arg_size() && + "Not all arguments have been processed!"); + + unsigned PrivateIdx = 0, SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + + llvm::Value *StructPtr = + IsPrivate ? PrivateVarsStructPtr : SharedVarsStructPtr; + unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx; + + // TODO: Figure out the real alignment + if (IsPrivate) { + Args.emplace_back( + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1)); + } else { + llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++); + Args.emplace_back(GEP); + } + } + + assert(Args.size() == OutlinedParallelFn->arg_size()); + emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); + + CGF.FinishFunction(); + + clearLocThreadIdInsertPt(CGF); +} + +void CGOpenMPRuntimeTRegion::emitParallelCall( + CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *Fn, + ArrayRef CapturedVars, const Expr *IfCond) { + if (!CGF.HaveInsertPoint()) + return; + + const WrapperInfo &WI = WrapperInfoMap[Fn]; + + auto &&ParGen = [this, CapturedVars, WI](CodeGenFunction &CGF, + PrePostActionTy &) { + CGBuilderTy &Bld = CGF.Builder; + assert(WI.WrapperFn && "Wrapper function does not exist!"); + + llvm::Value *SharedVarsSize = llvm::Constant::getNullValue(CGM.Int16Ty); + llvm::Value *PrivateVarsSize = SharedVarsSize; + llvm::Value *SharedStructAlloca = llvm::UndefValue::get(CGM.VoidPtrTy); + llvm::Value *PrivateStructAlloca = SharedStructAlloca; + + if (WI.SharedVarsStructTy) { + SharedStructAlloca = CGF.CreateDefaultAlignTempAlloca( + WI.SharedVarsStructTy, ".shared.vars") + .getPointer(); + const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout(); + SharedVarsSize = Bld.getInt16(DL.getTypeAllocSize(WI.SharedVarsStructTy)); + } + if (WI.PrivateVarsStructTy) { + PrivateStructAlloca = CGF.CreateDefaultAlignTempAlloca( + WI.PrivateVarsStructTy, ".private.vars") + .getPointer(); + const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout(); + PrivateVarsSize = + Bld.getInt16(DL.getTypeAllocSize(WI.PrivateVarsStructTy)); + } + + llvm::SmallVector Args; + Args.push_back( + /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode())); + Args.push_back( + /* RequiredOMPRuntime */ Bld.getInt1(mayNeedRuntimeSupport())); + Args.push_back(WI.WrapperFn); + Args.push_back(CGF.EmitCastToVoidPtr(SharedStructAlloca)); + Args.push_back(SharedVarsSize); + Args.push_back(CGF.EmitCastToVoidPtr(PrivateStructAlloca)); + Args.push_back(PrivateVarsSize); + Args.push_back( + /* SharedPointers */ Bld.getInt1(0)); + + assert((CapturedVars.empty() || + (WI.SharedVarsStructTy || WI.PrivateVarsStructTy)) && + "Expected the shared or private struct type to be set if variables " + "are captured!"); + assert((CapturedVars.empty() || + CapturedVars.size() == + (WI.SharedVarsStructTy ? WI.SharedVarsStructTy->getNumElements() + : 0) + + (WI.PrivateVarsStructTy + ? WI.PrivateVarsStructTy->getNumElements() + : 0)) && + "# elements in shared struct types should be number of captured " + "variables!"); + + // Store all captured variables into a single local structure that is then + // passed to the runtime library. + unsigned PrivateIdx = 0, SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + + llvm::Value *StructPtr = + IsPrivate ? PrivateStructAlloca : SharedStructAlloca; + unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx; + llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++); + llvm::Value *Var = IsPrivate ? CapturedVars[i] + : Bld.CreateAlignedLoad(CapturedVars[i], 1); + Bld.CreateDefaultAlignedStore(Var, GEP); + } + + CGF.EmitRuntimeCall( + getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_parallel), + Args); + + SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + if (IsPrivate) + continue; + + llvm::Value *GEP = Bld.CreateStructGEP(SharedStructAlloca, SharedIdx++); + llvm::Value *Var = Bld.CreateAlignedLoad(GEP, 1); + Bld.CreateDefaultAlignedStore(Var, CapturedVars[i]); + } + }; + + auto &&SeqGen = [this, &ParGen, Loc](CodeGenFunction &CGF, + PrePostActionTy &Action) { + // Use an artifical "num_threads(1)" clause to force sequential execution if + // the expression in the 'if clause' evaluated to false. We expect the + // middle-end to clean this up. + emitNumThreadsClause(CGF, CGF.Builder.getInt32(/* C */ 1), Loc); + ParGen(CGF, Action); + }; + + if (IfCond) { + emitOMPIfClause(CGF, IfCond, ParGen, SeqGen); + } else { + CodeGenFunction::RunCleanupsScope Scope(CGF); + RegionCodeGenTy ThenRCG(ParGen); + ThenRCG(CGF); + } +} + +llvm::Function *CGOpenMPRuntimeTRegion::emitTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + // Emit target region as a standalone region. + llvm::Function *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen); + + return OutlinedFunVal; +} + +void CGOpenMPRuntimeTRegion::emitTeamsCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, + llvm::Function *OutlinedFn, ArrayRef CapturedVars) { + if (!CGF.HaveInsertPoint()) + return; + + Address ZeroAddr = CGF.CreateMemTemp( + CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), + /*Name*/ ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + llvm::SmallVector OutlinedFnArgs; + OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); + OutlinedFnArgs.push_back(ZeroAddr.getPointer()); + OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); + emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); +} + +void CGOpenMPRuntimeTRegion::emitCriticalRegion( + CodeGenFunction &CGF, StringRef CriticalName, + const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, + const Expr *Hint) { + llvm_unreachable( + "TODO: TRegion code generation does not support critical regions yet!"); +} + +void CGOpenMPRuntimeTRegion::emitReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps, ReductionOptionsTy Options) { + llvm_unreachable( + "TODO: TRegion code generation does not support reductions yet!"); +} + +void CGOpenMPRuntimeTRegion::emitFunctionProlog(CodeGenFunction &CGF, + const Decl *D) {} + +void CGOpenMPRuntimeTRegion::functionFinished(CodeGenFunction &CGF) { + CGOpenMPRuntime::functionFinished(CGF); +} + +void CGOpenMPRuntimeTRegion::getDefaultDistScheduleAndChunk( + CodeGenFunction &CGF, const OMPLoopDirective &S, + OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { + CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk); +} + +void CGOpenMPRuntimeTRegion::getDefaultScheduleAndChunk( + CodeGenFunction &CGF, const OMPLoopDirective &S, + OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const { + ScheduleKind = OMPC_SCHEDULE_static; + // Chunk size is 1 in this case. + llvm::APInt ChunkSize(32, 1); + ChunkExpr = IntegerLiteral::Create( + CGF.getContext(), ChunkSize, + CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ 0), + SourceLocation()); +} + +// ------------------------------------------------------------------------ // +// TODO: The following cuda specific part should live somewhere else, +// potentially in a derived class. + +void CGOpenMPRuntimeTRegion::createOffloadEntry( + llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t, + llvm::GlobalValue::LinkageTypes) { + // TODO: Add support for global variables on the device after declare target + // support. + if (!isa(Addr)) + return; + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); + + // Get "nvvm.annotations" metadata node + llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + + llvm::Metadata *MDVals[] = { + llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"), + llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; + // Append metadata to nvvm.annotations + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); +} + +#include "clang/Basic/Cuda.h" + +// Get current CudaArch and ignore any unknown values +static CudaArch getCudaArch(CodeGenModule &CGM) { + if (!CGM.getTarget().hasFeature("ptx")) + return CudaArch::UNKNOWN; + llvm::StringMap Features; + CGM.getTarget().initFeatureMap(Features, CGM.getDiags(), + CGM.getTarget().getTargetOpts().CPU, + CGM.getTarget().getTargetOpts().Features); + for (const auto &Feature : Features) { + if (Feature.getValue()) { + CudaArch Arch = StringToCudaArch(Feature.getKey()); + if (Arch != CudaArch::UNKNOWN) + return Arch; + } + } + return CudaArch::UNKNOWN; +} + +/// Check to see if target architecture supports unified addressing which is +/// a restriction for OpenMP requires clause "unified_shared_memory". +void CGOpenMPRuntimeTRegion::checkArchForUnifiedAddressing( + CodeGenModule &CGM, const OMPRequiresDecl *D) const { + for (const OMPClause *Clause : D->clauselists()) { + if (Clause->getClauseKind() == OMPC_unified_shared_memory) { + switch (getCudaArch(CGM)) { + case CudaArch::SM_20: + case CudaArch::SM_21: + case CudaArch::SM_30: + case CudaArch::SM_32: + case CudaArch::SM_35: + case CudaArch::SM_37: + case CudaArch::SM_50: + case CudaArch::SM_52: + case CudaArch::SM_53: + case CudaArch::SM_60: + case CudaArch::SM_61: + case CudaArch::SM_62: + CGM.Error(Clause->getBeginLoc(), + "Target architecture does not support unified addressing"); + return; + case CudaArch::SM_70: + case CudaArch::SM_72: + case CudaArch::SM_75: + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + case CudaArch::GFX904: + case CudaArch::GFX906: + case CudaArch::GFX909: + case CudaArch::UNKNOWN: + break; + case CudaArch::LAST: + llvm_unreachable("Unexpected Cuda arch."); + } + } + } +} diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -69,6 +69,7 @@ CGOpenCLRuntime.cpp CGOpenMPRuntime.cpp CGOpenMPRuntimeNVPTX.cpp + CGOpenMPRuntimeTRegion.cpp CGRecordLayoutBuilder.cpp CGStmt.cpp CGStmtOpenMP.cpp diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -20,6 +20,7 @@ #include "CGOpenCLRuntime.h" #include "CGOpenMPRuntime.h" #include "CGOpenMPRuntimeNVPTX.h" +#include "CGOpenMPRuntimeTRegion.h" #include "CodeGenFunction.h" #include "CodeGenPGO.h" #include "ConstantEmitter.h" @@ -67,6 +68,11 @@ llvm::cl::desc("Emit limited coverage mapping information (experimental)"), llvm::cl::init(false)); +static llvm::cl::opt UseGenericTRegionInterface( + "openmp-tregion-runtime", llvm::cl::ZeroOrMore, llvm::cl::Hidden, + llvm::cl::desc("Use the generic target region OpenMP runtime interface"), + llvm::cl::init(false)); + static const char AnnotationSection[] = "llvm.metadata"; static CGCXXABI *createCXXABI(CodeGenModule &CGM) { @@ -206,7 +212,10 @@ case llvm::Triple::nvptx64: assert(getLangOpts().OpenMPIsDevice && "OpenMP NVPTX is only prepared to deal with device code."); - OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); + if (UseGenericTRegionInterface) + OpenMPRuntime.reset(new CGOpenMPRuntimeTRegion(*this)); + else + OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); break; default: if (LangOpts.OpenMPSimd) diff --git a/clang/test/OpenMP/target_tregion_no_SPMD_mode.c b/clang/test/OpenMP/target_tregion_no_SPMD_mode.c new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/target_tregion_no_SPMD_mode.c @@ -0,0 +1,72 @@ +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -mllvm -openmp-tregion-runtime -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s + +// CHECK: loop_in_loop_in_tregion +// CHECK: %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) +// CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) +void loop_in_loop_in_tregion(int *A, int *B) { +#pragma omp target + for (int i = 0; i < 512; i++) { + for (int j = 0; j < 1024; j++) + A[j] += B[i + j]; + } +} + +// CHECK: parallel_loops_and_accesses_in_tregion +// CHECK: %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) +// CHECK: call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) +// CHECK: call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) +// CHECK: call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) +// CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) +void parallel_loops_and_accesses_in_tregion(int *A, int *B) { +#pragma omp target + { +#pragma omp parallel for + for (int j = 0; j < 1024; j++) + A[j] += B[0 + j]; +#pragma omp parallel for + for (int j = 0; j < 1024; j++) + A[j] += B[1 + j]; +#pragma omp parallel for + for (int j = 0; j < 1024; j++) + A[j] += B[2 + j]; + + // This needs a guard in SPMD mode + A[0] = B[0]; + } +} + +void extern_func(); +static void parallel_loop(int *A, int *B, int i) { +#pragma omp parallel for + for (int j = 0; j < 1024; j++) + A[j] += B[i + j]; +} + +// CHECK: parallel_loop_in_function_in_loop_with_global_acc_in_tregion +// CHECK: %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) +// CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) +int Global[512]; +void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) { +#pragma omp target + for (int i = 0; i < 512; i++) { + parallel_loop(A, B, i); + Global[i]++; + } +} + +// CHECK: parallel_loop +// CHECK: call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + +// CHECK: parallel_loops_in_functions_and_extern_func_in_tregion +// CHECK: %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) +// CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) +void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) { +#pragma omp target + { + parallel_loop(A, B, 1); + parallel_loop(A, B, 2); + extern_func(); + parallel_loop(A, B, 3); + } +}