diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
@@ -0,0 +1,260 @@
+//===-- CGOpenMPRuntimeTRegion.h --- OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code generation interface for OpenMP target offloading through the generic
+// Target Region (TRegion) interface.
+//
+// See openmp/libomptarget/deviceRTLs/common/target_region.h for further
+// information on the interface functions and their intended use.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+
+#include "CGOpenMPRuntime.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeTRegion : public CGOpenMPRuntime {
+  // TODO: The target region interface only covers kernel code for now. The
+  //       code generation therefore implicitly assumes that the target region
+  //       kernel interface is targeted. Once a second target region interface
+  //       is put in place, e.g., one specialized for many-core offloading, we
+  //       might need to make the targeted interface explicit.
+
+  /// Create an outlined function for a target kernel.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param CodeGen Object containing the target statements.
+  void emitKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+                  const RegionCodeGenTy &CodeGen);
+
+  /// Helper to emit the kernel header for the target directive's entry
+  /// function (generic kernel mode).
+  void emitKernelHeader(CodeGenFunction &CGF, llvm::BasicBlock *&ExitBB);
+
+  /// Helper to emit the kernel footer; signals termination of generic mode
+  /// execution.
+  void emitKernelFooter(CodeGenFunction &CGF, llvm::BasicBlock *ExitBB);
+
+  //
+  // Base class overrides.
+  //
+
+  /// Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr, size \a Size, and flags \a Flags.
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+
+  /// Emit outlined function for 'target' directive.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g., the 'if' clause always
+  /// evaluates to false.
+  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
+                                  StringRef ParentName,
+                                  llvm::Function *&OutlinedFn,
+                                  llvm::Constant *&OutlinedFnID,
+                                  bool IsOffloadEntry,
+                                  const RegionCodeGenTy &CodeGen) override;
+
+protected:
+  /// Get the function name of an outlined region, customized to the target.
+  StringRef getOutlinedHelperName() const override { return ".omp_TRegion."; }
+
+public:
+  explicit CGOpenMPRuntimeTRegion(CodeGenModule &CGM);
+
+  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
+  /// global_tid, int proc_bind) to generate code for the 'proc_bind' clause.
+  virtual void emitProcBindClause(CodeGenFunction &CGF,
+                                  OpenMPProcBindClauseKind ProcBind,
+                                  SourceLocation Loc) override;
+
+  /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
+  /// global_tid, kmp_int32 num_threads) to generate code for the 'num_threads'
+  /// clause.
+  /// \param NumThreads An integer value of threads.
+  virtual void emitNumThreadsClause(CodeGenFunction &CGF,
+                                    llvm::Value *NumThreads,
+                                    SourceLocation Loc) override;
+
+  /// Set the number of teams to \p NumTeams and the thread limit to
+  /// \p ThreadLimit.
+  ///
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// Emits an inlined function for the specified OpenMP parallel directive
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitParallelOutlinedFunction(const OMPExecutableDirective &D,
+                               const VarDecl *ThreadIDVar,
+                               OpenMPDirectiveKind InnermostKind,
+                               const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for a parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Function *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
+
+  /// Emits a critical region.
+  /// \param CriticalName Name of the critical region.
+  /// \param CriticalOpGen Generator for the statement associated with the given
+  /// critical region.
+  /// \param Hint Value of the 'hint' clause (optional).
+  void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName,
+                          const RegionCodeGenTy &CriticalOpGen,
+                          SourceLocation Loc,
+                          const Expr *Hint = nullptr) override;
+
+  /// Emit code for the reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param Options List of options for reduction codegen:
+  ///     WithNowait true if the parent directive also has a nowait clause,
+  ///     false otherwise.
+  ///     SimpleReduction Emit reduction operation only. Used for omp simd
+  ///     directive on the host.
+  ///     ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             ReductionOptionsTy Options) override;
+
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
+
+  /// Cleans up references to the objects in the finished function.
+  void functionFinished(CodeGenFunction &CGF) override;
+
+  /// Choose a default value for the dist_schedule clause.
+  void
+  getDefaultDistScheduleAndChunk(CodeGenFunction &CGF,
+                                 const OMPLoopDirective &S,
+                                 OpenMPDistScheduleClauseKind &ScheduleKind,
+                                 llvm::Value *&Chunk) const override;
+
+  /// Choose a default value for the schedule clause.
+  void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
+                                  const OMPLoopDirective &S,
+                                  OpenMPScheduleClauseKind &ScheduleKind,
+                                  const Expr *&ChunkExpr) const override;
+
+  /// Perform a check on the requires decl to ensure that the target
+  /// architecture supports unified addressing.
+  void checkArchForUnifiedAddressing(CodeGenModule &CGM,
+                                     const OMPRequiresDecl *D) const override;
+
+  /// Emits an inlined function for the specified OpenMP teams directive
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                            const VarDecl *ThreadIDVar,
+                            OpenMPDirectiveKind InnermostKind,
+                            const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for a teams call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Function *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
+
+protected:
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// SPMD mode.
+  virtual bool isKnownSPMDMode() const { return false; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without runtime support.
+  virtual bool mayNeedRuntimeSupport() const { return true; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without data sharing support.
+  virtual bool mayPerformDataSharing() const { return true; }
+
+private:
+  /// Simple container for a wrapper of an outlined parallel function and the
+  /// layout of the passed variables (= captured variables, both shared and
+  /// firstprivate).
+  struct WrapperInfo {
+    llvm::Function *WrapperFn = nullptr;
+    llvm::StructType *SharedVarsStructTy = nullptr;
+    llvm::StructType *PrivateVarsStructTy = nullptr;
+    llvm::SmallBitVector CaptureIsPrivate;
+  };
+
+  /// Map an outlined function to its wrapper and the struct types. The latter
+  /// define the layout of the payload which the wrapper unpacks before it
+  /// passes the values to the outlined function.
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
+
+  /// Emit a function which wraps the outlined parallel region and controls the
+  /// parameters which are passed to this function. The wrapper ensures that
+  /// the outlined function is called with the correct arguments when data is
+  /// shared.
+  void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn,
+                                        const OMPExecutableDirective &D);
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
@@ -0,0 +1,712 @@
+//===-- CGOpenMPRuntimeTRegion.cpp - OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the code generation interface for OpenMP target offloading
+// through the Target Region (TRegion) interface.
+//
+// See the file comment in CGOpenMPRuntimeTRegion.h for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeTRegion.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+namespace {
+
+/// Enums for all functions of the target runtime library used by this codegen.
+enum OpenMPTargetRuntimeLibraryCalls {
+  OMPRTL__kmpc_target_region_kernel_init,
+  OMPRTL__kmpc_target_region_kernel_deinit,
+  OMPRTL__kmpc_target_region_kernel_parallel,
+};
+
+/// Return the runtime function declaration specified by \p Function.
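+///
+/// Note on the parallel entry point: the declaration created for
+/// __kmpc_target_region_kernel_parallel is also annotated with `!callback`
+/// metadata (see below). As a rough, illustrative sketch (not copied from the
+/// implementation), the resulting IR annotation looks like
+///   !callback !0
+///   !0 = !{!1}
+///   !1 = !{i64 2, i64 3, i64 5, i1 false}
+/// i.e., the callee is argument 2 (WorkFn) and it is invoked with arguments 3
+/// (SharedVars) and 5 (PrivateVars).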
+static llvm::Function *getOrCreateRuntimeFunctionDeclaration( + CGOpenMPRuntimeTRegion &CG, CodeGenModule &CGM, + OpenMPTargetRuntimeLibraryCalls Function) { + + llvm::Function *RTFn; + auto *I1Ty = llvm::IntegerType::getInt1Ty(CGM.getLLVMContext()); + switch (Function) { + case OMPRTL__kmpc_target_region_kernel_init: { + // char __kmpc_target_region_kernel_init(bool UseSPMDMode, + // bool UseStateMachine, + // bool RequiresOMPRuntime, + // bool RequiresDataSharing); + llvm::Type *TypeParams[] = {I1Ty, I1Ty, I1Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int8Ty, TypeParams, /* isVarArg */ false); + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_init") + .getCallee()); + break; + } + case OMPRTL__kmpc_target_region_kernel_deinit: { + // void __kmpc_target_region_kernel_deinit(bool UseSPMDMode, + // bool RequiredOMPRuntime); + llvm::Type *TypeParams[] = {I1Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false); + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_deinit") + .getCallee()); + break; + } + case OMPRTL__kmpc_target_region_kernel_parallel: { + // typedef void (*ParallelWorkFnTy)(void *, void *); + auto *ParWorkFnTy = + llvm::FunctionType::get(CGM.VoidTy, {CGM.VoidPtrTy, CGM.VoidPtrTy}, + /* isVarArg */ false); + // + // void __kmpc_target_region_kernel_parallel(bool UseSPMDMode, + // bool RequiredOMPRuntime, + // ParallelWorkFnTy WorkFn, + // void *SharedVars, + // uint16_t SharedVarsBytes, + // void *PrivateVars, + // uint16_t PrivateVarsBytes, + // bool SharedPointers); + llvm::Type *TypeParams[] = { + I1Ty, I1Ty, ParWorkFnTy->getPointerTo(), + CGM.VoidPtrTy, CGM.Int16Ty, CGM.VoidPtrTy, + CGM.Int16Ty, I1Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false); + + RTFn = cast( + CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_parallel") + .getCallee()); + + RTFn->addParamAttr(2, llvm::Attribute::NoCapture); + RTFn->addParamAttr(3, llvm::Attribute::NoCapture); + RTFn->addParamAttr(5, llvm::Attribute::NoCapture); + RTFn->addParamAttr(5, llvm::Attribute::ReadOnly); + + // Add the callback metadata if it is not present already. + if (!RTFn->hasMetadata(llvm::LLVMContext::MD_callback)) { + llvm::LLVMContext &Ctx = RTFn->getContext(); + llvm::MDBuilder MDB(Ctx); + // Annotate the callback behavior of __kmpc_target_region_kernel_parallel: + // - The callback callee is WorkFn, argument 2 starting with 0. + // - The first callback payload is SharedVars. + // - The second callback payload is PrivateVars. + RTFn->addMetadata( + llvm::LLVMContext::MD_callback, + *llvm::MDNode::get( + Ctx, {MDB.createCallbackEncoding(2, {3, 5}, + /* VarArgsArePassed */ false)})); + } + break; + } + } + + // TODO: Remove all globals and set this attribute. + // + // This is overwritten when the definition is linked in. + // RTFn->addFnAttr(llvm::Attribute::InaccessibleMemOrArgMemOnly); + + return RTFn; +} + +} // anonymous namespace + +void CGOpenMPRuntimeTRegion::emitKernel(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + const RegionCodeGenTy &CodeGen) { + WrapperInfoMap.clear(); + + // Emit target region as a standalone region. 
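+  //
+  // Illustrative sketch only (names are not part of the implementation): the
+  // emitted kernel entry is expected to look roughly like
+  //
+  //   if (__kmpc_target_region_kernel_init(IsSPMD, /* UseStateMachine */ true,
+  //                                        NeedsRT, NeedsDataSharing) == 1) {
+  //     ... target region statements ...
+  //     __kmpc_target_region_kernel_deinit(IsSPMD, NeedsRT);
+  //   }
+  //   return;
+  //
+  // The header/footer helpers below emit the guard and the deinit call.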
+ class KernelPrePostActionTy : public PrePostActionTy { + CGOpenMPRuntimeTRegion &RT; + llvm::BasicBlock *ExitBB; + + public: + KernelPrePostActionTy(CGOpenMPRuntimeTRegion &RT) + : RT(RT), ExitBB(nullptr) {} + + void Enter(CodeGenFunction &CGF) override { + RT.emitKernelHeader(CGF, ExitBB); + // Skip target region initialization. + RT.setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true); + } + + void Exit(CodeGenFunction &CGF) override { + RT.clearLocThreadIdInsertPt(CGF); + RT.emitKernelFooter(CGF, ExitBB); + } + + } Action(*this); + CodeGen.setAction(Action); + + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + /* IsOffloadEntry */ true, CodeGen); +} + +void CGOpenMPRuntimeTRegion::emitKernelHeader(CodeGenFunction &CGF, + llvm::BasicBlock *&ExitBB) { + CGBuilderTy &Bld = CGF.Builder; + + // Setup BBs in entry function. + llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); + ExitBB = CGF.createBasicBlock(".exit"); + + llvm::Value *Args[] = { + /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()), + /* UseStateMachine */ Bld.getInt1(1), + /* RequiresOMPRuntime */ + Bld.getInt1(mayNeedRuntimeSupport()), + /* RequiresDataSharing */ Bld.getInt1(mayPerformDataSharing())}; + llvm::CallInst *InitCI = CGF.EmitRuntimeCall( + getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_init), + Args); + + llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(InitCI, Bld.getInt8(1)); + + Bld.CreateCondBr(ExecuteCnd, ExecuteBB, ExitBB); + CGF.EmitBlock(ExecuteBB); +} + +void CGOpenMPRuntimeTRegion::emitKernelFooter(CodeGenFunction &CGF, + llvm::BasicBlock *ExitBB) { + if (!CGF.HaveInsertPoint()) + return; + + llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); + CGF.EmitBranch(OMPDeInitBB); + + CGF.EmitBlock(OMPDeInitBB); + + CGBuilderTy &Bld = CGF.Builder; + // DeInitialize the OMP state in the runtime; called by all active threads. + llvm::Value *Args[] = {/* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()), + /* RequiredOMPRuntime */ + Bld.getInt1(mayNeedRuntimeSupport())}; + + CGF.EmitRuntimeCall(getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_deinit), + Args); + + CGF.EmitBranch(ExitBB); + CGF.EmitBlock(ExitBB); +} + +void CGOpenMPRuntimeTRegion::emitTargetOutlinedFunction( + const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { + if (!IsOffloadEntry) // Nothing to do. + return; + + assert(!ParentName.empty() && "Invalid target region parent name!"); + + emitKernel(D, ParentName, OutlinedFn, OutlinedFnID, CodeGen); + + // Create a unique global variable to indicate the execution mode of this + // target region. The execution mode is either 'non-SPMD' or 'SPMD'. Initially + // all regions are executed in non-SPMD mode. This variable is picked up by + // the offload library to setup the device appropriately before kernel launch. 
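+  //
+  // As a sketch, for a kernel entry function @<name> the emitted global is
+  // expected to look roughly like
+  //
+  //   @<name>_exec_mode = weak constant i8 1
+  //
+  // where 1 encodes non-SPMD (generic) mode and 0 encodes SPMD mode.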
+ auto *GVMode = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /* isConstant */ true, + llvm::GlobalValue::WeakAnyLinkage, llvm::ConstantInt::get(CGM.Int8Ty, 1), + Twine(OutlinedFn->getName(), "_exec_mode")); + CGM.addCompilerUsedGlobal(GVMode); +} + +CGOpenMPRuntimeTRegion::CGOpenMPRuntimeTRegion(CodeGenModule &CGM) + : CGOpenMPRuntime(CGM, "_", "$") { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("TRegion code generation does only handle device code!"); +} + +void CGOpenMPRuntimeTRegion::emitProcBindClause( + CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, + SourceLocation Loc) { + CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); +} + +void CGOpenMPRuntimeTRegion::emitNumThreadsClause(CodeGenFunction &CGF, + llvm::Value *NumThreads, + SourceLocation Loc) { + CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); +} + +void CGOpenMPRuntimeTRegion::emitNumTeamsClause(CodeGenFunction &CGF, + const Expr *NumTeams, + const Expr *ThreadLimit, + SourceLocation Loc) { + // Nothing to do for kernel mode, no other modes supported yet. +} + +llvm::Function *CGOpenMPRuntimeTRegion::emitParallelOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + // Emit target region as a standalone region. + llvm::Function *OutlinedFun = + cast(CGOpenMPRuntime::emitParallelOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen)); + + createParallelDataSharingWrapper(OutlinedFun, D); + + return OutlinedFun; +} + +// TODO: Check if this is actually needed. +static const ValueDecl *getUnderlyingVar(const Expr *E) { + E = E->IgnoreParens(); + if (const auto *ASE = dyn_cast(E)) { + const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + E = Base; + } else if (auto *OASE = dyn_cast(E)) { + const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempOASE = dyn_cast(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + E = Base; + } + E = E->IgnoreParenImpCasts(); + if (const auto *DE = dyn_cast(E)) + return cast(DE->getDecl()->getCanonicalDecl()); + const auto *ME = cast(E); + return cast(ME->getMemberDecl()->getCanonicalDecl()); +} + +void CGOpenMPRuntimeTRegion::createParallelDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { + ASTContext &Ctx = CGM.getContext(); + const auto &CS = *D.getCapturedStmt(OMPD_parallel); + + // Create a function that takes as argument the source thread. 
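+  // As a rough sketch (names illustrative, not the exact emitted code), the
+  // wrapper built below has the signature
+  //   void <outlined>_wrapper(void *shared_vars, void *private_vars);
+  // and unpacks the two payload structs before forwarding the values to the
+  // outlined parallel function:
+  //   <outlined>(/* gtid */ &tid, /* btid */ &zero,
+  //              &((shared_ty *)shared_vars)->v0,
+  //              ((private_ty *)private_vars)->v1, ...);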
+ FunctionArgList WrapperArgs; + ImplicitParamDecl SharedVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(), + /* Id */ nullptr, Ctx.VoidPtrTy, + ImplicitParamDecl::Other); + ImplicitParamDecl PrivateVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(), + /* Id */ nullptr, Ctx.VoidPtrTy, + ImplicitParamDecl::Other); + WrapperArgs.emplace_back(&SharedVarsArgDecl); + WrapperArgs.emplace_back(&PrivateVarsArgDecl); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); + + auto *WrapperFn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), WrapperFn, CGFI); + + OutlinedParallelFn->setLinkage(llvm::GlobalValue::InternalLinkage); + OutlinedParallelFn->setDoesNotRecurse(); + WrapperFn->setLinkage(llvm::GlobalValue::InternalLinkage); + WrapperFn->setDoesNotRecurse(); + + CodeGenFunction CGF(CGM, /* suppressNewContext */ true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WrapperFn, CGFI, WrapperArgs, + D.getBeginLoc(), D.getBeginLoc()); + + auto AI = WrapperFn->arg_begin(); + llvm::Argument &SharedVarsArg = *(AI++); + llvm::Argument &PrivateVarsArg = *(AI); + SharedVarsArg.setName("shared_vars"); + PrivateVarsArg.setName("private_vars"); + + Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( + /* DestWidth */ 32, /* Signed */ 1), + /* Name */ ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/* C */ 0)); + + setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true); + + // Create the array of arguments and fill it with boilerplate values. + SmallVector Args; + Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer()); + Args.emplace_back(ZeroAddr.getPointer()); + + CGBuilderTy &Bld = CGF.Builder; + + // Collect all variables marked as shared. + llvm::SmallPtrSet SharedVars; + for (const auto *C : D.getClausesOfKind()) + for (const Expr *E : C->getVarRefs()) + SharedVars.insert(getUnderlyingVar(E)); + + // Retrieve the shared and private variables from argument pointers and pass + // them to the outlined function. + llvm::SmallVector SharedStructMemberTypes; + llvm::SmallVector PrivateStructMemberTypes; + + WrapperInfo &WI = WrapperInfoMap[OutlinedParallelFn]; + WI.WrapperFn = WrapperFn; + + auto ArgIt = OutlinedParallelFn->arg_begin() + /* global_tid & bound_tid */ 2; + + // If we require loop bounds they are already part of the outlined function + // encoding, just after global_tid and bound_tid. + bool RequiresLoopBounds = + isOpenMPLoopBoundSharingDirective(D.getDirectiveKind()); + if (RequiresLoopBounds) { + // Register the lower bound in the wrapper info. + WI.CaptureIsPrivate.push_back(true); + PrivateStructMemberTypes.push_back((ArgIt++)->getType()); + // Register the upper bound in the wrapper info. + WI.CaptureIsPrivate.push_back(true); + PrivateStructMemberTypes.push_back((ArgIt++)->getType()); + } + + auto CIt = CS.capture_begin(); + for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CIt) { + bool IsPrivate = CIt->capturesVariableArrayType() || + CIt->capturesVariableByCopy() || + !SharedVars.count(CIt->getCapturedVar()); + WI.CaptureIsPrivate.push_back(IsPrivate); + + auto &StructMemberTypes = + IsPrivate ? 
PrivateStructMemberTypes : SharedStructMemberTypes; + llvm::Type *ArgTy = (ArgIt++)->getType(); + if (!IsPrivate) { + assert(ArgTy->isPointerTy()); + ArgTy = ArgTy->getPointerElementType(); + } + StructMemberTypes.push_back(ArgTy); + } + + // Verify the position of the outlined function argument iterator as a sanity + // check. + assert(ArgIt == OutlinedParallelFn->arg_end() && + "Not all arguments have been processed!"); + + llvm::Value *SharedVarsStructPtr = nullptr; + llvm::Value *PrivateVarsStructPtr = nullptr; + llvm::LLVMContext &LLVMCtx = OutlinedParallelFn->getContext(); + if (!PrivateStructMemberTypes.empty()) { + WI.PrivateVarsStructTy = llvm::StructType::create( + LLVMCtx, PrivateStructMemberTypes, "omp.private.struct"); + PrivateVarsStructPtr = Bld.CreateBitCast( + &PrivateVarsArg, WI.PrivateVarsStructTy->getPointerTo()); + } + if (!SharedStructMemberTypes.empty()) { + WI.SharedVarsStructTy = llvm::StructType::create( + LLVMCtx, SharedStructMemberTypes, "omp.shared.struct"); + SharedVarsStructPtr = Bld.CreateBitCast( + &SharedVarsArg, WI.SharedVarsStructTy->getPointerTo()); + } + + assert(WI.CaptureIsPrivate.size() + /* global_tid & bound_tid */ 2 == + OutlinedParallelFn->arg_size() && + "Not all arguments have been processed!"); + + unsigned PrivateIdx = 0, SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + + llvm::Value *StructPtr = + IsPrivate ? PrivateVarsStructPtr : SharedVarsStructPtr; + unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx; + + // TODO: Figure out the real alignment + if (IsPrivate) { + Args.emplace_back( + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1)); + } else { + llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++); + Args.emplace_back(GEP); + } + } + + assert(Args.size() == OutlinedParallelFn->arg_size()); + emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); + + CGF.FinishFunction(); + + clearLocThreadIdInsertPt(CGF); +} + +void CGOpenMPRuntimeTRegion::emitParallelCall( + CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *Fn, + ArrayRef CapturedVars, const Expr *IfCond) { + if (!CGF.HaveInsertPoint()) + return; + + const WrapperInfo &WI = WrapperInfoMap[Fn]; + + auto &&ParGen = [this, CapturedVars, WI](CodeGenFunction &CGF, + PrePostActionTy &) { + CGBuilderTy &Bld = CGF.Builder; + assert(WI.WrapperFn && "Wrapper function does not exist!"); + + llvm::Value *SharedVarsSize = llvm::Constant::getNullValue(CGM.Int16Ty); + llvm::Value *PrivateVarsSize = SharedVarsSize; + llvm::Value *SharedStructAlloca = llvm::UndefValue::get(CGM.VoidPtrTy); + llvm::Value *PrivateStructAlloca = SharedStructAlloca; + + if (WI.SharedVarsStructTy) { + SharedStructAlloca = CGF.CreateDefaultAlignTempAlloca( + WI.SharedVarsStructTy, ".shared.vars") + .getPointer(); + const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout(); + SharedVarsSize = Bld.getInt16(DL.getTypeAllocSize(WI.SharedVarsStructTy)); + } + if (WI.PrivateVarsStructTy) { + PrivateStructAlloca = CGF.CreateDefaultAlignTempAlloca( + WI.PrivateVarsStructTy, ".private.vars") + .getPointer(); + const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout(); + PrivateVarsSize = + Bld.getInt16(DL.getTypeAllocSize(WI.PrivateVarsStructTy)); + } + + llvm::SmallVector Args; + Args.push_back( + /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode())); + Args.push_back( + /* RequiredOMPRuntime */ Bld.getInt1(mayNeedRuntimeSupport())); + Args.push_back(WI.WrapperFn); + 
Args.push_back(CGF.EmitCastToVoidPtr(SharedStructAlloca)); + Args.push_back(SharedVarsSize); + Args.push_back(CGF.EmitCastToVoidPtr(PrivateStructAlloca)); + Args.push_back(PrivateVarsSize); + Args.push_back( + /* SharedPointers */ Bld.getInt1(0)); + + assert((CapturedVars.empty() || + (WI.SharedVarsStructTy || WI.PrivateVarsStructTy)) && + "Expected the shared or private struct type to be set if variables " + "are captured!"); + assert((CapturedVars.empty() || + CapturedVars.size() == + (WI.SharedVarsStructTy ? WI.SharedVarsStructTy->getNumElements() + : 0) + + (WI.PrivateVarsStructTy + ? WI.PrivateVarsStructTy->getNumElements() + : 0)) && + "# elements in shared struct types should be number of captured " + "variables!"); + + // Store all captured variables into a single local structure that is then + // passed to the runtime library. + unsigned PrivateIdx = 0, SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + + llvm::Value *StructPtr = + IsPrivate ? PrivateStructAlloca : SharedStructAlloca; + unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx; + llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++); + llvm::Value *Var = IsPrivate ? CapturedVars[i] + : Bld.CreateAlignedLoad(CapturedVars[i], 1); + Bld.CreateDefaultAlignedStore(Var, GEP); + } + + CGF.EmitRuntimeCall( + getOrCreateRuntimeFunctionDeclaration( + *this, CGM, OMPRTL__kmpc_target_region_kernel_parallel), + Args); + + SharedIdx = 0; + for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) { + bool IsPrivate = WI.CaptureIsPrivate[i]; + if (IsPrivate) + continue; + + llvm::Value *GEP = Bld.CreateStructGEP(SharedStructAlloca, SharedIdx++); + llvm::Value *Var = Bld.CreateAlignedLoad(GEP, 1); + Bld.CreateDefaultAlignedStore(Var, CapturedVars[i]); + } + }; + + auto &&SeqGen = [this, &ParGen, Loc](CodeGenFunction &CGF, + PrePostActionTy &Action) { + // Use an artifical "num_threads(1)" clause to force sequential execution if + // the expression in the 'if clause' evaluated to false. We expect the + // middle-end to clean this up. + emitNumThreadsClause(CGF, CGF.Builder.getInt32(/* C */ 1), Loc); + ParGen(CGF, Action); + }; + + if (IfCond) { + emitOMPIfClause(CGF, IfCond, ParGen, SeqGen); + } else { + CodeGenFunction::RunCleanupsScope Scope(CGF); + RegionCodeGenTy ThenRCG(ParGen); + ThenRCG(CGF); + } +} + +llvm::Function *CGOpenMPRuntimeTRegion::emitTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + // Emit target region as a standalone region. 
+ llvm::Function *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen); + + return OutlinedFunVal; +} + +void CGOpenMPRuntimeTRegion::emitTeamsCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, + llvm::Function *OutlinedFn, ArrayRef CapturedVars) { + if (!CGF.HaveInsertPoint()) + return; + + Address ZeroAddr = CGF.CreateMemTemp( + CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), + /*Name*/ ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + llvm::SmallVector OutlinedFnArgs; + OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); + OutlinedFnArgs.push_back(ZeroAddr.getPointer()); + OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); + emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); +} + +void CGOpenMPRuntimeTRegion::emitCriticalRegion( + CodeGenFunction &CGF, StringRef CriticalName, + const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, + const Expr *Hint) { + llvm_unreachable( + "TODO: TRegion code generation does not support critical regions yet!"); +} + +void CGOpenMPRuntimeTRegion::emitReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps, ReductionOptionsTy Options) { + llvm_unreachable( + "TODO: TRegion code generation does not support reductions yet!"); +} + +void CGOpenMPRuntimeTRegion::emitFunctionProlog(CodeGenFunction &CGF, + const Decl *D) {} + +void CGOpenMPRuntimeTRegion::functionFinished(CodeGenFunction &CGF) { + CGOpenMPRuntime::functionFinished(CGF); +} + +void CGOpenMPRuntimeTRegion::getDefaultDistScheduleAndChunk( + CodeGenFunction &CGF, const OMPLoopDirective &S, + OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { + CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk); +} + +void CGOpenMPRuntimeTRegion::getDefaultScheduleAndChunk( + CodeGenFunction &CGF, const OMPLoopDirective &S, + OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const { + ScheduleKind = OMPC_SCHEDULE_static; + // Chunk size is 1 in this case. + llvm::APInt ChunkSize(32, 1); + ChunkExpr = IntegerLiteral::Create( + CGF.getContext(), ChunkSize, + CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ 0), + SourceLocation()); +} + +// ------------------------------------------------------------------------ // +// TODO: The following cuda specific part should live somewhere else, +// potentially in a derived class. + +void CGOpenMPRuntimeTRegion::createOffloadEntry( + llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t, + llvm::GlobalValue::LinkageTypes) { + // TODO: Add support for global variables on the device after declare target + // support. 
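+  //
+  // For a kernel function @<name> this adds an entry to the "nvvm.annotations"
+  // named metadata which marks the function as a kernel for the NVPTX backend,
+  // roughly (sketch):
+  //
+  //   !nvvm.annotations = !{!0}
+  //   !0 = !{void (...)* @<name>, !"kernel", i32 1}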
+ if (!isa(Addr)) + return; + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); + + // Get "nvvm.annotations" metadata node + llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + + llvm::Metadata *MDVals[] = { + llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"), + llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; + // Append metadata to nvvm.annotations + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); +} + +#include "clang/Basic/Cuda.h" + +// Get current CudaArch and ignore any unknown values +static CudaArch getCudaArch(CodeGenModule &CGM) { + if (!CGM.getTarget().hasFeature("ptx")) + return CudaArch::UNKNOWN; + llvm::StringMap Features; + CGM.getTarget().initFeatureMap(Features, CGM.getDiags(), + CGM.getTarget().getTargetOpts().CPU, + CGM.getTarget().getTargetOpts().Features); + for (const auto &Feature : Features) { + if (Feature.getValue()) { + CudaArch Arch = StringToCudaArch(Feature.getKey()); + if (Arch != CudaArch::UNKNOWN) + return Arch; + } + } + return CudaArch::UNKNOWN; +} + +/// Check to see if target architecture supports unified addressing which is +/// a restriction for OpenMP requires clause "unified_shared_memory". +void CGOpenMPRuntimeTRegion::checkArchForUnifiedAddressing( + CodeGenModule &CGM, const OMPRequiresDecl *D) const { + for (const OMPClause *Clause : D->clauselists()) { + if (Clause->getClauseKind() == OMPC_unified_shared_memory) { + switch (getCudaArch(CGM)) { + case CudaArch::SM_20: + case CudaArch::SM_21: + case CudaArch::SM_30: + case CudaArch::SM_32: + case CudaArch::SM_35: + case CudaArch::SM_37: + case CudaArch::SM_50: + case CudaArch::SM_52: + case CudaArch::SM_53: + case CudaArch::SM_60: + case CudaArch::SM_61: + case CudaArch::SM_62: + CGM.Error(Clause->getBeginLoc(), + "Target architecture does not support unified addressing"); + return; + case CudaArch::SM_70: + case CudaArch::SM_72: + case CudaArch::SM_75: + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + case CudaArch::GFX904: + case CudaArch::GFX906: + case CudaArch::GFX909: + case CudaArch::UNKNOWN: + break; + case CudaArch::LAST: + llvm_unreachable("Unexpected Cuda arch."); + } + } + } +} diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -69,6 +69,7 @@ CGOpenCLRuntime.cpp CGOpenMPRuntime.cpp CGOpenMPRuntimeNVPTX.cpp + CGOpenMPRuntimeTRegion.cpp CGRecordLayoutBuilder.cpp CGStmt.cpp CGStmtOpenMP.cpp diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -20,6 +20,7 @@ #include "CGOpenCLRuntime.h" #include "CGOpenMPRuntime.h" #include "CGOpenMPRuntimeNVPTX.h" +#include "CGOpenMPRuntimeTRegion.h" #include "CodeGenFunction.h" #include "CodeGenPGO.h" #include "ConstantEmitter.h" @@ -67,6 +68,11 @@ llvm::cl::desc("Emit limited coverage mapping information (experimental)"), llvm::cl::init(false)); +static llvm::cl::opt UseGenericTRegionInterface( + "openmp-tregion-runtime", llvm::cl::ZeroOrMore, llvm::cl::Hidden, + llvm::cl::desc("Use the generic target region OpenMP runtime 
interface"), + llvm::cl::init(false)); + static const char AnnotationSection[] = "llvm.metadata"; static CGCXXABI *createCXXABI(CodeGenModule &CGM) { @@ -206,7 +212,10 @@ case llvm::Triple::nvptx64: assert(getLangOpts().OpenMPIsDevice && "OpenMP NVPTX is only prepared to deal with device code."); - OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); + if (UseGenericTRegionInterface) + OpenMPRuntime.reset(new CGOpenMPRuntimeTRegion(*this)); + else + OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); break; default: if (LangOpts.OpenMPSimd) diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -287,6 +287,7 @@ void initializeNameAnonGlobalLegacyPassPass(PassRegistry&); void initializeNaryReassociateLegacyPassPass(PassRegistry&); void initializeNewGVNLegacyPassPass(PassRegistry&); +void initializeOpenMPOptLegacyPass(PassRegistry&); void initializeObjCARCAAWrapperPassPass(PassRegistry&); void initializeObjCARCAPElimPass(PassRegistry&); void initializeObjCARCContractPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -143,6 +143,7 @@ (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); (void) llvm::createNaryReassociatePass(); + (void) llvm::createOpenMPOptLegacyPass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); (void) llvm::createObjCARCExpandPass(); diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -156,6 +156,11 @@ /// ModulePass *createIPConstantPropagationPass(); +//===----------------------------------------------------------------------===// +/// createOpenMPOpt - This pass performs OpenMP specific optimizations. +/// +Pass *createOpenMPOptLegacyPass(); + //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -25,6 +25,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); + initializeOpenMPOptLegacyPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,784 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// OpenMP specific optimizations +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +#define DEBUG_TYPE "openmp-opt" + +static cl::opt BuildCustomStateMachines( + "openmp-opt-kernel-state-machines", cl::ZeroOrMore, + cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden, + cl::init(true)); + +static cl::opt PerformOpenMPSIMDIZATION( + "openmp-opt-kernel-simdization", cl::ZeroOrMore, + cl::desc("Convert non-SPMD kernels to SPMD mode if possible."), cl::Hidden, + cl::init(true)); + +static cl::opt ForceOpenMPSIMDIZATION( + "openmp-opt-kernel-force-simdization", cl::ZeroOrMore, + cl::desc("Force execution of non-SPMD kernels in SPMD mode."), cl::Hidden, + cl::init(false)); + +STATISTIC(NumKernelsConvertedToSPMD, + "Number of GPU kernels converted to SPMD mode"); +STATISTIC(NumParallelCallsConvertedToSPMD, + "Number of parallel GPU kernel regions converted to SPMD mode"); +STATISTIC(NumKernelsNonSPMDNoParallelism, + "Number of GPU kernel in non-SPMD mode without parallelism"); +STATISTIC(NumCustomStateMachinesCreated, + "Number of custom GPU kernel non-SPMD mode state machines created"); +STATISTIC(NumCustomStateMachinesNoFallback, + "Number of custom GPU kernel non-SPMD mode state machines without " + "fallback"); + +namespace { + +/// Set of constants that describe the positions of arguments (ARG_FN_NAME) and +/// the meaning of return values (RET_FN_MEANING) for the target region kernel +/// interface. Has to be kept in sync with +/// openmp/libomptarget/deviceRTLs/common/target_region.h +/// and the respective implementations. +enum { + ARG_INIT_USE_SPMD_MODE = 0, + ARG_INIT_REQUIRES_OMP_RUNTIME = 1, + ARG_INIT_USE_STATE_MACHINE = 2, + ARG_INIT_REQUIRES_DATA_SHARING = 3, + + ARG_DEINIT_USE_SPMD_MODE = 0, + ARG_DEINIT_REQUIRES_OMP_RUNTIME = 1, + + ARG_PARALLEL_USE_SPMD_MODE = 0, + ARG_PARALLEL_REQUIRES_OMP_RUNTIME = 1, + ARG_PARALLEL_WORK_FUNCTION = 2, + ARG_PARALLEL_SHARED_VARS = 3, + ARG_PARALLEL_SHARED_VARS_BYTES = 4, + + RET_INIT_IS_WORKER = -1, + RET_INIT_IS_SURPLUS = 0, + RET_INIT_IS_MASTER = 1, +}; + +/// A macro list to represent known functions from the omp, __kmpc, and target +/// region interfaces. The first value is an enum identifier, see FunctionID, +/// the second value is the function name, and the third the expected number of +/// arguments. 
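+///
+/// As an example of how the list is used: with
+///   #define KF(NAME, STR, NARGS) NAME,
+/// it expands to the enumerators of FunctionID below, and with
+///   #define KF(NAME, STR, NARGS) .Case(STR, NAME)
+/// it expands to the cases of the StringSwitch in getFunctionID.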
+#define KNOWN_FUNCTIONS() \ + KF(FID_OMP_GET_TEAM_NUM, "omp_get_team_num", 0) \ + KF(FID_OMP_GET_NUM_TEAMS, "omp_get_num_teams", 0) \ + KF(FID_OMP_GET_THREAD_NUM, "omp_get_thread_num", 0) \ + KF(FID_OMP_GET_NUM_THREADS, "omp_get_num_threads", 0) \ + KF(FID_OMP_SET_NUM_THREADS, "omp_set_num_threads", 1) \ + KF(FID_KMPC_TREGION_KERNEL_INIT, "__kmpc_target_region_kernel_init", 4) \ + KF(FID_KMPC_TREGION_KERNEL_DEINIT, "__kmpc_target_region_kernel_deinit", 2) \ + KF(FID_KMPC_TREGION_KERNEL_PARALLEL, "__kmpc_target_region_kernel_parallel", \ + 8) \ + KF(FID_KMPC_FOR_STATIC_INIT_4, "__kmpc_for_static_init_4", 9) \ + KF(FID_KMPC_FOR_STATIC_FINI, "__kmpc_for_static_fini", 2) \ + KF(FID_KMPC_GLOBAL_THREAD_NUM, "__kmpc_global_thread_num", 1) \ + KF(FID_KMPC_DISPATCH_INIT_4, "__kmpc_dispatch_init_4", 7) \ + KF(FID_KMPC_DISPATCH_NEXT_4, "__kmpc_dispatch_next_4", 6) + +/// An identifier enum for each known function as well as the different kinds +/// of unknown functions we distinguish. +enum FunctionID { +#define KF(NAME, STR, NARGS) NAME, + KNOWN_FUNCTIONS() +#undef KF + // Unknown functions + //{ + FID_KMPC_UNKNOWN, ///< unknown __kmpc_XXXX function + FID_OMP_UNKOWN, ///< unknown omp_XXX function + FID_NVVM_UNKNOWN, ///< unknown llvm.nvvm.XXX function + FID_LLVM_UNKNOWN, ///< unknown llvm.XXX function + FID_UNKNOWN ///< unknown function without known prefix. + //} +}; + +static FunctionID getFunctionID(Function *F) { + if (!F) + return FID_UNKNOWN; +#define KF(NAME, STR, NARGS) .Case(STR, NAME) + return StringSwitch(F->getName()) KNOWN_FUNCTIONS() + .StartsWith("__kmpc_", FID_KMPC_UNKNOWN) + .StartsWith("omp_", FID_OMP_UNKOWN) + .StartsWith("llvm.nvvm.", FID_NVVM_UNKNOWN) + .StartsWith("llvm.", FID_LLVM_UNKNOWN) + .Default(FID_UNKNOWN); +#undef KF +} + +static Type *getOrCreateStructIdentTypePtr(Module &M) { + // TODO create if not present! + return M.getTypeByName("struct.ident_t")->getPointerTo(); +} + +// TODO: Simplify function declaration +static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) { + Function *Fn = M.getFunction(Name); + if (!Fn) { + FunctionType *FType = FunctionType::get(RT, {}, false); + Fn = + Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M); + } + return Fn; +} +static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name, + Module &M) { + Function *Fn = M.getFunction(Name); + if (!Fn) { + FunctionType *FType = FunctionType::get(RT, {T0, T1}, false); + Fn = + Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M); + } + return Fn; +} + +static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) { + static const char *Name = "__kmpc_barrier_simple_spmd"; + Function *Fn = M.getFunction(Name); + if (!Fn) { + LLVMContext &Ctx = M.getContext(); + FunctionType *FType = FunctionType::get( + Type::getVoidTy(Ctx), + {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false); + Fn = + Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M); + } + return Fn; +} + +/// A helper class to introduce smart guarding code. +struct GuardGenerator { + + /// Inform the guard generator about the side-effect instructions collected in + /// @p SideEffectInst. + /// + /// \Returns True if all registered side-effects can be (efficiently) guarded. 
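+  ///
+  /// "Guarding" means that, once the kernel runs in SPMD mode, the side effect
+  /// is executed by a single thread of the team only. Conceptually (a sketch,
+  /// guards are not emitted yet, see introduceGuards):
+  ///
+  ///   if (omp_get_thread_num() == 0) { <side-effect instruction> }
+  ///   __kmpc_barrier_simple_spmd(...);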
+ bool registerSideEffects(SmallVectorImpl &SideEffectInst) { + bool Guarded = true; + if (SideEffectInst.empty()) + return Guarded; + + const Module &M = *SideEffectInst.front()->getModule(); + const DataLayout &DL = M.getDataLayout(); + + SmallVector UnguardedSideEffectInst; + for (Instruction *I : SideEffectInst) { + if (CallInst *CI = dyn_cast(I)) { + if (getFunctionID(CI->getCalledFunction()) != FID_UNKNOWN) + continue; + } else if (StoreInst *SI = dyn_cast(I)) { + if (isa( + SI->getPointerOperand()->stripInBoundsConstantOffsets())) + continue; + } else if (LoadInst *LI = dyn_cast(I)) { + if (isSafeToLoadUnconditionally(LI->getPointerOperand(), + LI->getAlignment(), DL)) + continue; + } + LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n"); + UnguardedSideEffectInst.push_back(I); + } + + return UnguardedSideEffectInst.empty(); + } + + bool registerReadEffects(SmallVectorImpl &ReadEffectInst) { + return registerSideEffects(ReadEffectInst); + } + + void introduceGuards() { + // TODO: The guard generator cannot introduce guards yet but the registerXXX + // functions above are aware of that! + } +}; + +/// Helper structure to represent and work with a target region kernel. +struct KernelTy { + + KernelTy(Function *KernelFn) : KernelFn(*KernelFn) {} + + /// Optimize this kernel, return true if something was done. + bool optimize(); + +private: + /// Analyze this kernel, return true if successful. + bool analyze(Function &F, SmallPtrSetImpl &Visited, + bool InParallelRegion); + + /// Return true if the kernel is executed in SPMD mode. + bool isExecutedInSPMDMode(); + + /// Convert a non-SPMD mode kernel to SPMD mode, return true if successful. + bool convertToSPMD(); + + /// Create a custom state machine in the module, return true if successful. + bool createCustomStateMachine(); + + /// All side-effect instructions potentially executed in this kernel. + SmallVector SideEffectInst; + + /// All read-only instructions potentially executed in this kernel. + SmallVector ReadOnlyInst; + + /// All non-analyzed calls contained in this kernel. They are separated by + /// their function ID which describes identifies known calls. + SmallVector KernelCalls[FID_UNKNOWN + 1]; + + /// All non-analyzed calls contained in parallel regions which are part of + /// this kernel. They are separated by their function ID which describes + /// identifies known calls. + SmallVector ParallelRegionCalls[FID_UNKNOWN + 1]; + + /// The entry function of this kernel. + Function &KernelFn; +}; + +bool KernelTy::analyze(Function &F, SmallPtrSetImpl &Visited, + bool InParallelRegion) { + if (!Visited.insert(&F).second) + return true; + + LLVM_DEBUG(dbgs() << "Analyze " + << (InParallelRegion ? "parallel-region" : "kernel") + << " function: " << F.getName() << "\n"); + + // Determine where we remember the call. + auto &CallsArray = InParallelRegion ? ParallelRegionCalls : KernelCalls; + + for (Instruction &I : instructions(&F)) { + + // In parallel regions we only look for calls, outside, we look for all + // side-effect and read-only instructions. + if (!InParallelRegion) { + // Handle non-side-effect instructions first. These will not write or + // throw which makes reading the only interesting potential property. + if (!I.mayHaveSideEffects()) { + if (I.mayReadFromMemory()) { + LLVM_DEBUG(dbgs() << "- read-only: " << I << "\n"); + ReadOnlyInst.push_back(&I); + } + continue; + } + + // Now we handle all non-call instructions. 
+ if (!isa(I)) { + LLVM_DEBUG(dbgs() << "- side-effect: " << I << "\n"); + SideEffectInst.push_back(&I); + continue; + } + } + + if (!isa(I)) + continue; + + CallInst &CI = cast(I); + Function *Callee = CI.getCalledFunction(); + + // For exact definitions we recurs. + if (Callee && !Callee->isDeclaration() && Callee->isDefinitionExact()) { + // If recursive analysis failed we bail, otherwise the + // information was collected in the internal state. + if (!analyze(*Callee, Visited, InParallelRegion)) + return false; + continue; + } + + // Check that know functions have the right number of arguments early on. + // Additionally provide debug output based on the function ID. + FunctionID ID = getFunctionID(Callee); + + switch (ID) { +#define KF(NAME, STR, NARGS) \ + case NAME: \ + LLVM_DEBUG( \ + dbgs() << "- known call " \ + << (CI.getNumArgOperands() != NARGS ? "[#arg missmatch!]" : "") \ + << ": " << I << "\n"); \ + if (CI.getNumArgOperands() != NARGS) \ + ID = FID_UNKNOWN; \ + break; + KNOWN_FUNCTIONS() +#undef KF + case FID_KMPC_UNKNOWN: + LLVM_DEBUG(dbgs() << "- unknown __kmpc_* call: " << I << "\n"); + break; + case FID_OMP_UNKOWN: + LLVM_DEBUG(dbgs() << "- unknown omp_* call: " << I << "\n"); + break; + case FID_NVVM_UNKNOWN: + LLVM_DEBUG(dbgs() << "- unknown llvm.nvvm.* call: " << I << "\n"); + break; + case FID_LLVM_UNKNOWN: + LLVM_DEBUG(dbgs() << "- unknown llvm.* call: " << I << "\n"); + break; + case FID_UNKNOWN: + LLVM_DEBUG(dbgs() << "- unknown call: " << I << "\n"); + break; + } + + CallsArray[ID].push_back(&CI); + } + + // If we did not analyze the kernel function but some other one down the call + // chain we are done now. + // TODO: Add more verification code here. + if (&F != &KernelFn) + return true; + + assert(&KernelCalls == &CallsArray); + + // If we are analyzing the kernel function we need to verify we have at least + // the calls we expect to see in the right places. + if (KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() != 1 || + KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() != 1 || + KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getParent() != + &F.getEntryBlock()) { + LLVM_DEBUG(dbgs() << "- malformed kernel: [#Init: " + << KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() + << "][#DeInit: " + << KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() + << "]\n"); + return false; + } + + return true; +} + +bool KernelTy::isExecutedInSPMDMode() { + assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 && + "Non-canonical kernel form!"); + auto *SPMDFlag = cast( + KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getArgOperand(0)); + assert(SPMDFlag->isZeroValue() || SPMDFlag->isOneValue()); + return SPMDFlag->isOneValue(); +} + +bool KernelTy::optimize() { + bool Changed = false; + + // First analyze the code. If that fails for some reason we bail out early. 
+ SmallPtrSet Visited; + if (!analyze(KernelFn, Visited, /* InParallelRegion */ false)) + return Changed; + + Visited.clear(); + for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) { + Value *ParCIParallelFnArg = + ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION); + Function *ParallelFn = + dyn_cast(ParCIParallelFnArg->stripPointerCasts()); + if (!ParallelFn || + !analyze(*ParallelFn, Visited, /* InParallelRegion */ true)) + return Changed; + } + + Changed |= convertToSPMD(); + Changed |= createCustomStateMachine(); + + return Changed; +} + +bool KernelTy::convertToSPMD() { + if (isExecutedInSPMDMode()) + return false; + + bool Changed = false; + + // Use a generic guard generator to determine if suitable guards for all + // side effect instructions can be placed. + GuardGenerator GG; + + // Check if SIMDIZATION is possible, in case it is not forced. + if (!ForceOpenMPSIMDIZATION) { + // Unknown calls are not handled yet and will cause us to bail. + if (!KernelCalls[FID_UNKNOWN].empty()) + return Changed; + + // If we cannot guard all side effect instructions bail out. + if (!GG.registerSideEffects(SideEffectInst)) + return Changed; + + if (!GG.registerReadEffects(ReadOnlyInst)) + return Changed; + + // TODO: Emit a remark. + LLVM_DEBUG(dbgs() << "Transformation to SPMD OK\n"); + + // If we disabled SIMDIZATION we only emit the debug message and bail. + if (!PerformOpenMPSIMDIZATION) + return Changed; + } + + // Actually emit the guard code after we decided to perform SIMDIZATION. + GG.introduceGuards(); + + // Create an "is-SPMD" flag. + Type *FlagTy = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0] + ->getArgOperand(ARG_INIT_USE_SPMD_MODE) + ->getType(); + Constant *SPMDFlag = ConstantInt::getTrue(FlagTy); + + // Update the init and deinit calls with the "is-SPMD" flag to indicate + // SPMD mode. + assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 && + "Non-canonical kernel form!"); + assert(KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() == 1 && + "Non-canonical kernel form!"); + KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]->setArgOperand( + ARG_INIT_USE_SPMD_MODE, SPMDFlag); + KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT][0]->setArgOperand( + ARG_DEINIT_USE_SPMD_MODE, SPMDFlag); + + // Use the simple barrier to synchronize all threads in SPMD mode after each + // parallel region. + Function *SimpleBarrierFn = + getOrCreateSimpleSPMDBarrierFn(*KernelFn.getParent()); + + // For each parallel region, identified by the + // __kmpc_target_region_kernel_parallel call, we set the "is-SPMD" flag and + // introduce a succeeding barrier call. + for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) { + ParCI->setArgOperand(ARG_PARALLEL_USE_SPMD_MODE, SPMDFlag); + auto AI = SimpleBarrierFn->arg_begin(); + CallInst::Create(SimpleBarrierFn, + {Constant::getNullValue((AI++)->getType()), + Constant::getNullValue((AI)->getType())}, + "", ParCI->getNextNode()); + } + + // TODO: serialize nested parallel regions + + // Finally, we change the global exec_mode variable to indicate SPMD mode. 
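+  //
+  // That is, the execution mode global emitted by the Clang TRegion codegen is
+  // rewritten roughly as follows (sketch):
+  //
+  //   @<kernel>_exec_mode = weak constant i8 1   ; generic (non-SPMD)
+  //     ==>
+  //   @<kernel>_exec_mode = weak constant i8 0   ; SPMD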
+ GlobalVariable *ExecMode = KernelFn.getParent()->getGlobalVariable( + (KernelFn.getName() + "_exec_mode").str()); + assert(ExecMode && + "Assumed to find an execution mode hint among the globals"); + assert(ExecMode->getInitializer()->isOneValue() && + "Assumed target_region execution mode prior to 'SPMD'-zation"); + ExecMode->setInitializer( + Constant::getNullValue(ExecMode->getInitializer()->getType())); + + // Bookkeeping + NumKernelsConvertedToSPMD++; + NumParallelCallsConvertedToSPMD += + KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size(); + + return Changed; +} + +bool KernelTy::createCustomStateMachine() { + if (isExecutedInSPMDMode()) + return false; + + // TODO: Warn or eliminate the offloading if no parallel regions are present. + // TODO: Use reachability to eliminate the loop and if-cascade + // + // The user module code looks as follows if this function returns true. + // + // ThreadKind = __kmpc_target_region_kernel_init(...) + // if (ThreadKind == -1) { // actual worker thread + // do { + // __kmpc_barrier_simple_spmd(...) + // void *WorkFn; + // bool IsActive = __kmpc_kernel_parallel(&WorkFn, ...); + // if (!WorkFn) + // goto exit; + // if (IsActive) { + // char *SharedVars = __kmpc_target_region_kernel_get_shared_memory(); + // char *PrivateVars = + // __kmpc_target_region_kernel_get_private_memory(); + // + // ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars); + // + // __kmpc_kernel_end_parallel(); + // } + // __kmpc_barrier_simple_spmd(...) + // } while (true); + // } else if (ThreadKind == 0) { // surplus worker thread + // goto exit; + // } else { // team master thread + // goto user_code; + // } + + if (KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size() == 0) { + LLVM_DEBUG(dbgs() << "Will not build a custom state machine because there " + "are no known parallel regions in the kernel.\n"); + // TODO: If we also know there are no hidden parallel calls we can terminate + // all but the + // master thread right away. + NumKernelsNonSPMDNoParallelism++; + return false; + } + + assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 && + "Non-canonical kernel form!"); + CallInst *InitCI = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]; + + // Check if a custom state machine was already implemented. + auto *UseSM = + dyn_cast(InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE)); + if (!UseSM || !UseSM->isOne()) { + LLVM_DEBUG(dbgs() << "Will not build a custom state machine because of " + << *KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0] << "\n"); + return false; + } + + InitCI->setName("thread_kind"); + LLVMContext &Ctx = InitCI->getContext(); + + // Create local storage for the work function pointer. + Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); + AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "work_fn.addr", + &KernelFn.getEntryBlock().front()); + + Instruction *IP = InitCI->getNextNode(); + + Type *FlagTy = InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE)->getType(); + Constant *SMFlag = ConstantInt::getFalse(FlagTy); + InitCI->setArgOperand(ARG_INIT_USE_STATE_MACHINE, SMFlag); + + // Check the return value of __kmpc_target_region_kernel_init. First compare + // it to RET_INIT_IS_WORKER. + Instruction *WorkerCnd = new ICmpInst( + IP, ICmpInst::ICMP_EQ, InitCI, + ConstantInt::getSigned(InitCI->getType(), RET_INIT_IS_WORKER), + "is_worker"); + + // Create the conditional which is entered by worker threads. 
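+  // SplitBlockAndInsertIfThen gives us a new block for the worker threads
+  // (named "worker.wait" below) while the fall-through block becomes
+  // "master_check", where surplus worker and team master threads are told
+  // apart.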
+ Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false); + BasicBlock *WaitBB = WaitTI->getParent(); + WaitBB->setName("worker.wait"); + IP->getParent()->setName("master_check"); + + Instruction *MasterCheckTI = IP->getParent()->getTerminator(); + assert(MasterCheckTI->getNumSuccessors() == 2); + assert(WaitTI->getNumSuccessors() == 1); + + // Determine the final block, that is a trivial one where the kernel ends. + BasicBlock *FinalBB = nullptr; + if (MasterCheckTI->getSuccessor(0)->size() == 1 && + isa(MasterCheckTI->getSuccessor(0)->getTerminator())) + FinalBB = MasterCheckTI->getSuccessor(0); + else if (MasterCheckTI->getSuccessor(1)->size() == 1 && + isa(MasterCheckTI->getSuccessor(1)->getTerminator())) + FinalBB = MasterCheckTI->getSuccessor(1); + assert(FinalBB && "Could not determine the final kernal block."); + + // Use the simple barrier to synchronize all threads in SPMD mode after each + // parallel region. + Module &M = *KernelFn.getParent(); + Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M); + + auto AI = SimpleBarrierFn->arg_begin(); + Instruction *BarrierCall = + CallInst::Create(SimpleBarrierFn, + {Constant::getNullValue((AI++)->getType()), + Constant::getNullValue((AI)->getType())}, + "", WaitTI); + + Function *KernelParallelFn = + getOrCreateFn(Type::getInt1Ty(Ctx), VoidPtrTy->getPointerTo(), + Type::getInt16Ty(Ctx), "__kmpc_kernel_parallel", M); + + Value *RequiresOMPRuntime = CastInst::CreateZExtOrBitCast( + InitCI->getArgOperand(ARG_INIT_REQUIRES_OMP_RUNTIME), + Type::getInt16Ty(Ctx), "", WaitTI); + Instruction *ActiveCnd = CallInst::Create( + KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI); + + Type *WorkFnPrototype = + FunctionType::get(Type::getVoidTy(Ctx), {VoidPtrTy, VoidPtrTy}, false) + ->getPointerTo(); + Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + WorkFnAI, WorkFnPrototype->getPointerTo(), "Work_fn.addr_cast", WaitTI); + Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI); + + Instruction *WorkFnCnd = + new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn, + Constant::getNullValue(WorkFn->getType()), "no_work"); + + Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false); + FinishedTI->getParent()->setName("worker.finished"); + WaitTI->getParent()->setName("worker.active_check"); + + Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false); + ActiveTI->getParent()->setName("worker.active"); + WaitTI->getParent()->setName("worker.inactive"); + + Function *KernelGetSharedVars = getOrCreateFn( + VoidPtrTy, "__kmpc_target_region_kernel_get_shared_memory", M); + Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI); + Function *KernelGetPrivateVars = getOrCreateFn( + VoidPtrTy, "__kmpc_target_region_kernel_get_private_memory", M); + Value *PrivateVars = CallInst::Create(KernelGetPrivateVars, "", ActiveTI); + + BasicBlock *ExecuteBB = ActiveTI->getParent(); + BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI); + ParallelEndBB->setName("worker.parallel_end"); + + Function *KernelEndParallelFn = + getOrCreateFn(Type::getVoidTy(Ctx), "__kmpc_kernel_end_parallel", M); + CallInst::Create(KernelEndParallelFn, "", ActiveTI); + + // A fallback is required if we might not see all parallel regions + // (__kmpc_target_region_kernel_parallel calls). This could be the case if + // there is an unknown function call with side effects in the target region + // or inside one of the parallel regions. 
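+  // The fallback is an indirect call through the work function pointer,
+  // emitted after the if-cascade below, so parallel regions we could not
+  // identify statically still execute correctly, just without the benefit of
+  // a direct call.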
+ bool RequiresFallback = !KernelCalls[FID_UNKNOWN].empty() || + !ParallelRegionCalls[FID_UNKNOWN].empty(); + + // Collect all target region parallel calls + // (__kmpc_target_region_kernel_parallel). + SmallVector KernelParallelCalls; + KernelParallelCalls.append( + KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(), + KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end()); + KernelParallelCalls.append( + ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(), + ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end()); + + IP = ExecuteBB->getTerminator(); + + // For each parallel call create a conditional that compares the work function + // against the parallel work function of this parallel call, if available. If + // the function pointers are equal we call the known parallel call work + // function directly and continue to the end of the if-cascade. + for (CallInst *ParCI : KernelParallelCalls) { + Function *ParFn = dyn_cast( + ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION)->stripPointerCasts()); + if (!ParFn) { + LLVM_DEBUG( + dbgs() << "Require fallback due to unknown parallel function\n"); + RequiresFallback = true; + continue; + } + + Value *ParFnCnd = + new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check"); + Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false); + IP->getParent()->setName("worker.check.next"); + ParFnTI->getParent()->setName("worker.execute." + ParFn->getName()); + CallInst::Create(ParFn, {SharedVars, PrivateVars}, "", ParFnTI); + ParFnTI->setSuccessor(0, ParallelEndBB); + } + + // If a fallback is required we emit a indirect call before we jump to the + // point where all cases converge. + if (RequiresFallback) + CallInst::Create(WorkFn, {SharedVars, PrivateVars}, "", IP); + + // Insert a barrier call at the convergence point, right before the back edge. + BarrierCall->clone()->insertBefore(WaitTI); + + // Rewire the CFG edges to introduce the back and exit edge of the new loop. + // TODO: Add the new loop to LI! + FinishedTI->setSuccessor(0, FinalBB); + WaitTI->setSuccessor(0, WaitBB); + + // Bookkeeping. + NumCustomStateMachinesCreated++; + NumCustomStateMachinesNoFallback += !RequiresFallback; + + return true; +} + +template +static void collectCallersOf(Module &M, StringRef Name, + SmallVectorImpl &Callers) { + Function *Callee = M.getFunction(Name); + + // If the callee function is not present, we are done. + if (!Callee) + return; + + // If it exists we check all users. + for (const Use &U : Callee->uses()) { + CallSite CS(U.getUser()); + + // Filter out non-callee uses and non-call uses. + if (!CS || !CS.isCallee(&U) || !isa(CS.getInstruction())) + continue; + + // Found a caller, use it to create a T type object and put the result + // in the Callers vector. + Callers.emplace_back(T(CS.getCaller())); + } +} + +/// OpenMPOpt - The interprocedural OpenMP optimization pass implementation. +struct OpenMPOpt { + + bool runOnModule(Module &M) { + bool Changed = false; + + // Collect target regions kernels identified by a call to + // __kmpc_target_region_kernel_init. + collectCallersOf(M, "__kmpc_target_region_kernel_init", TRKernels); + + for (KernelTy &K : TRKernels) + Changed |= K.optimize(); + + return Changed; + } + +private: + /// A collection of all target regions kernels we found. + SmallVector TRKernels; +}; + +// TODO: This could be a CGSCC pass as well. 
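+// A new pass manager entry point is left as a TODO below; a minimal sketch,
+// reusing the same OpenMPOpt object, could look like:
+//
+//   struct OpenMPOptPass : PassInfoMixin<OpenMPOptPass> {
+//     PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
+//       OpenMPOpt OMPOpt;
+//       return OMPOpt.runOnModule(M) ? PreservedAnalyses::none()
+//                                    : PreservedAnalyses::all();
+//     }
+//   };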
+struct OpenMPOptLegacy : public ModulePass { + static char ID; // Pass identification, replacement for typeid + OpenMPOpt OMPOpt; + + OpenMPOptLegacy() : ModulePass(ID) { + initializeOpenMPOptLegacyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + + bool runOnModule(Module &M) override { return OMPOpt.runOnModule(M); } +}; + +// TODO: Add a new PM entry point. + +} // namespace + +char OpenMPOptLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(OpenMPOptLegacy, "openmp-opt", + "OpenMP specific optimizations", false, false) +INITIALIZE_PASS_END(OpenMPOptLegacy, "openmp-opt", + "OpenMP specific optimizations", false, false) + +Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacy(); } diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -517,6 +517,10 @@ // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); + // Try to perform OpenMP specific optimizations. This is a no-op if OpenMP + // runtime calls are not present in the module. + MPM.add(createOpenMPOptLegacyPass()); + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); if (OptLevel > 2) diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Force set function attributes ; CHECK-NEXT: Infer set function attributes +; CHECK-NEXT: OpenMP specific optimizations ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName() ; CHECK-NEXT: Called Value Propagation diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Force set function attributes ; CHECK-NEXT: Infer set function attributes +; CHECK-NEXT: OpenMP specific optimizations ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Call-site splitting diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Force set function attributes ; CHECK-NEXT: Infer set function attributes +; CHECK-NEXT: OpenMP specific optimizations ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName() ; CHECK-NEXT: Called Value Propagation diff --git a/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll @@ -0,0 +1,1064 @@ +; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s +; +; REQUIRES: asserts +; +; Check that we will not execute any of the below target regions in SPMD-mode. +; TODO: SPMD-mode is valid for target region 2 and 3 if proper guarding code is inserted. +; +; See the to_SPMD_mode.ll file for almost the same functions that can be translated to SPMD mode. 
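+;
+; Note on the init CHECK lines: the third i1 argument of
+; __kmpc_target_region_kernel_init is the use-state-machine flag
+; (ARG_INIT_USE_STATE_MACHINE in the pass). It stays "i1 true" if the generic
+; state machine of the runtime is kept (first kernel, no parallel regions) and
+; is rewritten to "i1 false" once a custom state machine was built into the
+; kernel.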
+; +; STATS-DAG: 1 openmp-opt - Number of GPU kernel in non-SPMD mode without parallelism +; STATS-DAG: 3 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines created +; STATS-DAG: 2 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines without fallback +; +; No state machine needed because there is no parallel region. +; CHECK: void @{{.*}}loop_in_loop_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 true +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void loop_in_loop_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) { +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; } +; +; +; Custom state machine needed but no fallback because all parallel regions are known +; CHECK: void @{{.*}}parallel_loops_and_accesses_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false +; The "check.next" block should not contain a fallback call +; CHECK: worker.check.next4: +; CHECK-NEXT: br label %worker.parallel_end +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void parallel_loops_and_accesses_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[0 + j]; +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[1 + j]; +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[2 + j]; +; +; // This needs a guard in SPMD mode +; A[0] = B[0]; +; } +; } +; +; void extern_func(); +; static void parallel_loop(int *A, int *B, int i) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; +; int Global[512]; +; +; +; Custom state machine needed but no fallback because all parallel regions are known +; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_with_global_acc_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false +; The "check.next" block should not contain a fallback call +; CHECK: worker.check.next: +; CHECK-NEXT: br label %worker.parallel_end +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) { +; parallel_loop(A, B, i); +; Global[i]++; +; } +; } +; +; Custom state machine needed with fallback because "extern_func" might contain parallel regions. 
+; CHECK: void @{{.*}}parallel_loops_in_functions_and_extern_func_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false +; The "check.next" block should contain a fallback call +; CHECK: worker.check.next: +; CHECK-NEXT: call void %work_fn( +; CHECK-NEXT: br label %worker.parallel_end +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; parallel_loop(A, B, 1); +; parallel_loop(A, B, 2); +; extern_func(); +; parallel_loop(A, B, 3); +; } +; } + +source_filename = "../llvm/test/Transforms/OpenMP/no_SPMD_mode.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.private.struct = type { i32**, i32** } +%omp.private.struct.0 = type { i32**, i32** } +%omp.private.struct.1 = type { i32**, i32** } +%omp.private.struct.2 = type { i32**, i32**, i32* } + +@__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc8, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 2, i32* %cleanup.dest.slot, align 4 + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #4 + br label %for.end10 + +for.body: ; preds = %for.cond + %5 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* 
%5) #4 + store i32 0, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %6 = load i32, i32* %j, align 4, !tbaa !15 + %cmp2 = icmp slt i32 %6, 1024 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond1 + store i32 5, i32* %cleanup.dest.slot, align 4 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #4 + br label %for.end + +for.body4: ; preds = %for.cond1 + %8 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %9 = load i32, i32* %i, align 4, !tbaa !15 + %10 = load i32, i32* %j, align 4, !tbaa !15 + %add = add nsw i32 %9, %10 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %11 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %12 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %13 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom5 = sext i32 %13 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %12, i64 %idxprom5 + %14 = load i32, i32* %arrayidx6, align 4, !tbaa !15 + %add7 = add nsw i32 %14, %11 + store i32 %add7, i32* %arrayidx6, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body4 + %15 = load i32, i32* %j, align 4, !tbaa !15 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.end: ; preds = %for.cond.cleanup3 + br label %for.inc8 + +for.inc8: ; preds = %for.end + %16 = load i32, i32* %i, align 4, !tbaa !15 + %inc9 = add nsw i32 %16, 1 + store i32 %inc9, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end10: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end10 + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct, align 8 + %.private.vars1 = alloca %omp.private.struct.0, align 8 + %.private.vars2 = alloca %omp.private.struct.1, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.0* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds 
%omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.1* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + %11 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 0 + %12 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %13 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %arrayidx3 = getelementptr inbounds i32, i32* %13, i64 0 + store i32 %12, i32* %arrayidx3, align 4, !tbaa !15 + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: 
; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, 
i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; 
preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void 
@.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + 
%13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35(i32* %A, i32* %B, [512 x i32]* dereferenceable(2048) %Global) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %Global.addr = alloca [512 x i32]*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store [512 x i32]* %Global, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %0 = load [512 x i32]*, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %2 = icmp eq i8 %1, 1 + br i1 %2, label %.execute, label %.exit + +.execute: ; preds = %entry + %3 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %4 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %4, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %5 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %5) #4 + br label %for.end + +for.body: ; preds = %for.cond + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %8 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop(i32* %6, i32* %7, i32 %8) + %9 = load i32, i32* %i, align 4, !tbaa !15 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds [512 x i32], [512 x i32]* %0, i64 0, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %arrayidx, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4, !tbaa !15 + %inc1 = add nsw i32 %11, 1 + store i32 %inc1, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.2* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 0 + store i32** 
%A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %3 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %2, i32* %3, i32 1) + %4 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %5 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %4, i32* %5, i32 2) + call void bitcast (void (...)* @extern_func to void ()*)() + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %6, i32* %7, i32 3) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare void @extern_func(...) #3 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = 
bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #4 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void 
@__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #4 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #4 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_and_accesses_in_tregion", i32 9, i32 1} +!1 = !{i32 0, i32 24, i32 43713508, 
!"loop_in_loop_in_tregion", i32 2, i32 0} +!2 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_in_functions_and_extern_func_in_tregion", i32 43, i32 3} +!3 = !{i32 0, i32 24, i32 43713508, !"parallel_loop_in_function_in_loop_with_global_acc_in_tregion", i32 35, i32 2} +!4 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2, !"kernel", i32 1} +!5 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9, !"kernel", i32 1} +!6 = !{void (i32*, i32*, [512 x i32]*)* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35, !"kernel", i32 1} +!7 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43, !"kernel", i32 1} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 7, !"PIC Level", i32 2} +!10 = !{!"clang version 9.0.0 "} +!11 = !{!12, !12, i64 0} +!12 = !{!"any pointer", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !13, i64 0} +!17 = !{!18} +!18 = !{i64 2, i64 3, i64 5, i1 false} diff --git a/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll @@ -0,0 +1,1208 @@ +; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s +; +; REQUIRES: asserts +; +; Check that we will not execute any of the below target regions in SPMD-mode. +; TODO: SPMD-mode is valid for target region 2 and 3 if proper guarding code is inserted. +; +; See the no_SPMD_mode.ll file for almost the same functions that cannot be translated to SPMD mode. +; +; +; STATS-DAG: 4 openmp-opt - Number of GPU kernels converted to SPMD mode +; STATS-DAG: 6 openmp-opt - Number of parallel GPU kernel regions converted to SPMD mode +; +; Check for SPMD mode. +; CHECK: void @{{.*}}parallel_loop_in_loop_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; +; void parallel_loop_in_loop_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; } +; +; +; Check for SPMD mode. 
+; CHECK: void @{{.*}}parallel_loops_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; +; void parallel_loops_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[0 + j]; +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[1 + j]; +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[2 + j]; +; } +; } +; +; FIXME: For now we copy the parallel_loop function below to +; make sure they have only uses in one kernel. As all +; kernels can be translated to SPMD mode we don't need +; this. In the future we should handle that and clone +; the function automatically (or add a new argument) to +; facilitate partial SPMD-mode execution. +; +; static void parallel_loop1(int *A, int *B, int i) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; static void parallel_loop2(int *A, int *B, int i) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; +; +; Check for SPMD mode. +; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @parallel_loop1( +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; CHECK: define internal void @parallel_loop1 +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; +; void parallel_loop_in_function_in_loop_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) +; parallel_loop1(A, B, i); +; } +; +; +; Check for SPMD mode. 
+; CHECK: define internal void @parallel_loop2 +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: void @{{.*}}parallel_loops_in_functions_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @parallel_loop2( +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; +; void parallel_loops_in_functions_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; parallel_loop2(A, B, 1); +; parallel_loop2(A, B, 2); +; parallel_loop2(A, B, 3); +; } +; } + +source_filename = "../llvm/test/Transforms/OpenMP/to_SPMD_mode.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.private.struct = type { i32**, i32**, i32* } +%omp.private.struct.0 = type { i32**, i32** } +%omp.private.struct.1 = type { i32**, i32** } +%omp.private.struct.2 = type { i32**, i32** } +%omp.private.struct.3 = type { i32**, i32**, i32* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1 +@__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode = weak constant i8 1 +@__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode = weak constant i8 1 +@__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3 + br label %for.end + +for.body: ; preds = %for.cond + %5 = bitcast %omp.private.struct* %.private.vars to i8* + %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0 
+ store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + %8 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 2 + store i32* %i, i32** %8 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %5, i16 24, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %9 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa 
!15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, 
i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct.0, align 8 + %.private.vars1 = alloca %omp.private.struct.1, align 8 + %.private.vars2 = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct.0* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.1* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.2* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call 
void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, 
label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3 + br label %for.end + +for.body: ; preds = %for.cond + %5 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %6 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %7 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop1(i32* %5, i32* %6, i32 %7) + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop1(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.3, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.3* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} +; Function Attrs: nounwind +define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) 
#2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.3, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.3* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %3 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop2(i32* %2, i32* %3, i32 1) + %4 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %5 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop2(i32* %4, i32* %5, i32 2) + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop2(i32* %6, i32* %7, i32 3) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void 
@llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* 
%.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.4_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.3* + %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.4(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_loop_in_tregion", i32 2, i32 0} +!1 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_functions_in_tregion", i32 36, 
i32 3} +!2 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_tregion", i32 10, i32 1} +!3 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_function_in_loop_in_tregion", i32 30, i32 2} +!4 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2, !"kernel", i32 1} +!5 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10, !"kernel", i32 1} +!6 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30, !"kernel", i32 1} +!7 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36, !"kernel", i32 1} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 7, !"PIC Level", i32 2} +!10 = !{!"clang version 9.0.0 "} +!11 = !{!12, !12, i64 0} +!12 = !{!"any pointer", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !13, i64 0} +!17 = !{!18} +!18 = !{i64 2, i64 3, i64 5, i1 false} diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -78,13 +78,14 @@ # These flags are required to emit LLVM Bitcode. We check them together because # if any of them are not supported, there is no point in finding out which are. -set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required -emit-llvm -std=c++11 -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) # It makes no sense to continue given that the compiler doesn't support # emitting basic LLVM Bitcode if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + message(ERROR "NO FLAG SUPPORT") return() endif() @@ -101,6 +102,7 @@ check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + message(ERROR "NO FCUDA RDC") return() endif() diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/target_region.h @@ -0,0 +1,167 @@ +//===-- target_region.h --- Target region OpenMP devie runtime interface --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target region interfaces are simple interfaces designed to allow middle-end +// (=LLVM) passes to analyze and transform the code. To achieve good performance +// it may be required to run the associated passes. However, implementations of +// this interface shall always provide a correct implementation as close to the +// user expected code as possible. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _DEVICERTL_COMMON_INTERFACES_H_ +#define _DEVICERTL_COMMON_INTERFACES_H_ + +#ifndef EXTERN +#define EXTERN +#endif +#ifndef CALLBACK +#define CALLBACK(Callee, Payload0, Payload1) +#endif + +/// The target region _kernel_ interface for GPUs +/// +/// This deliberately simple interface provides the middle-end (=LLVM) with +/// easier means to reason about the semantics of the code and transform it as +/// well. The runtime calls are therefore also designed to carry sufficient +/// information necessary for optimization. +/// +/// +/// Intended usage: +/// +/// \code +/// void kernel(...) { +/// +/// char ThreadKind = __kmpc_target_region_kernel_init(...); +/// +/// if (ThreadKind == -1) { // actual worker thread +/// if (!UsedLibraryStateMachine) +/// user_state_machine(); +/// goto exit; +/// } else if (ThreadKind == 0) { // surplus worker thread +/// goto exit; +/// } else { // team master thread +/// goto user_code; +/// } +/// +/// user_code: +/// +/// // User defined kernel code, parallel regions are replaced +/// // by __kmpc_target_region_kernel_parallel(...) calls. +/// +/// // Fallthrough to de-initialization +/// +/// deinit: +/// __kmpc_target_region_kernel_deinit(...); +/// +/// exit: +/// /* exit the kernel */ +/// } +/// \endcode +/// +/// +///{ + +/// Initialization +/// +/// +/// In SPMD mode, all threads will execute their respective initialization +/// routines. +/// +/// In non-SPMD mode, team masters will invoke the initialization routines while +/// the remaining threads are considered worker threads. Worker threads required for this +/// target region will be trapped inside the function if \p UseStateMachine is +/// true. Otherwise they will escape with a return value of -1. +/// +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiresOMPRuntime Flag to indicate if the runtime is required and +/// needs to be initialized. +/// \param UseStateMachine Flag to indicate if the runtime state machine +/// should be used in non-SPMD mode. +/// \param RequiresDataSharing Flag to indicate if there might be inter-thread +/// sharing which needs runtime support. +/// +/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the +/// team master. +/// 0, in non-SPMD mode and the thread is a surplus worker that should +/// not execute anything in the target region. +/// -1, in non-SPMD mode and the thread is a required worker which: +/// - finished work and should be terminated if \p UseStateMachine +/// is true. +/// - has not performed work and should be put in a user provided +/// state machine (as defined above). +/// +EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing); + +/// De-Initialization +/// +/// +/// In non-SPMD mode, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// +EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode, + bool RequiredOMPRuntime); + +/// Generic type of a work function in the target region kernel interface. The +/// two arguments are pointers to structures that contain the shared and +/// firstprivate variables, respectively. 
+/// Generic type of a work function in the target region kernel interface. The
+/// two arguments are pointers to structures that contain the shared and
+/// firstprivate variables, respectively. Since the layout and size are known
+/// at compile time, the front-end is expected to generate appropriate packing
+/// and unpacking code.
+typedef void (*ParallelWorkFnTy)(char * /* SharedValues */,
+                                 char * /* PrivateValues */);
+
+/// Enter a parallel region
+///
+///
+/// The parallel region is defined by \p ParallelWorkFn. The shared variables,
+/// \p SharedValuesBytes bytes in total, start at \p SharedValues. The
+/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at
+/// \p PrivateValues.
+///
+/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues
+/// and \p PrivateValues as arguments before it returns.
+///
+/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues
+/// are communicated to the workers before they are released from the state
+/// machine to run the code defined by \p ParallelWorkFn in parallel. This
+/// function will only return after all workers are finished.
+///
+/// \param UseSPMDMode Flag to indicate if execution is performed in
+///                    SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+/// \param ParallelWorkFn The outlined code that is executed in parallel by
+///                       the threads in the team.
+/// \param SharedValues A pointer to the location of all shared values.
+/// \param SharedValuesBytes The total size of the shared values in bytes.
+/// \param PrivateValues A pointer to the location of all private values.
+/// \param PrivateValuesBytes The total size of the private values in bytes.
+/// \param SharedMemPointers Flag to indicate that the pointers \p SharedValues
+///                          and \p PrivateValues point into shared memory.
+///                          If this flag is true, it also requires that all
+///                          private values, if any, are stored directly after
+///                          the shared values.
+///
+CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues)
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedValues, uint16_t SharedValuesBytes, char *PrivateValues,
+    uint16_t PrivateValuesBytes, bool SharedMemPointers);
+
+///}
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -53,6 +53,7 @@
   src/reduction.cu
   src/sync.cu
   src/task.cu
+  src/target_region.cu
 )

 set(omp_data_objects src/omp_data.cu)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -63,3 +63,9 @@
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Pointer used to share memory between team threads in the target region interface.
+//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ target_region_shared_buffer _target_region_shared_memory; + diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -101,6 +101,66 @@ uint32_t nArgs; }; +/// Helper structure to manage the memory shared by the threads in a team. +/// +/// Note: Only the team master is allowed to call non-const functions! +struct target_region_shared_buffer { +#define PRE_SHARED_BYTES 128 + + INLINE void init() { + _ptr = &_data[0]; + _size = PRE_SHARED_BYTES; + _offset = 0; + } + + /// Release any dynamic allocated memory. + INLINE void release() { + if (_size == PRE_SHARED_BYTES) + return; + SafeFree(_ptr, (char *)"free shared dynamic buffer"); + init(); + } + + INLINE void set(char *ptr, size_t offset) { + release(); + _ptr = ptr; + _offset = offset; + } + + INLINE void resize(size_t size, size_t offset) { + _offset = offset; + + if (size <= _size) + return; + + if (_size != PRE_SHARED_BYTES) + SafeFree(_ptr, (char *)"free shared dynamic buffer"); + + _size = size; + _ptr = (char *)SafeMalloc(_size, (char *)"new shared buffer"); + } + + // Called by all threads. + INLINE char *begin() const { return _ptr; }; + INLINE size_t size() const { return _size; }; + INLINE size_t get_offset() const { return _offset; }; + +private: + // Pre-allocated space that holds PRE_SHARED_BYTES many bytes. + char _data[PRE_SHARED_BYTES]; + + // Pointer to the currently used buffer. + char *_ptr; + + // Size of the currently used buffer. + uint32_t _size; + + // Offset into the currently used buffer. + uint32_t _offset; + +#undef PRE_SHARED_BYTES +}; + extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu @@ -0,0 +1,197 @@ +//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the common target region interface. +// +//===----------------------------------------------------------------------===// + +// Include the native definitions first as certain defines might be needed in +// the common interface definition below. +#include "omptarget-nvptx.h" +#include "interface.h" + +#include "../../common/target_region.h" + +/// The pointer used to share memory between team threads. +extern __device__ __shared__ target_region_shared_buffer + _target_region_shared_memory; + +EXTERN char *__kmpc_target_region_kernel_get_shared_memory() { + return _target_region_shared_memory.begin(); +} +EXTERN char *__kmpc_target_region_kernel_get_private_memory() { + return _target_region_shared_memory.begin() + + _target_region_shared_memory.get_offset(); +} + +/// Simple generic state machine for worker threads. 
+INLINE static void
+__kmpc_target_region_state_machine(bool IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+      char *PrivateVars = __kmpc_target_region_kernel_get_private_memory();
+
+      ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter threads into masters and workers. If \p UseStateMachine is true,
+/// required workers will enter the state machine above and be trapped there.
+/// Master and surplus worker threads will return from this function
+/// immediately, while required workers will only return once there is no more
+/// work. The return value indicates if the thread is a master (1), a surplus
+/// worker (0), or a finished required worker released from the state
+/// machine (-1).
+INLINE static int8_t
+__kmpc_target_region_thread_filter(unsigned ThreadLimit, bool UseStateMachine,
+                                   bool IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseStateMachine)
+      __kmpc_target_region_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    if (RequiresDataSharing)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  // Reserve one WARP in non-SPMD mode for the masters.
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int8_t FilterVal = __kmpc_target_region_thread_filter(
+      ThreadLimit, UseStateMachine, RequiresOMPRuntime);
+
+  // If the filter returns 1, the executing thread is a team master, which
+  // will initialize the kernel below.
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+    _target_region_shared_memory.init();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime) {
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+    return;
+  }
+
+  __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+  // Barrier to terminate worker threads.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Release any dynamically allocated memory used for sharing.
+  _target_region_shared_memory.release();
+}
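Before the device implementation of __kmpc_target_region_kernel_parallel below, here is a hedged sketch of the packing the front end is expected to emit for the shared and firstprivate values: shared values first, firstprivate values directly after them. All struct and function names are invented for illustration, and the code is written as plain C++ to approximate what clang would actually generate as IR.

// Illustrative only; attributes and mangling of generated code are omitted.
struct omp_shared_vars { int *A; };     // variables shared by reference
struct omp_private_vars { int Init; };  // firstprivate copies

// The outlined parallel region unpacks both structures again.
static void __omp_outlined_work_fn(char *SharedMem, char *PrivateMem) {
  omp_shared_vars *S = reinterpret_cast<omp_shared_vars *>(SharedMem);
  omp_private_vars *P = reinterpret_cast<omp_private_vars *>(PrivateMem);
  S->A[0] = P->Init;
}

// Call emitted for a '#pragma omp parallel' inside the target region:
//   omp_shared_vars Shared = {A};
//   omp_private_vars Private = {7};
//   __kmpc_target_region_kernel_parallel(
//       /* UseSPMDMode */ false, /* RequiredOMPRuntime */ true,
//       __omp_outlined_work_fn, reinterpret_cast<char *>(&Shared),
//       sizeof(Shared), reinterpret_cast<char *>(&Private), sizeof(Private),
//       /* SharedMemPointers */ false);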
+
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedVars, uint16_t SharedVarsBytes, char *PrivateVars,
+    uint16_t PrivateVarsBytes, bool SharedMemPointers) {
+
+  if (UseSPMDMode) {
+    ParallelWorkFn(SharedVars, PrivateVars);
+    return;
+  }
+
+  if (SharedMemPointers) {
+    // If shared memory pointers are used, the user guarantees that the private
+    // variables, if any, are stored directly after the shared ones in memory
+    // and that this memory can be accessed by all the threads. In that case,
+    // we do not need to copy memory around but simply use the provided
+    // locations.
+
+    _target_region_shared_memory.set(SharedVars, SharedVarsBytes);
+
+  } else {
+
+    size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes;
+    if (BytesToCopy) {
+      // Resize the shared memory buffer so it can hold all values that need
+      // to be in shared memory. Also set the offset to the beginning of the
+      // private variables.
+      _target_region_shared_memory.resize(BytesToCopy, SharedVarsBytes);
+
+      // Copy the shared and private variables into shared memory.
+      char *SVMemory = __kmpc_target_region_kernel_get_shared_memory();
+      char *PVMemory = __kmpc_target_region_kernel_get_private_memory();
+      memcpy(SVMemory, SharedVars, SharedVarsBytes);
+      memcpy(PVMemory, PrivateVars, PrivateVarsBytes);
+    }
+  }
+
+  // TODO: It seems we could store the work function in the same shared space
+  //       as the rest of the variables above.
+  //
+  // Initialize the parallel work, e.g., make sure the work function is known.
+  __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime);
+
+  // TODO: It is odd that we call the *_spmd version in non-SPMD mode here.
+  //
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Copy the (potentially updated) shared variables back to their original
+  // location if they were buffered in shared memory above.
+  if (!SharedMemPointers && SharedVarsBytes)
+    memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(),
+           SharedVarsBytes);
+}
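Finally, a small host-side mock, not part of the patch, that only mirrors the resize/offset behaviour of target_region_shared_buffer defined earlier and spells out the offset arithmetic the copy-in path above relies on when SharedMemPointers is false:

#include <cassert>
#include <cstddef>

// Mock of the team-shared buffer: shared values live at the start, private
// values begin at 'offset' bytes, mirroring how resize(BytesToCopy,
// SharedVarsBytes) sets things up in the real implementation.
struct mock_shared_buffer {
  char data[128];
  size_t size = 128;
  size_t offset = 0;
  void resize(size_t Sz, size_t Off) {
    offset = Off;
    if (Sz > size)
      size = Sz; // the real buffer switches to a dynamic allocation here
  }
  char *shared_mem() { return data; }
  char *private_mem() { return data + offset; }
};

int main() {
  mock_shared_buffer Buf;
  const size_t SharedVarsBytes = 12, PrivateVarsBytes = 4;
  Buf.resize(SharedVarsBytes + PrivateVarsBytes, SharedVarsBytes);
  // 12 bytes of shared values, the 4 firstprivate bytes directly after them.
  assert(Buf.private_mem() - Buf.shared_mem() == 12);
  return 0;
}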