diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -287,6 +287,7 @@ void initializeNameAnonGlobalLegacyPassPass(PassRegistry&); void initializeNaryReassociateLegacyPassPass(PassRegistry&); void initializeNewGVNLegacyPassPass(PassRegistry&); +void initializeOpenMPOptLegacyPass(PassRegistry&); void initializeObjCARCAAWrapperPassPass(PassRegistry&); void initializeObjCARCAPElimPass(PassRegistry&); void initializeObjCARCContractPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -143,6 +143,7 @@ (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); (void) llvm::createNaryReassociatePass(); + (void) llvm::createOpenMPOptLegacyPass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); (void) llvm::createObjCARCExpandPass(); diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -156,6 +156,11 @@ /// ModulePass *createIPConstantPropagationPass(); +//===----------------------------------------------------------------------===// +/// createOpenMPOpt - This pass performs OpenMP specific optimizations. +/// +Pass *createOpenMPOptLegacyPass(); + //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -25,6 +25,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); + initializeOpenMPOptLegacyPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,784 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-kernel-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> PerformOpenMPSIMDIZATION(
+    "openmp-opt-kernel-simdization", cl::ZeroOrMore,
+    cl::desc("Convert non-SPMD kernels to SPMD mode if possible."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ForceOpenMPSIMDIZATION(
+    "openmp-opt-kernel-force-simdization", cl::ZeroOrMore,
+    cl::desc("Force execution of non-SPMD kernels in SPMD mode."), cl::Hidden,
+    cl::init(false));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumParallelCallsConvertedToSPMD,
+          "Number of parallel GPU kernel regions converted to SPMD mode");
+STATISTIC(NumKernelsNonSPMDNoParallelism,
+          "Number of GPU kernels in non-SPMD mode without parallelism");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+
+namespace {
+
+/// Set of constants that describe the positions of arguments (ARG_FN_NAME) and
+/// the meaning of return values (RET_FN_MEANING) for the target region kernel
+/// interface. Has to be kept in sync with
+/// openmp/libomptarget/deviceRTLs/common/target_region.h
+/// and the respective implementations.
+enum {
+  ARG_INIT_USE_SPMD_MODE = 0,
+  ARG_INIT_REQUIRES_OMP_RUNTIME = 1,
+  ARG_INIT_USE_STATE_MACHINE = 2,
+  ARG_INIT_REQUIRES_DATA_SHARING = 3,
+
+  ARG_DEINIT_USE_SPMD_MODE = 0,
+  ARG_DEINIT_REQUIRES_OMP_RUNTIME = 1,
+
+  ARG_PARALLEL_USE_SPMD_MODE = 0,
+  ARG_PARALLEL_REQUIRES_OMP_RUNTIME = 1,
+  ARG_PARALLEL_WORK_FUNCTION = 2,
+  ARG_PARALLEL_SHARED_VARS = 3,
+  ARG_PARALLEL_SHARED_VARS_BYTES = 4,
+
+  RET_INIT_IS_WORKER = -1,
+  RET_INIT_IS_SURPLUS = 0,
+  RET_INIT_IS_MASTER = 1,
+};
+
+/// A macro list to represent known functions from the omp, __kmpc, and target
+/// region interfaces. The first value is an enum identifier, see FunctionID,
+/// the second value is the function name, and the third is the expected number
+/// of arguments.
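+///
+/// To illustrate the X-macro technique used here (a sketch, not an interface
+/// change): expanding KNOWN_FUNCTIONS() with
+///   #define KF(NAME, STR, NARGS) NAME,
+/// produces the FunctionID enumerators below, while expanding it with
+///   #define KF(NAME, STR, NARGS) .Case(STR, NAME)
+/// produces one StringSwitch case per known function in getFunctionID().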
+#define KNOWN_FUNCTIONS()                                                      \
+  KF(FID_OMP_GET_TEAM_NUM, "omp_get_team_num", 0)                              \
+  KF(FID_OMP_GET_NUM_TEAMS, "omp_get_num_teams", 0)                            \
+  KF(FID_OMP_GET_THREAD_NUM, "omp_get_thread_num", 0)                          \
+  KF(FID_OMP_GET_NUM_THREADS, "omp_get_num_threads", 0)                        \
+  KF(FID_OMP_SET_NUM_THREADS, "omp_set_num_threads", 1)                        \
+  KF(FID_KMPC_TREGION_KERNEL_INIT, "__kmpc_target_region_kernel_init", 4)      \
+  KF(FID_KMPC_TREGION_KERNEL_DEINIT, "__kmpc_target_region_kernel_deinit", 2)  \
+  KF(FID_KMPC_TREGION_KERNEL_PARALLEL, "__kmpc_target_region_kernel_parallel", \
+     8)                                                                        \
+  KF(FID_KMPC_FOR_STATIC_INIT_4, "__kmpc_for_static_init_4", 9)                \
+  KF(FID_KMPC_FOR_STATIC_FINI, "__kmpc_for_static_fini", 2)                    \
+  KF(FID_KMPC_GLOBAL_THREAD_NUM, "__kmpc_global_thread_num", 1)                \
+  KF(FID_KMPC_DISPATCH_INIT_4, "__kmpc_dispatch_init_4", 7)                    \
+  KF(FID_KMPC_DISPATCH_NEXT_4, "__kmpc_dispatch_next_4", 6)
+
+/// An identifier enum for each known function as well as the different kinds
+/// of unknown functions we distinguish.
+enum FunctionID {
+#define KF(NAME, STR, NARGS) NAME,
+  KNOWN_FUNCTIONS()
+#undef KF
+  // Unknown functions
+  //{
+  FID_KMPC_UNKNOWN, ///< unknown __kmpc_XXXX function
+  FID_OMP_UNKNOWN,  ///< unknown omp_XXX function
+  FID_NVVM_UNKNOWN, ///< unknown llvm.nvvm.XXX function
+  FID_LLVM_UNKNOWN, ///< unknown llvm.XXX function
+  FID_UNKNOWN       ///< unknown function without known prefix.
+  //}
+};
+
+static FunctionID getFunctionID(Function *F) {
+  if (!F)
+    return FID_UNKNOWN;
+#define KF(NAME, STR, NARGS) .Case(STR, NAME)
+  return StringSwitch<FunctionID>(F->getName()) KNOWN_FUNCTIONS()
+      .StartsWith("__kmpc_", FID_KMPC_UNKNOWN)
+      .StartsWith("omp_", FID_OMP_UNKNOWN)
+      .StartsWith("llvm.nvvm.", FID_NVVM_UNKNOWN)
+      .StartsWith("llvm.", FID_LLVM_UNKNOWN)
+      .Default(FID_UNKNOWN);
+#undef KF
+}
+
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO: Create the type if it is not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify the function declaration helpers below.
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+/// A helper class to introduce smart guarding code.
+struct GuardGenerator {
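+
+  // In SPMD mode all threads execute the code between parallel regions, while
+  // in non-SPMD mode only the team master does. A registered side effect that
+  // must not be repeated therefore needs a guard once the kernel is moved to
+  // SPMD mode, conceptually (a sketch; introduceGuards() does not emit this
+  // yet):
+  //
+  //   if (omp_get_thread_num() == 0)
+  //     <side-effect instruction>;
+  //   __kmpc_barrier_simple_spmd(...);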
+
+  /// Inform the guard generator about the side-effect instructions collected
+  /// in @p SideEffectInst.
+  ///
+  /// \returns true if all registered side effects can be (efficiently)
+  /// guarded.
+  bool registerSideEffects(SmallVectorImpl<Instruction *> &SideEffectInst) {
+    bool Guarded = true;
+    if (SideEffectInst.empty())
+      return Guarded;
+
+    const Module &M = *SideEffectInst.front()->getModule();
+    const DataLayout &DL = M.getDataLayout();
+
+    SmallVector<Instruction *, 8> UnguardedSideEffectInst;
+    for (Instruction *I : SideEffectInst) {
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (getFunctionID(CI->getCalledFunction()) != FID_UNKNOWN)
+          continue;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Stores into (thread-private) stack slots do not need a guard.
+        if (isa<AllocaInst>(
+                SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+          continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                        LI->getAlignment(), DL))
+          continue;
+      }
+      LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+      UnguardedSideEffectInst.push_back(I);
+    }
+
+    return UnguardedSideEffectInst.empty();
+  }
+
+  bool registerReadEffects(SmallVectorImpl<Instruction *> &ReadEffectInst) {
+    return registerSideEffects(ReadEffectInst);
+  }
+
+  void introduceGuards() {
+    // TODO: The guard generator cannot introduce guards yet but the
+    //       registerXXX functions above are aware of that!
+  }
+};
+
+/// Helper structure to represent and work with a target region kernel.
+struct KernelTy {
+
+  KernelTy(Function *KernelFn) : KernelFn(*KernelFn) {}
+
+  /// Optimize this kernel, return true if something was done.
+  bool optimize();
+
+private:
+  /// Analyze this kernel, return true if successful.
+  bool analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+               bool InParallelRegion);
+
+  /// Return true if the kernel is executed in SPMD mode.
+  bool isExecutedInSPMDMode();
+
+  /// Convert a non-SPMD mode kernel to SPMD mode, return true if successful.
+  bool convertToSPMD();
+
+  /// Create a custom state machine in the module, return true if successful.
+  bool createCustomStateMachine();
+
+  /// All side-effect instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 8> SideEffectInst;
+
+  /// All read-only instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 8> ReadOnlyInst;
+
+  /// All non-analyzed calls contained in this kernel, separated by their
+  /// function ID which identifies known calls.
+  SmallVector<CallInst *, 4> KernelCalls[FID_UNKNOWN + 1];
+
+  /// All non-analyzed calls contained in parallel regions which are part of
+  /// this kernel, separated by their function ID which identifies known
+  /// calls.
+  SmallVector<CallInst *, 4> ParallelRegionCalls[FID_UNKNOWN + 1];
+
+  /// The entry function of this kernel.
+  Function &KernelFn;
+};
+
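+// In canonical form a target region kernel contains exactly one init and one
+// deinit call, with the init call in the entry block, e.g. (taken from the
+// accompanying tests):
+//
+//   %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+//   ...
+//   call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+//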
+bool KernelTy::analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+                       bool InParallelRegion) {
+  if (!Visited.insert(&F).second)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "Analyze "
+                    << (InParallelRegion ? "parallel-region" : "kernel")
+                    << " function: " << F.getName() << "\n");
+
+  // Determine where we remember the call.
+  auto &CallsArray = InParallelRegion ? ParallelRegionCalls : KernelCalls;
+
+  for (Instruction &I : instructions(&F)) {
+
+    // In parallel regions we only look for calls; outside, we look for all
+    // side-effect and read-only instructions.
+    if (!InParallelRegion) {
+      // Handle non-side-effect instructions first. These will not write or
+      // throw, which makes reading the only interesting potential property.
+      if (!I.mayHaveSideEffects()) {
+        if (I.mayReadFromMemory()) {
+          LLVM_DEBUG(dbgs() << "- read-only: " << I << "\n");
+          ReadOnlyInst.push_back(&I);
+        }
+        continue;
+      }
+
+      // Now we handle all non-call instructions.
+      if (!isa<CallInst>(I)) {
+        LLVM_DEBUG(dbgs() << "- side-effect: " << I << "\n");
+        SideEffectInst.push_back(&I);
+        continue;
+      }
+    }
+
+    if (!isa<CallInst>(I))
+      continue;
+
+    CallInst &CI = cast<CallInst>(I);
+    Function *Callee = CI.getCalledFunction();
+
+    // For exact definitions we recurse.
+    if (Callee && !Callee->isDeclaration() && Callee->isDefinitionExact()) {
+      // If the recursive analysis fails we bail, otherwise the
+      // information was collected in the internal state.
+      if (!analyze(*Callee, Visited, InParallelRegion))
+        return false;
+      continue;
+    }
+
+    // Check that known functions have the right number of arguments early on.
+    // Additionally provide debug output based on the function ID.
+    FunctionID ID = getFunctionID(Callee);
+
+    switch (ID) {
+#define KF(NAME, STR, NARGS)                                                   \
+  case NAME:                                                                   \
+    LLVM_DEBUG(                                                                \
+        dbgs() << "- known call "                                              \
+               << (CI.getNumArgOperands() != NARGS ? "[#arg mismatch!]" : "")  \
+               << ": " << I << "\n");                                          \
+    if (CI.getNumArgOperands() != NARGS)                                       \
+      ID = FID_UNKNOWN;                                                        \
+    break;
+      KNOWN_FUNCTIONS()
+#undef KF
+    case FID_KMPC_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown __kmpc_* call: " << I << "\n");
+      break;
+    case FID_OMP_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown omp_* call: " << I << "\n");
+      break;
+    case FID_NVVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.nvvm.* call: " << I << "\n");
+      break;
+    case FID_LLVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.* call: " << I << "\n");
+      break;
+    case FID_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown call: " << I << "\n");
+      break;
+    }
+
+    CallsArray[ID].push_back(&CI);
+  }
+
+  // If we did not analyze the kernel function but some other one down the call
+  // chain, we are done now.
+  // TODO: Add more verification code here.
+  if (&F != &KernelFn)
+    return true;
+
+  assert(&KernelCalls == &CallsArray);
+
+  // If we are analyzing the kernel function we need to verify we have at least
+  // the calls we expect to see in the right places.
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getParent() !=
+          &F.getEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "- malformed kernel: [#Init: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size()
+                      << "][#DeInit: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size()
+                      << "]\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool KernelTy::isExecutedInSPMDMode() {
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  auto *SPMDFlag = cast<ConstantInt>(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getArgOperand(
+          ARG_INIT_USE_SPMD_MODE));
+  assert(SPMDFlag->isZeroValue() || SPMDFlag->isOneValue());
+  return SPMDFlag->isOneValue();
+}
+
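+// The arguments of the init call encode the kernel configuration that is
+// inspected and rewritten below; the parameter names are illustrative and
+// mirror the ARG_INIT_* positions above:
+//
+//   __kmpc_target_region_kernel_init(/* UseSPMDMode */         false,
+//                                    /* RequiresOMPRuntime */  true,
+//                                    /* UseStateMachine */     true,
+//                                    /* RequiresDataSharing */ true)
+//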
+bool KernelTy::optimize() {
+  bool Changed = false;
+
+  // First analyze the code. If that fails for some reason we bail out early.
+  SmallPtrSet<Function *, 8> Visited;
+  if (!analyze(KernelFn, Visited, /* InParallelRegion */ false))
+    return Changed;
+
+  Visited.clear();
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    Value *ParCIParallelFnArg =
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION);
+    Function *ParallelFn =
+        dyn_cast<Function>(ParCIParallelFnArg->stripPointerCasts());
+    if (!ParallelFn ||
+        !analyze(*ParallelFn, Visited, /* InParallelRegion */ true))
+      return Changed;
+  }
+
+  Changed |= convertToSPMD();
+  Changed |= createCustomStateMachine();
+
+  return Changed;
+}
+
+bool KernelTy::convertToSPMD() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  // Use a generic guard generator to determine if suitable guards for all
+  // side effect instructions can be placed.
+  GuardGenerator GG;
+
+  // Check if SIMDIZATION is possible, in case it is not forced.
+  if (!ForceOpenMPSIMDIZATION) {
+    // Unknown calls are not handled yet and will cause us to bail.
+    if (!KernelCalls[FID_UNKNOWN].empty())
+      return false;
+
+    // If we cannot guard all side effect instructions, bail out.
+    if (!GG.registerSideEffects(SideEffectInst))
+      return false;
+
+    if (!GG.registerReadEffects(ReadOnlyInst))
+      return false;
+
+    // TODO: Emit a remark.
+    LLVM_DEBUG(dbgs() << "Transformation to SPMD OK\n");
+
+    // If SIMDIZATION is disabled we only emit the debug message and bail.
+    if (!PerformOpenMPSIMDIZATION)
+      return false;
+  }
+
+  // Actually emit the guard code after we decided to perform SIMDIZATION.
+  GG.introduceGuards();
+
+  // Create an "is-SPMD" flag.
+  Type *FlagTy = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]
+                     ->getArgOperand(ARG_INIT_USE_SPMD_MODE)
+                     ->getType();
+  Constant *SPMDFlag = ConstantInt::getTrue(FlagTy);
+
+  // Update the init and deinit calls with the "is-SPMD" flag to indicate
+  // SPMD mode.
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]->setArgOperand(
+      ARG_INIT_USE_SPMD_MODE, SPMDFlag);
+  KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT][0]->setArgOperand(
+      ARG_DEINIT_USE_SPMD_MODE, SPMDFlag);
+
+  // Use the simple barrier to synchronize all threads in SPMD mode after each
+  // parallel region.
+  Function *SimpleBarrierFn =
+      getOrCreateSimpleSPMDBarrierFn(*KernelFn.getParent());
+
+  // For each parallel region, identified by the
+  // __kmpc_target_region_kernel_parallel call, we set the "is-SPMD" flag and
+  // introduce a succeeding barrier call.
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    ParCI->setArgOperand(ARG_PARALLEL_USE_SPMD_MODE, SPMDFlag);
+    auto AI = SimpleBarrierFn->arg_begin();
+    CallInst::Create(SimpleBarrierFn,
+                     {Constant::getNullValue((AI++)->getType()),
+                      Constant::getNullValue((AI)->getType())},
+                     "", ParCI->getNextNode());
+  }
+
+  // TODO: Serialize nested parallel regions.
+
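+  // After this rewrite a parallel region inside the kernel looks like, e.g.:
+  //
+  //   call void @__kmpc_target_region_kernel_parallel(i1 true, ...)
+  //   call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+  //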
+  // Finally, we change the global exec_mode variable to indicate SPMD mode.
+  GlobalVariable *ExecMode = KernelFn.getParent()->getGlobalVariable(
+      (KernelFn.getName() + "_exec_mode").str());
+  assert(ExecMode &&
+         "Assumed to find an execution mode hint among the globals");
+  assert(ExecMode->getInitializer()->isOneValue() &&
+         "Assumed target_region execution mode prior to 'SPMD'-zation");
+  ExecMode->setInitializer(
+      Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+  // Bookkeeping
+  NumKernelsConvertedToSPMD++;
+  NumParallelCallsConvertedToSPMD +=
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size();
+
+  // The kernel was modified, report that back to the caller.
+  return true;
+}
+
+bool KernelTy::createCustomStateMachine() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  // Respect the flag that allows disabling custom state machines.
+  if (!BuildCustomStateMachines)
+    return false;
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+  // TODO: Use reachability to eliminate the loop and if-cascade.
+  //
+  // The user module code looks as follows if this function returns true.
+  //
+  //   ThreadKind = __kmpc_target_region_kernel_init(...)
+  //   if (ThreadKind == -1) {         // actual worker thread
+  //     do {
+  //       __kmpc_barrier_simple_spmd(...);
+  //       void *WorkFn;
+  //       bool IsActive = __kmpc_kernel_parallel(&WorkFn, ...);
+  //       if (!WorkFn)
+  //         goto exit;
+  //       if (IsActive) {
+  //         char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+  //         char *PrivateVars =
+  //             __kmpc_target_region_kernel_get_private_memory();
+  //
+  //         ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+  //
+  //         __kmpc_kernel_end_parallel();
+  //       }
+  //       __kmpc_barrier_simple_spmd(...);
+  //     } while (true);
+  //   } else if (ThreadKind == 0) {   // surplus worker thread
+  //     goto exit;
+  //   } else {                        // team master thread
+  //     goto user_code;
+  //   }
+
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].empty()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because there "
+                         "are no known parallel regions in the kernel.\n");
+    // TODO: If we also know there are no hidden parallel calls we can
+    //       terminate all but the master thread right away.
+    NumKernelsNonSPMDNoParallelism++;
+    return false;
+  }
+
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  CallInst *InitCI = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0];
+
+  // Check if the generic state machine is still requested. If the flag is not
+  // a constant true, a custom state machine was already put in place (or the
+  // request is unknown) and we cannot do anything.
+  auto *UseSM =
+      dyn_cast<ConstantInt>(InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because of "
+                      << *InitCI << "\n");
+    return false;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+
+  // Create local storage for the work function pointer.
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "work_fn.addr",
+                                        &KernelFn.getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+
+  Type *FlagTy = InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE)->getType();
+  Constant *SMFlag = ConstantInt::getFalse(FlagTy);
+  InitCI->setArgOperand(ARG_INIT_USE_STATE_MACHINE, SMFlag);
+
+  // Check the return value of __kmpc_target_region_kernel_init. First compare
+  // it to RET_INIT_IS_WORKER.
+  Instruction *WorkerCnd = new ICmpInst(
+      IP, ICmpInst::ICMP_EQ, InitCI,
+      ConstantInt::getSigned(InitCI->getType(), RET_INIT_IS_WORKER),
+      "is_worker");
+
+  // Create the conditional which is entered by worker threads.
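+  // After the split the CFG looks roughly as follows (block names are
+  // assigned below):
+  //
+  //   entry:          ; contains the init call and %is_worker
+  //     br i1 %is_worker, label %worker.wait, label %master_check
+  //   worker.wait:    ; becomes the worker state machine loop
+  //     br label %master_check  ; rewired into a back edge at the end
+  //   master_check:   ; distinguishes surplus and master threads
+  //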
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Instruction *MasterCheckTI = IP->getParent()->getTerminator();
+  assert(MasterCheckTI->getNumSuccessors() == 2);
+  assert(WaitTI->getNumSuccessors() == 1);
+
+  // Determine the final block, that is the trivial one in which the kernel
+  // ends.
+  BasicBlock *FinalBB = nullptr;
+  if (MasterCheckTI->getSuccessor(0)->size() == 1 &&
+      isa<ReturnInst>(MasterCheckTI->getSuccessor(0)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(0);
+  else if (MasterCheckTI->getSuccessor(1)->size() == 1 &&
+           isa<ReturnInst>(MasterCheckTI->getSuccessor(1)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(1);
+  assert(FinalBB && "Could not determine the final kernel block.");
+
+  // Use the simple barrier to synchronize all threads in the state machine
+  // loop with the team master.
+  Module &M = *KernelFn.getParent();
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn =
+      getOrCreateFn(Type::getInt1Ty(Ctx), VoidPtrTy->getPointerTo(),
+                    Type::getInt16Ty(Ctx), "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = CastInst::CreateZExtOrBitCast(
+      InitCI->getArgOperand(ARG_INIT_REQUIRES_OMP_RUNTIME),
+      Type::getInt16Ty(Ctx), "", WaitTI);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
+  Type *WorkFnPrototype =
+      FunctionType::get(Type::getVoidTy(Ctx), {VoidPtrTy, VoidPtrTy}, false)
+          ->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI, WorkFnPrototype->getPointerTo(), "work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_shared_memory", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+  Function *KernelGetPrivateVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_private_memory", M);
+  Value *PrivateVars = CallInst::Create(KernelGetPrivateVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(Type::getVoidTy(Ctx), "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
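+  // The code below builds an if-cascade over all known parallel work
+  // functions, e.g., for two known regions (wrapper names taken from the
+  // accompanying tests):
+  //
+  //   if (WorkFn == @.omp_TRegion._wrapper)
+  //     .omp_TRegion._wrapper(SharedVars, PrivateVars);
+  //   else if (WorkFn == @.omp_TRegion.1_wrapper)
+  //     .omp_TRegion.1_wrapper(SharedVars, PrivateVars);
+  //   else
+  //     WorkFn(SharedVars, PrivateVars); // emitted only if a fallback is
+  //                                      // required
+  //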
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_target_region_kernel_parallel calls). This could be the case if
+  // there is an unknown function call with side effects in the target region
+  // or inside one of the parallel regions.
+  bool RequiresFallback = !KernelCalls[FID_UNKNOWN].empty() ||
+                          !ParallelRegionCalls[FID_UNKNOWN].empty();
+
+  // Collect all target region parallel calls
+  // (__kmpc_target_region_kernel_parallel).
+  SmallVector<CallInst *, 8> KernelParallelCalls;
+  KernelParallelCalls.append(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+  KernelParallelCalls.append(
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+
+  IP = ExecuteBB->getTerminator();
+
+  // For each parallel call create a conditional that compares the work
+  // function against the known parallel work function of that call, if
+  // available. If the function pointers are equal we call the known work
+  // function directly and continue to the end of the if-cascade.
+  for (CallInst *ParCI : KernelParallelCalls) {
+    Function *ParFn = dyn_cast<Function>(
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION)->stripPointerCasts());
+    if (!ParFn) {
+      LLVM_DEBUG(
+          dbgs() << "Require fallback due to unknown parallel function\n");
+      RequiresFallback = true;
+      continue;
+    }
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars, PrivateVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  // If a fallback is required we emit an indirect call before we jump to the
+  // point where all cases converge.
+  if (RequiresFallback)
+    CallInst::Create(WorkFn, {SharedVars, PrivateVars}, "", IP);
+
+  // Insert a barrier call at the convergence point, right before the back
+  // edge.
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  // Rewire the CFG edges to introduce the back and exit edges of the new loop.
+  // TODO: Add the new loop to LI!
+  FinishedTI->setSuccessor(0, FinalBB);
+  WaitTI->setSuccessor(0, WaitBB);
+
+  // Bookkeeping.
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+
+  return true;
+}
+
+template <typename T>
+static void collectCallersOf(Module &M, StringRef Name,
+                             SmallVectorImpl<T> &Callers) {
+  Function *Callee = M.getFunction(Name);
+
+  // If the callee function is not present, we are done.
+  if (!Callee)
+    return;
+
+  // If it exists we check all users.
+  for (const Use &U : Callee->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses and non-call uses.
+    if (!CS || !CS.isCallee(&U) || !isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    // Found a caller; use it to create an object of type T and put the result
+    // in the Callers vector.
+    Callers.emplace_back(T(CS.getCaller()));
+  }
+}
+
+/// OpenMPOpt - The interprocedural OpenMP optimization pass implementation.
+struct OpenMPOpt {
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+
+    // Collect target region kernels identified by a call to
+    // __kmpc_target_region_kernel_init.
+    collectCallersOf(M, "__kmpc_target_region_kernel_init", TRKernels);
+
+    for (KernelTy &K : TRKernels)
+      Changed |= K.optimize();
+
+    return Changed;
+  }
+
+private:
+  /// A collection of all target region kernels we found.
+  SmallVector<KernelTy, 4> TRKernels;
+};
+
+// TODO: This could be a CGSCC pass as well.
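+//
+// The pass is registered as "openmp-opt" and can be exercised in isolation,
+// e.g.:
+//
+//   opt -openmp-opt -stats -disable-output input.ll
+//
+// (see llvm/test/Transforms/OpenMP/no_SPMD_mode.ll).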
+struct OpenMPOptLegacy : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt OMPOpt;
+
+  OpenMPOptLegacy() : ModulePass(ID) {
+    initializeOpenMPOptLegacyPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+  bool runOnModule(Module &M) override { return OMPOpt.runOnModule(M); }
+};
+
+// TODO: Add a new PM entry point.
+
+} // namespace
+
+char OpenMPOptLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacy, "openmp-opt",
+                      "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_END(OpenMPOptLegacy, "openmp-opt",
+                    "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacy(); }
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -517,6 +517,10 @@
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
 
+  // Try to perform OpenMP specific optimizations. This is a no-op if OpenMP
+  // runtime calls are not present in the module.
+  MPM.add(createOpenMPOptLegacyPass());
+
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT: Called Value Propagation
diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Call-site splitting
diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT: Called Value Propagation
diff --git a/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
@@ -0,0 +1,1064 @@
+; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s
+;
+; REQUIRES: asserts
+;
+; Check that we will not execute any of the below target regions in SPMD mode.
+; TODO: SPMD mode is valid for target regions 2 and 3 if proper guarding code is inserted.
+;
+; See the to_SPMD_mode.ll file for almost the same functions that can be translated to SPMD mode.
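+;
+; Kernel overview (derived from the statistics and checks below):
+;  - loop_in_loop_in_tregion: no parallel region; accounts for the
+;    "without parallelism" statistic.
+;  - parallel_loops_and_accesses_in_tregion and
+;    parallel_loop_in_function_in_loop_with_global_acc_in_tregion: custom
+;    state machines without fallback.
+;  - parallel_loops_in_functions_and_extern_func_in_tregion: custom state
+;    machine with a fallback because of the call to extern_func.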
+;
+; STATS-DAG: 1 openmp-opt - Number of GPU kernels in non-SPMD mode without parallelism
+; STATS-DAG: 3 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines created
+; STATS-DAG: 2 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines without fallback
+;
+; No state machine needed because there is no parallel region.
+; CHECK: void @{{.*}}loop_in_loop_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 true
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void loop_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[i + j];
+;   }
+; }
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known.
+; CHECK: void @{{.*}}parallel_loops_and_accesses_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call.
+; CHECK: worker.check.next4:
+; CHECK-NEXT: br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loops_and_accesses_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[0 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[1 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[2 + j];
+;
+;     // This needs a guard in SPMD mode.
+;     A[0] = B[0];
+;   }
+; }
+;
+; void extern_func();
+; static void parallel_loop(int *A, int *B, int i) {
+; #pragma omp parallel for
+;   for (int j = 0; j < 1024; j++)
+;     A[j] += B[i + j];
+; }
+;
+; int Global[512];
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known.
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_with_global_acc_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call.
+; CHECK: worker.check.next:
+; CHECK-NEXT: br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     parallel_loop(A, B, i);
+;     Global[i]++;
+;   }
+; }
+;
+; Custom state machine needed with fallback because "extern_func" might contain parallel regions.
+; CHECK: void @{{.*}}parallel_loops_in_functions_and_extern_func_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false +; The "check.next" block should contain a fallback call +; CHECK: worker.check.next: +; CHECK-NEXT: call void %work_fn( +; CHECK-NEXT: br label %worker.parallel_end +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; parallel_loop(A, B, 1); +; parallel_loop(A, B, 2); +; extern_func(); +; parallel_loop(A, B, 3); +; } +; } + +source_filename = "../llvm/test/Transforms/OpenMP/no_SPMD_mode.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.private.struct = type { i32**, i32** } +%omp.private.struct.0 = type { i32**, i32** } +%omp.private.struct.1 = type { i32**, i32** } +%omp.private.struct.2 = type { i32**, i32**, i32* } + +@__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc8, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 2, i32* %cleanup.dest.slot, align 4 + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #4 + br label %for.end10 + +for.body: ; preds = %for.cond + %5 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* 
%5) #4 + store i32 0, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %6 = load i32, i32* %j, align 4, !tbaa !15 + %cmp2 = icmp slt i32 %6, 1024 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond1 + store i32 5, i32* %cleanup.dest.slot, align 4 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #4 + br label %for.end + +for.body4: ; preds = %for.cond1 + %8 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %9 = load i32, i32* %i, align 4, !tbaa !15 + %10 = load i32, i32* %j, align 4, !tbaa !15 + %add = add nsw i32 %9, %10 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %11 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %12 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %13 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom5 = sext i32 %13 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %12, i64 %idxprom5 + %14 = load i32, i32* %arrayidx6, align 4, !tbaa !15 + %add7 = add nsw i32 %14, %11 + store i32 %add7, i32* %arrayidx6, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body4 + %15 = load i32, i32* %j, align 4, !tbaa !15 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.end: ; preds = %for.cond.cleanup3 + br label %for.inc8 + +for.inc8: ; preds = %for.end + %16 = load i32, i32* %i, align 4, !tbaa !15 + %inc9 = add nsw i32 %16, 1 + store i32 %inc9, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end10: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end10 + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct, align 8 + %.private.vars1 = alloca %omp.private.struct.0, align 8 + %.private.vars2 = alloca %omp.private.struct.1, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.0* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds 
%omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.1* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + %11 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 0 + %12 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %13 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %arrayidx3 = getelementptr inbounds i32, i32* %13, i64 0 + store i32 %12, i32* %arrayidx3, align 4, !tbaa !15 + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: 
; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, 
i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; 
preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void 
@.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + 
%13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35(i32* %A, i32* %B, [512 x i32]* dereferenceable(2048) %Global) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %Global.addr = alloca [512 x i32]*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store [512 x i32]* %Global, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %0 = load [512 x i32]*, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %2 = icmp eq i8 %1, 1 + br i1 %2, label %.execute, label %.exit + +.execute: ; preds = %entry + %3 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %4 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %4, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %5 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %5) #4 + br label %for.end + +for.body: ; preds = %for.cond + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %8 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop(i32* %6, i32* %7, i32 %8) + %9 = load i32, i32* %i, align 4, !tbaa !15 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds [512 x i32], [512 x i32]* %0, i64 0, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %arrayidx, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4, !tbaa !15 + %inc1 = add nsw i32 %11, 1 + store i32 %inc1, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.2* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 0 + store i32** 
%A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %3 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %2, i32* %3, i32 1) + %4 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %5 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %4, i32* %5, i32 2) + call void bitcast (void (...)* @extern_func to void ()*)() + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %6, i32* %7, i32 3) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare void @extern_func(...) #3 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = 
bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #4 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void 
@__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #4 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #4 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_and_accesses_in_tregion", i32 9, i32 1} +!1 = !{i32 0, i32 24, i32 43713508, 
!"loop_in_loop_in_tregion", i32 2, i32 0} +!2 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_in_functions_and_extern_func_in_tregion", i32 43, i32 3} +!3 = !{i32 0, i32 24, i32 43713508, !"parallel_loop_in_function_in_loop_with_global_acc_in_tregion", i32 35, i32 2} +!4 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2, !"kernel", i32 1} +!5 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9, !"kernel", i32 1} +!6 = !{void (i32*, i32*, [512 x i32]*)* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35, !"kernel", i32 1} +!7 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43, !"kernel", i32 1} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 7, !"PIC Level", i32 2} +!10 = !{!"clang version 9.0.0 "} +!11 = !{!12, !12, i64 0} +!12 = !{!"any pointer", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !13, i64 0} +!17 = !{!18} +!18 = !{i64 2, i64 3, i64 5, i1 false} diff --git a/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll @@ -0,0 +1,1208 @@ +; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s +; +; REQUIRES: asserts +; +; Check that we will not execute any of the below target regions in SPMD-mode. +; TODO: SPMD-mode is valid for target region 2 and 3 if proper guarding code is inserted. +; +; See the no_SPMD_mode.ll file for almost the same functions that cannot be translated to SPMD mode. +; +; +; STATS-DAG: 4 openmp-opt - Number of GPU kernels converted to SPMD mode +; STATS-DAG: 6 openmp-opt - Number of parallel GPU kernel regions converted to SPMD mode +; +; Check for SPMD mode. +; CHECK: void @{{.*}}parallel_loop_in_loop_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; +; void parallel_loop_in_loop_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; } +; +; +; Check for SPMD mode. 
+; CHECK: void @{{.*}}parallel_loops_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_tregion(int *A, int *B) {
+; #pragma omp target
+; {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[0 + j];
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[1 + j];
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[2 + j];
+; }
+; }
+;
+; FIXME: For now we duplicate the parallel_loop function below so that each
+; copy has uses in only one kernel. Since all kernels in this file can be
+; translated to SPMD mode, this duplication is not strictly necessary. In the
+; future we should handle shared functions and clone them automatically (or
+; add a new argument) to facilitate partial SPMD-mode execution.
+;
+; static void parallel_loop1(int *A, int *B, int i) {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[i + j];
+; }
+; static void parallel_loop2(int *A, int *B, int i) {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[i + j];
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @parallel_loop1(
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+; CHECK: define internal void @parallel_loop1
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+;
+; void parallel_loop_in_function_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+; for (int i = 0; i < 512; i++)
+; parallel_loop1(A, B, i);
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: define internal void @parallel_loop2
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: void @{{.*}}parallel_loops_in_functions_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @parallel_loop2(
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_functions_in_tregion(int *A, int *B) {
+; #pragma omp target
+; {
+; parallel_loop2(A, B, 1);
+; parallel_loop2(A, B, 2);
+; parallel_loop2(A, B, 3);
+; }
+; }
+
+source_filename = "../llvm/test/Transforms/OpenMP/to_SPMD_mode.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.private.struct = type { i32**, i32**, i32* }
+%omp.private.struct.0 = type { i32**, i32** }
+%omp.private.struct.1 = type { i32**, i32** }
+%omp.private.struct.2 = type { i32**, i32** }
+%omp.private.struct.3 = type { i32**, i32**, i32* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode = weak constant i8 1
+@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %.execute
+  %3 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %3, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %5 = bitcast %omp.private.struct* %.private.vars to i8*
+  %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0
+ store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + %8 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 2 + store i32* %i, i32** %8 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %5, i16 24, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %9 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa 
!15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, 
i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct.0, align 8 + %.private.vars1 = alloca %omp.private.struct.1, align 8 + %.private.vars2 = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct.0* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.1* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.2* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call 
void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, 
label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3 + br label %for.end + +for.body: ; preds = %for.cond + %5 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %6 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %7 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop1(i32* %5, i32* %6, i32 %7) + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop1(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.3, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.3* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} +; Function Attrs: nounwind +define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) 
+; Function Attrs: nounwind
+define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) #2 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i.addr = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct.3, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store i32 %i, i32* %i.addr, align 4, !tbaa !15
+  %0 = bitcast %omp.private.struct.3* %.private.vars to i8*
+  %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %1
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %2
+  %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2
+  store i32* %i.addr, i32** %3
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %3 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %2, i32* %3, i32 1)
+  %4 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %5 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %4, i32* %5, i32 2)
+  %6 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %7 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %6, i32* %7, i32 3)
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
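+; Outlined body of the parallel regions above: a statically scheduled
+; worksharing loop (__kmpc_for_static_init_4 / __kmpc_for_static_fini) over
+; 1024 iterations that computes A[j] += B[i + j].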
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %i.addr = alloca i32*, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  store i32* %i, i32** %i.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = load i32*, i32** %i.addr, align 8, !tbaa !11
+  %3 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  %4 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %7 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %8 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3
+  %9 = load i32*, i32** %.global_tid..addr, align 8
+  %10 = load i32, i32* %9, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %11, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %13, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %14, %15
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %16, %17
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %18, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %19 = load i32*, i32** %1, align 8, !tbaa !11
+  %20 = load i32, i32* %2, align 4, !tbaa !15
+  %21 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 %20, %21
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom
+  %22 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %23 = load i32*, i32** %0, align 8, !tbaa !11
+  %24 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %24 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4
+  %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %25, %22
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %26, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %27, %28
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %29, %30
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10)
+  %31 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  %35 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3
+  %36 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3
+  ret void
+}
+
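+; Companion wrapper for @.omp_TRegion.4: recovers %A, %B, and %i from the
+; packed private-variable struct before calling the outlined function.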
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.3*
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  %6 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 2
+  %7 = load i32*, i32** %6, align 1
+  call void @.omp_TRegion.4(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
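+; Module metadata: offload-entry descriptors (!0-!3), NVVM kernel annotations
+; for the four target regions (!4-!7), and the TBAA nodes (!11-!16) referenced
+; throughout the functions above.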
+!omp_offload.info = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_loop_in_tregion", i32 2, i32 0}
+!1 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_functions_in_tregion", i32 36, i32 3}
+!2 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_tregion", i32 10, i32 1}
+!3 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_function_in_loop_in_tregion", i32 30, i32 2}
+!4 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2, !"kernel", i32 1}
+!5 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10, !"kernel", i32 1}
+!6 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30, !"kernel", i32 1}
+!7 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36, !"kernel", i32 1}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 9.0.0 "}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !13, i64 0}
+!17 = !{!18}
+!18 = !{i64 2, i64 3, i64 5, i1 false}