diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Assumptions.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -67,6 +68,9 @@
           "Number of OpenMP runtime function uses identified");
 STATISTIC(NumOpenMPTargetRegionKernels,
           "Number of OpenMP target region entry points (=kernels) identified");
+STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachine,
+          "Number of OpenMP target region entry points (=kernels) executed in "
+          "generic-mode with customized state machines");
 STATISTIC(
     NumOpenMPParallelRegionsReplacedInGPUStateMachine,
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
@@ -243,6 +247,11 @@
     /// Map from functions to all uses of this runtime function contained in
     /// them.
     DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
+
+  public:
+    /// Iterators for the uses of this runtime function.
+    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
+    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
   };
 
   /// An OpenMP-IR-Builder instance
@@ -253,6 +262,9 @@
                   RuntimeFunction::OMPRTL___last>
       RFIs;
 
+  /// Map from function declarations/definitions to their runtime enum type.
+  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
+
   /// Map from ICV kind to the ICV description.
   EnumeratedArray<InternalControlVarInfo, InternalControlVar,
@@ -395,6 +407,7 @@
     SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                          \
     Function *F = M.getFunction(_Name);                                       \
     if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {          \
+      RuntimeFunctionIDMap[F] = _Enum;                                        \
       auto &RFI = RFIs[_Enum];                                                \
       RFI.Kind = _Enum;                                                       \
       RFI.Name = _Name;                                                       \
@@ -423,6 +436,458 @@
   SmallPtrSetImpl<Kernel> &Kernels;
 };
 
+struct KernelInfoState : AbstractState {
+  /// Flag to track if we reached a fixpoint.
+  bool IsAtFixpoint = false;
+
+  /// The parallel regions (identified by the outlined parallel functions) that
+  /// can be reached from the associated function.
+  SmallSetVector<Function *, 4> ParallelRegions;
+
+  /// The __kmpc_target_init call in this kernel, if any. If we find more than
+  /// one we abort as the kernel is malformed.
+  CallBase *KernelInitCB = nullptr;
+
+  /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
+  /// one we abort as the kernel is malformed.
+  CallBase *KernelDeinitCB = nullptr;
+
+  /// Flag to indicate that we may reach a parallel region that is not tracked
+  /// in the ParallelRegions set above.
+  bool MayReachUnknownParallelRegion = false;
+
+  /// Abstract State interface
+  ///{
+
+  KernelInfoState() {}
+  KernelInfoState(bool BestState) {
+    if (!BestState)
+      indicatePessimisticFixpoint();
+  }
+
+  /// See AbstractState::isValidState(...)
+  bool isValidState() const override { return true; }
+
+  /// See AbstractState::isAtFixpoint(...)
+  bool isAtFixpoint() const override { return IsAtFixpoint; }
+
+  /// See AbstractState::indicatePessimisticFixpoint(...)
+  ChangeStatus indicatePessimisticFixpoint() override {
+    IsAtFixpoint = true;
+    MayReachUnknownParallelRegion = true;
+    return ChangeStatus::CHANGED;
+  }
+
+  /// See AbstractState::indicateOptimisticFixpoint(...)
+  ChangeStatus indicateOptimisticFixpoint() override {
+    IsAtFixpoint = true;
+    return ChangeStatus::UNCHANGED;
+  }
+
+  /// Return the assumed state
+  KernelInfoState &getAssumed() { return *this; }
+  const KernelInfoState &getAssumed() const { return *this; }
+
+  bool operator==(const KernelInfoState &RHS) const {
+    if ((MayReachUnknownParallelRegion != RHS.MayReachUnknownParallelRegion))
+      return false;
+    return ParallelRegions.size() == RHS.ParallelRegions.size();
+  }
+
+  /// Return empty set as the best state of potential values.
+  static KernelInfoState getBestState() { return KernelInfoState(true); }
+
+  static KernelInfoState getBestState(KernelInfoState &KIS) {
+    return getBestState();
+  }
+
+  /// Return full set as the worst state of potential values.
+  static KernelInfoState getWorstState() { return KernelInfoState(false); }
+
+  /// "Clamp" this state with \p KIS.
+  KernelInfoState operator^=(const KernelInfoState &KIS) {
+    // We should never merge in the state of a kernel function, that is one
+    // which has a __kmpc_target_init or deinit.
+    if (KIS.KernelInitCB || KIS.KernelDeinitCB)
+      indicatePessimisticFixpoint();
+    MayReachUnknownParallelRegion |= KIS.MayReachUnknownParallelRegion;
+    ParallelRegions.insert(KIS.ParallelRegions.begin(),
+                           KIS.ParallelRegions.end());
+    return *this;
+  }
+
+  KernelInfoState operator&=(const KernelInfoState &KIS) {
+    return (*this ^= KIS);
+  }
+
+  ///}
+};
+
+struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
+  using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
+  AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
+  /// finished now.
+  ChangeStatus manifest(Attributor &A) override {
+    // This is a somewhat unrelated modification. We basically flatten the
+    // function that was reached from a kernel completely by asking the inliner
+    // to inline everything it can. This should live in a separate AA though as
+    // it should also run on parallel regions and other GPU functions.
+    auto CheckCallInst = [&](Instruction &I) {
+      auto &CB = cast<CallBase>(I);
+      CB.addAttribute(AttributeList::FunctionIndex, Attribute::AlwaysInline);
+      return true;
+    };
+    A.checkForAllCallLikeInstructions(CheckCallInst, *this);
+
+    // If we are not looking at a kernel with __kmpc_target_init and
+    // __kmpc_target_deinit call we cannot actually manifest the information.
+    if (!KernelInitCB || !KernelDeinitCB)
+      return ChangeStatus::UNCHANGED;
+
+    const int InitIsSPMDArgNo = 1;
+    const int InitUseStateMachineArgNo = 2;
+
+    // Check if the current configuration is non-SPMD and generic state machine.
+    // If we already have SPMD mode or a custom state machine we do not need to
+    // go any further. If it is anything but a constant something is weird and
+    // we give up.
+    ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
+        KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
+    ConstantInt *IsSPMD =
+        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
+
+    // If we are stuck with generic mode, try to create a custom device (=GPU)
+    // state machine which is specialized for the parallel regions that are
+    // reachable by the kernel.
+    if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD ||
+        !IsSPMD->isZero()) {
+      return ChangeStatus::UNCHANGED;
+    }
+
+    // First, indicate we use a custom state machine now.
+    auto &Ctx = getAnchorValue().getContext();
+    auto *FalseVal = ConstantInt::getBool(Ctx, 0);
+    A.changeUseAfterManifest(
+        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
+
+    // Keep track in the statistics of our new shiny custom state machine.
+    ++NumOpenMPTargetRegionKernelsCustomStateMachine;
+
+    // If we don't actually need a state machine we are done here. This can
+    // happen if there simply are no parallel regions. In the resulting kernel
+    // all worker threads will simply exit right away, leaving the main thread
+    // to do the work alone.
+    if (!MayReachUnknownParallelRegion && ParallelRegions.empty())
+      return ChangeStatus::CHANGED;
+
+    // Create all the blocks:
+    //
+    //                       InitCB = __kmpc_target_init(...)
+    //                       bool IsWorker = InitCB >= 0;
+    //                       if (IsWorker) {
+    // SMBeginBB:              __kmpc_barrier_simple_spmd(...);
+    //                         void *WorkFn;
+    //                         bool Active = __kmpc_kernel_parallel(&WorkFn);
+    //                         if (!WorkFn) return;
+    // SMIsActiveCheckBB:      if (Active) {
+    // SMIfCascadeCurrentBB:     if (WorkFn == <ParFn0>)
+    //                             ParFn0(...);
+    // SMIfCascadeCurrentBB:     else if (WorkFn == <ParFn1>)
+    //                             ParFn1(...);
+    //                           ...
+    // SMIfCascadeCurrentBB:     else
+    //                             ((WorkFnTy*)WorkFn)(...);
+    // SMEndParallelBB:          __kmpc_kernel_end_parallel(...);
+    //                         }
+    // SMDoneBB:               __kmpc_barrier_simple_spmd(...);
+    //                         goto SMBeginBB;
+    //                       }
+    // UserCodeEntryBB:      // user code
+    //                       __kmpc_target_deinit(...)
+    //
+    Function *Kernel = getAssociatedFunction();
+    BasicBlock *InitBB = KernelInitCB->getParent();
+    BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
+        KernelInitCB->getNextNode(), "thread.user_code.check");
+    BasicBlock *StateMachineBeginBB = BasicBlock::Create(
+        Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
+    BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
+        Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
+    BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
+        Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
+    BasicBlock *StateMachineIfCascadeCurrentBB =
+        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
+                           Kernel, UserCodeEntryBB);
+    BasicBlock *StateMachineEndParallelBB =
+        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
+                           Kernel, UserCodeEntryBB);
+    BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
+        Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
+
+    ReturnInst::Create(Ctx, StateMachineFinishedBB);
+
+    InitBB->getTerminator()->eraseFromParent();
+    Value *IsWorker =
+        ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
+                         ConstantInt::get(KernelInitCB->getType(), -1),
+                         "thread.is_worker", InitBB);
+    BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);
+
+    // Create local storage for the work function pointer.
+    Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+    AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr",
+                                          &Kernel->getEntryBlock().front());
+
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    OMPInfoCache.OMPBuilder.updateToLocation(IRBuilder<>::InsertPoint(
+        StateMachineBeginBB, StateMachineBeginBB->end()));
+
+    Value *Ident = KernelInitCB->getArgOperand(0);
+    Value *GTid = KernelInitCB;
+
+    Module &M = *Kernel->getParent();
+    FunctionCallee BarrierFn =
+        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+            M, OMPRTL___kmpc_barrier_simple_spmd);
+    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);
+
+    FunctionCallee KernelParallelFn =
+        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+            M, OMPRTL___kmpc_kernel_parallel);
+    Value *IsActiveWorker = CallInst::Create(
+        KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
+    Value *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
+                                 StateMachineBeginBB);
+
+    FunctionType *ParallelRegionFnTy = FunctionType::get(
+        Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
+        false);
+    Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+        WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
+        StateMachineBeginBB);
+
+    Value *IsDone = ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ,
+                                     WorkFn, Constant::getNullValue(VoidPtrTy),
+                                     "worker.is_done", StateMachineBeginBB);
+    BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
+                       IsDone, StateMachineBeginBB);
+
+    BranchInst::Create(StateMachineIfCascadeCurrentBB,
+                       StateMachineDoneBarrierBB, IsActiveWorker,
+                       StateMachineIsActiveCheckBB);
+
+    Value *ZeroArg =
+        Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
+
+    // Now that we have most of the CFG skeleton it is time for the if-cascade
+    // that checks the function pointer we got from the runtime against the
+    // parallel regions we expect, if there are any.
+    for (int i = 0, e = ParallelRegions.size(); i < e; ++i) {
+      auto *ParallelRegion = ParallelRegions[i];
+      BasicBlock *PRExecuteBB = BasicBlock::Create(
+          Ctx, "worker_state_machine.parallel_region.execute", Kernel,
+          StateMachineEndParallelBB);
+      CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB);
+      BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB);
+
+      BasicBlock *PRNextBB =
+          BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
+                             Kernel, StateMachineEndParallelBB);
+
+      // Check if we need to compare the pointer at all or if we can just
+      // call the parallel region function.
+      Value *IsPR;
+      if (i + 1 < e || MayReachUnknownParallelRegion)
+        IsPR = ICmpInst::Create(
+            ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
+            "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
+      else
+        IsPR = ConstantInt::getTrue(Ctx);
+
+      BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
+                         StateMachineIfCascadeCurrentBB);
+      StateMachineIfCascadeCurrentBB = PRNextBB;
+    }
+
+    // At the end of the if-cascade we place the indirect function pointer call
+    // in case we might need it, that is if there can be parallel regions we
+    // have not handled in the if-cascade above.
+    if (MayReachUnknownParallelRegion) {
+      StateMachineIfCascadeCurrentBB->setName(
+          "worker_state_machine.parallel_region.fallback.execute");
+      CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
+                       StateMachineIfCascadeCurrentBB);
+    }
+    BranchInst::Create(StateMachineEndParallelBB,
+                       StateMachineIfCascadeCurrentBB);
+
+    CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+                         M, OMPRTL___kmpc_kernel_end_parallel),
+                     {}, "", StateMachineEndParallelBB);
+    BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB);
+
+    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB);
+    BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB);
+
+    return ChangeStatus::CHANGED;
+  }
+
+  /// Statistics are tracked as part of manifest for now.
+  void trackStatistics() const override {}
+
+  /// See AbstractAttribute::getAsStr()
+  const std::string getAsStr() const override {
+    return std::string("#PR: ") + std::to_string(ParallelRegions.size()) +
+           (MayReachUnknownParallelRegion ? " + Unknown PR" : "");
+  }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  const std::string getName() const override { return "AAKernelInfo"; }
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is AAKernelInfo
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  static const char ID;
+};
+
+/// The function kernel info abstract attribute, basically, what can we say
+/// about a function with regards to the KernelInfoState.
+struct AAKernelInfoFunction : AAKernelInfo {
+  AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
+      : AAKernelInfo(IRP, A) {}
+
+  /// Fixpoint iteration update function. Will be called every time a
+  /// dependence changes its state (and in the beginning).
+  ChangeStatus updateImpl(Attributor &A) override {
+    KernelInfoState StateBefore = getState();
+
+    // Callback to check a call instruction.
+    auto CheckCallInst = [&](Instruction &I) {
+      auto &CB = cast<CallBase>(I);
+      // First weed out calls we do not care about, that is readonly/readnone
+      // calls, intrinsics, and "no_openmp" calls. None of these can reach a
+      // parallel region or anything else we are looking for.
+      if (!CB.mayWriteToMemory())
+        return true;
+      if (isa<IntrinsicInst>(CB))
+        return true;
+      Function *Callee = CB.getCalledFunction();
+      if (Callee &&
+          hasAssumption(*Callee, KnownAssumptionString("omp_no_openmp")))
+        return true;
+
+      // Next we check if we know the callee. If it is a known OpenMP function
+      // we will handle it explicitly in the switch below. If it is not, we
+      // will use an AAKernelInfo object on the callee to gather information
+      // and merge that into the current state. If we cannot gather
+      // information, that is, the state of the AAKernelInfo we got is invalid,
+      // we don't have to completely give up here. It basically means we have
+      // no idea what the effects of the call might be; for now the worst that
+      // can happen is that unknown parallel regions hide in the callee.
+      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+      const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
+      if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
+        if (Callee && A.isFunctionIPOAmendable(*Callee)) {
+          auto &CBAA = A.getAAFor<AAKernelInfo>(
+              *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
+          if (CBAA.getState().isValidState()) {
+            getState() ^= CBAA.getState();
+            return true;
+          }
+        }
+        // The callee is not known, not ipo-amendable (e.g., due to linkage), or
+        // we can for some other reason not analyze it -> worst case.
+        MayReachUnknownParallelRegion = true;
+        return true;
+      }
+
+      const unsigned int WrapperFunctionArgNo = 6;
+      RuntimeFunction RF = It->getSecond();
+      switch (RF) {
+      case OMPRTL___kmpc_target_init:
+        // The only reason we completely give up is when we see a second init or
+        // deinit call.
+        if (KernelInitCB && KernelInitCB != &CB)
+          return false;
+        KernelInitCB = &CB;
+        return true;
+      case OMPRTL___kmpc_target_deinit:
+        // The only reason we completely give up is when we see a second init or
+        // deinit call.
+        if (KernelDeinitCB && KernelDeinitCB != &CB)
+          return false;
+        KernelDeinitCB = &CB;
+        return true;
+      case OMPRTL___kmpc_parallel_51:
+        if (auto *ParallelRegion = dyn_cast<Function>(
+                CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
+          ParallelRegions.insert(ParallelRegion);
+          return true;
+        }
+        // The condition above should usually get the parallel region function
+        // pointer and record it. In the off chance it doesn't we assume the
+        // worst.
+        MayReachUnknownParallelRegion = true;
+        return true;
+      case OMPRTL___kmpc_omp_task:
+        // We do not look into tasks right now, just give up.
+        MayReachUnknownParallelRegion = true;
+        return true;
+      default:
+        break;
+      }
+      // All other OpenMP runtime calls will not reach parallel regions so they
+      // can be safely ignored for now.
+      return true;
+    };
+    if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this))
+      return indicatePessimisticFixpoint();
+
+    return StateBefore == getState() ? ChangeStatus::UNCHANGED
+                                     : ChangeStatus::CHANGED;
+  }
+};
+
+/// The call site kernel info abstract attribute, basically, what can we say
+/// about a call site with regards to the KernelInfoState. For now this simply
+/// forwards the information from the callee.
+struct AAKernelInfoCallSite : AAKernelInfo {
+  AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
+      : AAKernelInfo(IRP, A) {}
+
+  /// See AbstractAttribute::initialize(...).
+  void initialize(Attributor &A) override {
+    AAKernelInfo::initialize(A);
+    Function *F = getAssociatedFunction();
+    if (!F || F->isDeclaration())
+      indicatePessimisticFixpoint();
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    // TODO: Once we have call site specific value information we can provide
+    //       call site specific liveness information and then it makes
+    //       sense to specialize attributes for call sites arguments instead of
+    //       redirecting requests to the callee argument.
+    Function *F = getAssociatedFunction();
+    const IRPosition &FnPos = IRPosition::function(*F);
+    auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
+    if (getState() == FnAA.getState())
+      return ChangeStatus::UNCHANGED;
+    getState() = FnAA.getState();
+    return ChangeStatus::CHANGED;
+  }
+};
+
 /// Used to map the values physically (in the IR) stored in an offload
 /// array, to a vector in memory.
struct OffloadArray { @@ -537,7 +1002,7 @@ << OMPInfoCache.ModuleSlice.size() << " functions\n"); if (IsModulePass) { - Changed |= runAttributor(); + Changed |= runAttributor(IsModulePass); if (remarksEnabled()) analysisGlobalization(); @@ -547,14 +1012,14 @@ if (PrintOpenMPKernels) printKernels(); - Changed |= rewriteDeviceCodeStateMachine(); - - Changed |= runAttributor(); + Changed |= runAttributor(IsModulePass); // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); Changed |= deleteParallelRegions(); + Changed |= rewriteDeviceCodeStateMachine(); + if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); Changed |= deduplicateRuntimeCalls(); @@ -1599,11 +2064,11 @@ Attributor &A; /// Helper function to run Attributor on SCC. - bool runAttributor() { + bool runAttributor(bool IsModulePass) { if (SCC.empty()) return false; - registerAAs(); + registerAAs(IsModulePass); ChangeStatus Changed = A.run(); @@ -1615,7 +2080,7 @@ /// Populate the Attributor with abstract attribute opportunities in the /// function. - void registerAAs() { + void registerAAs(bool IsModulePass) { if (SCC.empty()) return; @@ -1644,6 +2109,10 @@ if (!F.isDeclaration()) A.getOrCreateAAFor(IRPosition::function(F)); } + if (IsModulePass) { + for (Function *Kernel : OMPInfoCache.Kernels) + A.getOrCreateAAFor(IRPosition::function(*Kernel)); + } } }; @@ -1780,7 +2249,7 @@ // TODO: Checking the number of uses is not a necessary restriction and // should be lifted. if (UnknownUse || NumDirectCalls != 1 || - ToBeReplacedStateMachineUses.size() != 2) { + ToBeReplacedStateMachineUses.size() > 2) { { auto Remark = [&](OptimizationRemark OR) { return OR << "Parallel region is used in " @@ -2411,6 +2880,7 @@ } // namespace const char AAICVTracker::ID = 0; +const char AAKernelInfo::ID = 0; const char AAExecutionDomain::ID = 0; AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP, @@ -2460,6 +2930,28 @@ return *AA; } +AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP, + Attributor &A) { + AAKernelInfo *AA = nullptr; + switch (IRP.getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_RETURNED: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + llvm_unreachable("KernelInfo can only be created for function position!"); + case IRPosition::IRP_CALL_SITE: + AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A); + break; + case IRPosition::IRP_FUNCTION: + AA = new (A.Allocator) AAKernelInfoFunction(IRP, A); + break; + } + + return *AA; +} + PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { if (!containsOpenMP(M, OMPInModule)) return PreservedAnalyses::all(); @@ -2492,7 +2984,7 @@ OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, OMPInModule.getKernels()); - Attributor A(Functions, InfoCache, CGUpdater); + Attributor A(Functions, InfoCache, CGUpdater, nullptr, false); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); @@ -2547,7 +3039,7 @@ OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, /*CGSCC*/ Functions, OMPInModule.getKernels()); - Attributor A(Functions, InfoCache, CGUpdater); + Attributor A(Functions, InfoCache, CGUpdater, nullptr, false); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(false); @@ -2622,7 +3114,7 @@ *(Functions.back()->getParent()), AG, Allocator, /*CGSCC*/ Functions, 
OMPInModule.getKernels()); - Attributor A(Functions, InfoCache, CGUpdater); + Attributor A(Functions, InfoCache, CGUpdater, nullptr, false); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); return OMPOpt.run(false); diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -0,0 +1,1747 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void p0(void); +;; void p1(void); +;; void unknown(void); +;; void unknown_pure(void) __attribute__((pure)); +;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp"))); +;; +;; int G; +;; void no_parallel_region_in_here(void) { +;; #pragma omp single +;; G = 0; +;; } +;; +;; void no_state_machine_needed() { +;; #pragma omp target teams +;; no_parallel_region_in_here(); +;; } +;; +;; void simple_state_machine() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; no_parallel_region_in_here(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_interprocedural_after(void); +;; void simple_state_machine_interprocedural_before(void) { +;; #pragma omp parallel +;; { p0(); } +;; } +;; void simple_state_machine_interprocedural() { +;; #pragma omp target teams +;; { +;; simple_state_machine_interprocedural_before(); +;; no_parallel_region_in_here(); +;; #pragma omp parallel +;; { p1(); } +;; simple_state_machine_interprocedural_after(); +;; } +;; } +;; void simple_state_machine_interprocedural_after(void) { +;; #pragma omp parallel +;; { p0(); } +;; } +;; +;; void simple_state_machine_with_fallback() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_no_openmp_attr() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown_no_openmp(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_pure() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown_pure(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; int omp_get_thread_num(); +;; void simple_state_machine_interprocedural_nested_recursive_after(int); +;; void simple_state_machine_interprocedural_nested_recursive_after_after(void); +;; void simple_state_machine_interprocedural_nested_recursive() { +;; #pragma omp target teams +;; { +;; simple_state_machine_interprocedural_nested_recursive_after( +;; omp_get_thread_num()); +;; } +;; } +;; +;; void simple_state_machine_interprocedural_nested_recursive_after(int a) { +;; if (a == 0) +;; return; +;; simple_state_machine_interprocedural_nested_recursive_after(a - 1); +;; simple_state_machine_interprocedural_nested_recursive_after_after(); +;; } +;; void simple_state_machine_interprocedural_nested_recursive_after_after(void) { +;; #pragma omp parallel +;; { p0(); } +;; } +;; +;; __attribute__((weak)) void weak_callee_empty(void) {} +;; void no_state_machine_weak_callee() { +;; #pragma omp target teams +;; { weak_callee_empty(); } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* undef +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = 
private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode = weak constant i8 1 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@G = external global i32, align 4 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@llvm.compiler.used = appending global [8 x i8*] [i8* @__omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode, i8* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode], section "llvm.metadata" + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; No user code state machine is build because we do not need one. +define weak void @__omp_offloading_2c_389eb_no_state_machine_needed_l14() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_needed_l14 +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) #[[ATTR8:[0-9]+]] +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9:[0-9]+]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @no_parallel_region_in_here() #[[ATTR10:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @no_parallel_region_in_here() #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @no_parallel_region_in_here() #1 { +; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) #[[ATTR9]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) #[[ATTR9]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; CHECK: omp_if.then: +; CHECK-NEXT: store i32 0, i32* @G, align 4 +; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) #[[ATTR9]] +; CHECK-NEXT: br label [[OMP_IF_END]] +; CHECK: omp_if.end: +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR9]] +; CHECK-NEXT: ret void +; +entry: + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = call i32 @__kmpc_single(%struct.ident_t* @2, i32 %0) + %2 = icmp ne i32 %1, 0 + br i1 %2, label %omp_if.then, label %omp_if.end + +omp_if.then: ; preds = %entry + store i32 0, i32* @G, align 4 + call void @__kmpc_end_single(%struct.ident_t* @2, i32 %0) + br label %omp_if.end + +omp_if.end: ; preds = %omp_if.then, %entry + call void @__kmpc_barrier(%struct.ident_t* @3, i32 %0) + ret void +} + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. 
No fallback and only one pointer comparison are needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_l19() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_l19 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__2_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], 
i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__1(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: call void @no_parallel_region_in_here() #[[ATTR10]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* 
bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %2, i64 0) + call void @no_parallel_region_in_here() #8 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent +declare void @p0() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent +declare void @p1() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. No fallback and only two pointer comparison are needed. 
+define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__5_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] +; CHECK: worker_state_machine.parallel_region.execute5: +; CHECK-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check6: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: 
worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__4(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @simple_state_machine_interprocedural_before() #[[ATTR10]] +; CHECK-NEXT: call void @no_parallel_region_in_here() #[[ATTR10]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_after() #[[ATTR10]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void 
@simple_state_machine_interprocedural_before() #8 + call void @no_parallel_region_in_here() #8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %2, i64 0) + call void @simple_state_machine_interprocedural_after() #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_before() #1 { +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) #[[ATTR9]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_after() #1 { +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) #[[ATTR9]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. 
A fallback indirect call and only two pointer comparisons are needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__7_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__8_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.fallback.execute: +; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT:
[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__6(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: call void @unknown() #[[ATTR12:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** 
%.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %2, i64 0) + call void @unknown() #8 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent +declare void @unknown() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__8_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__8(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is built because we do need one. No fallback and only one pointer comparison is needed.
+define weak void @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* 
[[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__9(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__9(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR13:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* 
bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** %2, i64 0) + call void @unknown_no_openmp() #9 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__10(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__10_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__10(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent +declare void @unknown_no_openmp() #5 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__11(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__11_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__11(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is built because we do need one. No fallback and only one pointer comparison is needed.
+define weak void @__omp_offloading_2c_389eb_simple_state_machine_pure_l72() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; 
CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__12(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__12 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* bitcast (void (i16, i32)* 
@__omp_outlined__13_wrapper to i8*), i8** %2, i64 0) + call void @unknown_pure() #10 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__13(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__13_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__13(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind readonly willreturn +declare void @unknown_pure() #6 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__14(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__14_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__14(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is built because we do need one. No fallback and no pointer comparison is needed.
+define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__15(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__15(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__15 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[CALL]]) #[[ATTR10]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %call = call i32 @omp_get_thread_num() + call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %call) #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_nested_recursive_after(i32 %a) #1 { +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR10]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR10]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + %a.addr = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %0 = load i32, i32* %a.addr, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %1 = load i32, i32* %a.addr, align 4 + %sub = sub nsw i32 %1, 1 + call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %sub) #8 + call void @simple_state_machine_interprocedural_nested_recursive_after_after() #8 + br label %return + +return: ; preds = %if.end, %if.then + ret void +} + 
+; Function Attrs: convergent +declare i32 @omp_get_thread_num() #3 + +; The second to last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A pretty generic user code state machine is built because we do not know anything about the weak callee. +define weak void @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.fallback.execute: +; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__16(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp.
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__16(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__16 +; CHECK-SAME: (i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @weak_callee_empty() #[[ATTR10]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @weak_callee_empty() #8 + ret void +} + +; Function Attrs: convergent nounwind +define weak hidden void @weak_callee_empty() #1 { +; CHECK-LABEL: define {{[^@]+}}@weak_callee_empty +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + +; Function Attrs: convergent nounwind +declare void @__kmpc_end_single(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent nounwind +declare i32 @__kmpc_single(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__17(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__17_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__17(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__18(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__18_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__18(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_nested_recursive_after_after() #1 { +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) #[[ATTR9]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) #[[ATTR8]] +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__19(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR11]] +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__19_wrapper(i16 zeroext %0, i32 %1) #4 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__19(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { nounwind } +attributes #3 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #4 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #5 = { convergent "frame-pointer"="all" "llvm.assume"="omp_no_openmp" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #6 = { convergent nounwind readonly willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #7 = { convergent nounwind } +attributes #8 = { convergent } +attributes #9 = { convergent "llvm.assume"="omp_no_openmp" } +attributes #10 = { convergent nounwind readonly willreturn } + +!omp_offload.info 
= !{!0, !1, !2, !3, !4, !5, !6, !7} +!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} +!llvm.module.flags = !{!16, !17, !18} +!llvm.ident = !{!19} + +!0 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural", i32 35, i32 2} +!1 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_no_openmp_attr", i32 61, i32 4} +!2 = !{i32 0, i32 44, i32 231915, !"no_state_machine_needed", i32 14, i32 0} +!3 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_with_fallback", i32 50, i32 3} +!4 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_pure", i32 72, i32 5} +!5 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural_nested_recursive", i32 86, i32 6} +!6 = !{i32 0, i32 44, i32 231915, !"no_state_machine_weak_callee", i32 106, i32 7} +!7 = !{i32 0, i32 44, i32 231915, !"simple_state_machine", i32 19, i32 1} +!8 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_needed_l14, !"kernel", i32 1} +!9 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_l19, !"kernel", i32 1} +!10 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35, !"kernel", i32 1} +!11 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50, !"kernel", i32 1} +!12 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61, !"kernel", i32 1} +!13 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72, !"kernel", i32 1} +!14 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86, !"kernel", i32 1} +!15 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106, !"kernel", i32 1} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{i32 7, !"PIC Level", i32 2} +!18 = !{i32 7, !"frame-pointer", i32 2} +!19 = !{!"clang version 13.0.0"}