diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -610,8 +610,20 @@ Optional Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA, bool &UsedAssumedInformation) { - const auto &ValueSimplifyAA = getAAFor( - AA, IRPosition::value(V, AA.getCallBaseContext()), DepClassTy::NONE); + // First check all callbacks provided by outside AAs. If any of them returns + // a non-null value that is different from the associated value, or None, we + // assume it's simpliied. + IRPosition IRP = IRPosition::value(V, AA.getCallBaseContext()); + for (auto &CB : SimplificationCallbacks[IRP]) { + Optional SimplifiedV = CB(IRP, &AA, UsedAssumedInformation); + if (!SimplifiedV.hasValue()) + return llvm::None; + if (*SimplifiedV && *SimplifiedV != &IRP.getAssociatedValue() && + isa(*SimplifiedV)) + return cast(*SimplifiedV); + } + const auto &ValueSimplifyAA = + getAAFor(AA, IRP, DepClassTy::NONE); Optional SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(*this); bool IsKnown = ValueSimplifyAA.isAtFixpoint(); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -25,6 +25,9 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Assumptions.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -70,6 +73,15 @@ "Number of OpenMP runtime function uses identified"); STATISTIC(NumOpenMPTargetRegionKernels, "Number of OpenMP target region entry points (=kernels) identified"); +STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine, + "Number of OpenMP target region entry points (=kernels) executed in " + "generic-mode without a state machines"); +STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback, + "Number of OpenMP target region entry points (=kernels) executed in " + "generic-mode with customized state machines with fallback"); +STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback, + "Number of OpenMP target region entry points (=kernels) executed in " + "generic-mode with customized state machines without fallback"); STATISTIC( NumOpenMPParallelRegionsReplacedInGPUStateMachine, "Number of OpenMP parallel regions replaced with ID in GPU state machines"); @@ -228,6 +240,11 @@ /// Map from functions to all uses of this runtime function contained in /// them. DenseMap> UsesMap; + + public: + /// Iterators for the uses of this runtime function. + decltype(UsesMap)::iterator begin() { return UsesMap.begin(); } + decltype(UsesMap)::iterator end() { return UsesMap.end(); } }; /// An OpenMP-IR-Builder instance @@ -238,6 +255,9 @@ RuntimeFunction::OMPRTL___last> RFIs; + /// Map from function declarations/definitions to their runtime enum type. + DenseMap RuntimeFunctionIDMap; + /// Map from ICV kind to the ICV description. 
EnumeratedArray @@ -380,6 +400,7 @@ SmallVector ArgsTypes({__VA_ARGS__}); \ Function *F = M.getFunction(_Name); \ if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ + RuntimeFunctionIDMap[F] = _Enum; \ auto &RFI = RFIs[_Enum]; \ RFI.Kind = _Enum; \ RFI.Name = _Name; \ @@ -408,6 +429,141 @@ SmallPtrSetImpl &Kernels; }; +template +struct BooleanStateWithPtrSetVector : public BooleanState { + + bool contains(Ty *Elem) const { return Set.contains(Elem); } + bool insert(Ty *Elem) { + if (InsertInvalidates) + BooleanState::indicatePessimisticFixpoint(); + return Set.insert(Elem); + } + + Ty *operator[](int Idx) const { return Set[Idx]; } + bool operator==(const BooleanStateWithPtrSetVector &RHS) const { + return BooleanState::operator==(RHS) && Set == RHS.Set; + } + bool operator!=(const BooleanStateWithPtrSetVector &RHS) const { + return !(*this == RHS); + } + + bool empty() const { return Set.empty(); } + size_t size() const { return Set.size(); } + + /// "Clamp" this state with \p RHS. + BooleanStateWithPtrSetVector & + operator^=(const BooleanStateWithPtrSetVector &RHS) { + BooleanState::operator^=(RHS); + Set.insert(RHS.Set.begin(), RHS.Set.end()); + return *this; + } + +private: + /// A set to keep track of elements. + SetVector Set; + +public: + typename decltype(Set)::iterator begin() { return Set.begin(); } + typename decltype(Set)::iterator end() { return Set.end(); } + typename decltype(Set)::const_iterator begin() const { return Set.begin(); } + typename decltype(Set)::const_iterator end() const { return Set.end(); } +}; + +struct KernelInfoState : AbstractState { + /// Flag to track if we reached a fixpoint. + bool IsAtFixpoint = false; + + /// The parallel regions (identified by the outlined parallel functions) that + /// can be reached from the associated function. + BooleanStateWithPtrSetVector + ReachedKnownParallelRegions; + + /// State to track what parallel region we might reach. + BooleanStateWithPtrSetVector ReachedUnknownParallelRegions; + + /// The __kmpc_target_init call in this kernel, if any. If we find more than + /// one we abort as the kernel is malformed. + CallBase *KernelInitCB = nullptr; + + /// The __kmpc_target_deinit call in this kernel, if any. If we find more than + /// one we abort as the kernel is malformed. + CallBase *KernelDeinitCB = nullptr; + + /// Abstract State interface + ///{ + + KernelInfoState() {} + KernelInfoState(bool BestState) { + if (!BestState) + indicatePessimisticFixpoint(); + } + + /// See AbstractState::isValidState(...) + bool isValidState() const override { return true; } + + /// See AbstractState::isAtFixpoint(...) + bool isAtFixpoint() const override { return IsAtFixpoint; } + + /// See AbstractState::indicatePessimisticFixpoint(...) + ChangeStatus indicatePessimisticFixpoint() override { + IsAtFixpoint = true; + ReachedUnknownParallelRegions.indicatePessimisticFixpoint(); + return ChangeStatus::CHANGED; + } + + /// See AbstractState::indicateOptimisticFixpoint(...) 
+ ChangeStatus indicateOptimisticFixpoint() override { + IsAtFixpoint = true; + return ChangeStatus::UNCHANGED; + } + + /// Return the assumed state + KernelInfoState &getAssumed() { return *this; } + const KernelInfoState &getAssumed() const { return *this; } + + bool operator==(const KernelInfoState &RHS) const { + if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions) + return false; + if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions) + return false; + return true; + } + + /// Return empty set as the best state of potential values. + static KernelInfoState getBestState() { return KernelInfoState(true); } + + static KernelInfoState getBestState(KernelInfoState &KIS) { + return getBestState(); + } + + /// Return full set as the worst state of potential values. + static KernelInfoState getWorstState() { return KernelInfoState(false); } + + /// "Clamp" this state with \p KIS. + KernelInfoState operator^=(const KernelInfoState &KIS) { + // Do not merge two different _init and _deinit call sites. + if (KIS.KernelInitCB) { + if (KernelInitCB && KernelInitCB != KIS.KernelInitCB) + indicatePessimisticFixpoint(); + KernelInitCB = KIS.KernelInitCB; + } + if (KIS.KernelDeinitCB) { + if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB) + indicatePessimisticFixpoint(); + KernelDeinitCB = KIS.KernelDeinitCB; + } + ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions; + ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions; + return *this; + } + + KernelInfoState operator&=(const KernelInfoState &KIS) { + return (*this ^= KIS); + } + + ///} +}; + /// Used to map the values physically (in the IR) stored in an offload /// array, to a vector in memory. struct OffloadArray { @@ -522,7 +678,7 @@ << OMPInfoCache.ModuleSlice.size() << " functions\n"); if (IsModulePass) { - Changed |= runAttributor(); + Changed |= runAttributor(IsModulePass); // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); @@ -535,14 +691,14 @@ if (PrintOpenMPKernels) printKernels(); - Changed |= rewriteDeviceCodeStateMachine(); - - Changed |= runAttributor(); + Changed |= runAttributor(IsModulePass); // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); Changed |= deleteParallelRegions(); + Changed |= rewriteDeviceCodeStateMachine(); + if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); Changed |= deduplicateRuntimeCalls(); @@ -1573,11 +1729,11 @@ Attributor &A; /// Helper function to run Attributor on SCC. - bool runAttributor() { + bool runAttributor(bool IsModulePass) { if (SCC.empty()) return false; - registerAAs(); + registerAAs(IsModulePass); ChangeStatus Changed = A.run(); @@ -1589,46 +1745,7 @@ /// Populate the Attributor with abstract attribute opportunities in the /// function. - void registerAAs() { - if (SCC.empty()) - return; - - // Create CallSite AA for all Getters. 
- for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { - auto ICVInfo = OMPInfoCache.ICVs[static_cast(Idx)]; - - auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; - - auto CreateAA = [&](Use &U, Function &Caller) { - CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); - if (!CI) - return false; - - auto &CB = cast(*CI); - - IRPosition CBPos = IRPosition::callsite_function(CB); - A.getOrCreateAAFor(CBPos); - return false; - }; - - GetterRFI.foreachUse(SCC, CreateAA); - } - auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; - auto CreateAA = [&](Use &U, Function &F) { - A.getOrCreateAAFor(IRPosition::function(F)); - return false; - }; - GlobalizationRFI.foreachUse(SCC, CreateAA); - - // Create an ExecutionDomain AA for every function and a HeapToStack AA for - // every function if there is a device kernel. - for (auto *F : SCC) { - if (!F->isDeclaration()) - A.getOrCreateAAFor(IRPosition::function(*F)); - if (isOpenMPDevice(M)) - A.getOrCreateAAFor(IRPosition::function(*F)); - } - } + void registerAAs(bool IsModulePass); }; Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { @@ -1766,7 +1883,7 @@ // TODO: Checking the number of uses is not a necessary restriction and // should be lifted. if (UnknownUse || NumDirectCalls != 1 || - ToBeReplacedStateMachineUses.size() != 2) { + ToBeReplacedStateMachineUses.size() > 2) { { auto Remark = [&](OptimizationRemarkAnalysis ORA) { return ORA << "Parallel region is used in " @@ -2541,9 +2658,587 @@ SmallPtrSet MallocCalls; }; +struct AAKernelInfo : public StateWrapper { + using Base = StateWrapper; + AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + /// Statistics are tracked as part of manifest for now. + void trackStatistics() const override {} + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + if (!isValidState()) + return ""; + return + + std::string(" #PRs: ") + + std::to_string(ReachedKnownParallelRegions.size()) + + ", #Unknown PRs: " + + std::to_string(ReachedUnknownParallelRegions.size()); + } + + /// Create an abstract attribute biew for the position \p IRP. + static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A); + + /// See AbstractAttribute::getName() + const std::string getName() const override { return "AAKernelInfo"; } + + /// See AbstractAttribute::getIdAddr() + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is AAKernelInfo + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + static const char ID; +}; + +/// The function kernel info abstract attribute, basically, what can we say +/// about a function with regards to the KernelInfoState. +struct AAKernelInfoFunction : AAKernelInfo { + AAKernelInfoFunction(const IRPosition &IRP, Attributor &A) + : AAKernelInfo(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // This is a high-level transform that might change the constant arguments + // of the init and dinit calls. We need to tell the Attributor about this + // to avoid other parts using the current constant value for simpliication. 
+ auto &OMPInfoCache = static_cast(A.getInfoCache()); + + Function *Fn = getAnchorScope(); + if (!OMPInfoCache.Kernels.count(Fn)) + return; + + OMPInformationCache::RuntimeFunctionInfo &InitRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + OMPInformationCache::RuntimeFunctionInfo &DeinitRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit]; + + // For kernels we perform more initialization work, first we find the init + // and deinit calls. + auto StoreCallBase = [](Use &U, + OMPInformationCache::RuntimeFunctionInfo &RFI, + CallBase *&Storage) { + CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + assert(CB && + "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"); + assert(!Storage && + "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"); + Storage = CB; + return false; + }; + InitRFI.foreachUse( + [&](Use &U, Function &) { + StoreCallBase(U, InitRFI, KernelInitCB); + return false; + }, + Fn); + DeinitRFI.foreachUse( + [&](Use &U, Function &) { + StoreCallBase(U, DeinitRFI, KernelDeinitCB); + return false; + }, + Fn); + + assert((KernelInitCB && KernelDeinitCB) && + "Kernel without __kmpc_target_init or __kmpc_target_deinit!"); + + // For kernels we need to register a simplification callback so that the Attributor + // knows the constant arguments to ___kmpc_target_init and + // __kmpc_target_deinit might actually change. + + Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = + [&](const IRPosition &IRP, const AbstractAttribute *AA, + bool &UsedAssumedInformation) -> Optional { + // IRP represents the "use generic state machine" argument of an + // __kmpc_target_init call. We will answer this one with the internal + // state. As long as we are not in an invalid state, we will create a + // custom state machine so the value should be a `i1 false`. If we are + // in an invalid state, we won't change the value that is in the IR. + if (!isValidState()) + return nullptr; + if (AA) + A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); + UsedAssumedInformation = !isAtFixpoint(); + auto *FalseVal = + ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0); + return FalseVal; + }; + + constexpr const int InitUseStateMachineArgNo = 2; + A.registerSimplificationCallback( + IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), + StateMachineSimplifyCB); + } + + /// Modify the IR based on the KernelInfoState as the fixpoint iteration is + /// finished now. + ChangeStatus manifest(Attributor &A) override { + // If we are not looking at a kernel with __kmpc_target_init and + // __kmpc_target_deinit call we cannot actually manifest the information. + if (!KernelInitCB || !KernelDeinitCB) + return ChangeStatus::UNCHANGED; + + buildCustomStateMachine(A); + + return ChangeStatus::CHANGED; + } + + ChangeStatus buildCustomStateMachine(Attributor &A) { + assert(ReachedKnownParallelRegions.isValidState() && + "Custom state machine with invalid parallel region states?"); + + const int InitIsSPMDArgNo = 1; + const int InitUseStateMachineArgNo = 2; + + // Check if the current configuration is non-SPMD and generic state machine. + // If we already have SPMD mode or a custom state machine we do not need to + // go any further. If it is anything but a constant something is weird and + // we give up. 
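+ // Concretely, we only proceed if the "use generic state machine" argument
+ // is a constant true and the "is SPMD" argument is a constant false; every
+ // other combination is left untouched.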
+ ConstantInt *UseStateMachine = dyn_cast( + KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); + ConstantInt *IsSPMD = + dyn_cast(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + + // If we are stuck with generic mode, try to create a custom device (=GPU) + // state machine which is specialized for the parallel regions that are + // reachable by the kernel. + if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD || + !IsSPMD->isZero()) + return ChangeStatus::UNCHANGED; + + // First, indicate we use a custom state machine now. + auto &Ctx = getAnchorValue().getContext(); + auto *FalseVal = ConstantInt::getBool(Ctx, 0); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); + + // If we don't actually need a state machine we are done here. This can + // happen if there simply are no parallel regions. In the resulting kernel + // all worker threads will simply exit right away, leaving the main thread + // to do the work alone. + if (ReachedKnownParallelRegions.empty() && + ReachedUnknownParallelRegions.empty()) { + ++NumOpenMPTargetRegionKernelsWithoutStateMachine; + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Generic-mode kernel is executed without state machine " + "(good)"; + }; + A.emitRemark( + KernelInitCB, "OpenMPKernelWithoutStateMachine", Remark); + + return ChangeStatus::CHANGED; + } + + // Keep track in the statistics of our new shiny custom state machine. + if (ReachedUnknownParallelRegions.empty()) { + ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback; + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Generic-mode kernel is executed with a customized state " + "machine [" + << ore::NV("ParallelRegions", + ReachedKnownParallelRegions.size()) + << " known parallel regions] (good)."; + }; + A.emitRemark( + KernelInitCB, "OpenMPKernelWithCustomizedStateMachine", Remark); + } else { + ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback; + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Generic-mode kernel is executed with a customized state " + "machine that requires a fallback [" + << ore::NV("ParallelRegions", + ReachedKnownParallelRegions.size()) + << " known parallel regions, " + << ore::NV("UnknownParallelRegions", + ReachedUnknownParallelRegions.size()) + << " unkown parallel regions] (bad)."; + }; + A.emitRemark( + KernelInitCB, "OpenMPKernelWithCustomizedStateMachineAndFallback", + Remark); + + // Tell the user why we ended up with a fallback. + for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) { + if (!UnknownParallelRegionCB) + continue; + auto Remark = [&](OptimizationRemarkAnalysis ORA) { + return ORA + << "State machine fallback caused by this call. If it is a " + "false positive, use " + "`__attribute__((assume(\"omp_no_openmp\"))` " + "(or \"omp_no_parallelism\")."; + }; + A.emitRemark( + UnknownParallelRegionCB, + "OpenMPKernelWithCustomizedStateMachineAndFallback", Remark); + } + } + + // Create all the blocks: + // + // InitCB = __kmpc_target_init(...) + // bool IsWorker = InitCB >= 0; + // if (IsWorker) { + // SMBeginBB: __kmpc_barrier_simple_spmd(...); + // void *WorkFn; + // bool Active = __kmpc_kernel_parallel(&WorkFn); + // if (!WorkFn) return; + // SMIsActiveCheckBB: if (Active) { + // SMIfCascadeCurrentBB: if (WorkFn == ) + // ParFn0(...); + // SMIfCascadeCurrentBB: else if (WorkFn == ) + // ParFn1(...); + // ... 
+ // SMIfCascadeCurrentBB: else + // ((WorkFnTy*)WorkFn)(...); + // SMEndParallelBB: __kmpc_kernel_end_parallel(...); + // } + // SMDoneBB: __kmpc_barrier_simple_spmd(...); + // goto SMBeginBB; + // } + // UserCodeEntryBB: // user code + // __kmpc_target_deinit(...) + // + Function *Kernel = getAssociatedFunction(); + assert(Kernel && "Expected an associated function!"); + + BasicBlock *InitBB = KernelInitCB->getParent(); + BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock( + KernelInitCB->getNextNode(), "thread.user_code.check"); + BasicBlock *StateMachineBeginBB = BasicBlock::Create( + Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB); + BasicBlock *StateMachineFinishedBB = BasicBlock::Create( + Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB); + BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create( + Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB); + BasicBlock *StateMachineIfCascadeCurrentBB = + BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check", + Kernel, UserCodeEntryBB); + BasicBlock *StateMachineEndParallelBB = + BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end", + Kernel, UserCodeEntryBB); + BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create( + Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB); + + const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); + ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); + + InitBB->getTerminator()->eraseFromParent(); + Instruction *IsWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), -1), + "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); + + // Create local storage for the work function pointer. 
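+ // Note: the alloca is placed at the front of the kernel entry block so it
+ // dominates all of its uses in the state machine blocks created above.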
+ Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); + AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr", + &Kernel->getEntryBlock().front()); + WorkFnAI->setDebugLoc(DLoc); + + auto &OMPInfoCache = static_cast(A.getInfoCache()); + OMPInfoCache.OMPBuilder.updateToLocation( + OpenMPIRBuilder::LocationDescription( + IRBuilder<>::InsertPoint(StateMachineBeginBB, + StateMachineBeginBB->end()), + DLoc)); + + Value *Ident = KernelInitCB->getArgOperand(0); + Value *GTid = KernelInitCB; + + Module &M = *Kernel->getParent(); + FunctionCallee BarrierFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_barrier_simple_spmd); + CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) + ->setDebugLoc(DLoc); + + FunctionCallee KernelParallelFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_kernel_parallel); + Instruction *IsActiveWorker = CallInst::Create( + KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB); + IsActiveWorker->setDebugLoc(DLoc); + Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn", + StateMachineBeginBB); + WorkFn->setDebugLoc(DLoc); + + FunctionType *ParallelRegionFnTy = FunctionType::get( + Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)}, + false); + Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast", + StateMachineBeginBB); + + Instruction *IsDone = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn, + Constant::getNullValue(VoidPtrTy), "worker.is_done", + StateMachineBeginBB); + IsDone->setDebugLoc(DLoc); + BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB, + IsDone, StateMachineBeginBB) + ->setDebugLoc(DLoc); + + BranchInst::Create(StateMachineIfCascadeCurrentBB, + StateMachineDoneBarrierBB, IsActiveWorker, + StateMachineIsActiveCheckBB) + ->setDebugLoc(DLoc); + + Value *ZeroArg = + Constant::getNullValue(ParallelRegionFnTy->getParamType(0)); + + // Now that we have most of the CFG skeleton it is time for the if-cascade + // that checks the function pointer we got from the runtime against the + // parallel regions we expect, if there are any. + for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) { + auto *ParallelRegion = ReachedKnownParallelRegions[i]; + BasicBlock *PRExecuteBB = BasicBlock::Create( + Ctx, "worker_state_machine.parallel_region.execute", Kernel, + StateMachineEndParallelBB); + CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB) + ->setDebugLoc(DLoc); + BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB) + ->setDebugLoc(DLoc); + + BasicBlock *PRNextBB = + BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check", + Kernel, StateMachineEndParallelBB); + + // Check if we need to compare the pointer at all or if we can just + // call the parallel region function. 
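+ // For the last known parallel region, and only if no unknown parallel
+ // regions remain, the pointer comparison is redundant and we can call the
+ // region unconditionally.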
+ Value *IsPR; + if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) { + Instruction *CmpI = ICmpInst::Create( + ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion, + "worker.check_parallel_region", StateMachineIfCascadeCurrentBB); + CmpI->setDebugLoc(DLoc); + IsPR = CmpI; + } else { + IsPR = ConstantInt::getTrue(Ctx); + } + + BranchInst::Create(PRExecuteBB, PRNextBB, IsPR, + StateMachineIfCascadeCurrentBB) + ->setDebugLoc(DLoc); + StateMachineIfCascadeCurrentBB = PRNextBB; + } + + // At the end of the if-cascade we place the indirect function pointer call + // in case we might need it, that is if there can be parallel regions we + // have not handled in the if-cascade above. + if (!ReachedUnknownParallelRegions.empty()) { + StateMachineIfCascadeCurrentBB->setName( + "worker_state_machine.parallel_region.fallback.execute"); + CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "", + StateMachineIfCascadeCurrentBB) + ->setDebugLoc(DLoc); + } + BranchInst::Create(StateMachineEndParallelBB, + StateMachineIfCascadeCurrentBB) + ->setDebugLoc(DLoc); + + CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_kernel_end_parallel), + {}, "", StateMachineEndParallelBB) + ->setDebugLoc(DLoc); + BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB) + ->setDebugLoc(DLoc); + + CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB) + ->setDebugLoc(DLoc); + BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB) + ->setDebugLoc(DLoc); + + return ChangeStatus::CHANGED; + } + + /// Fixpoint iteration update function. Will be called every time a dependence + /// changed its state (and in the beginning). + ChangeStatus updateImpl(Attributor &A) override { + KernelInfoState StateBefore = getState(); + + // Callback to check a call instruction. + auto CheckCallInst = [&](Instruction &I) { + auto &CB = cast(I); + auto &CBAA = A.getAAFor( + *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); + if (CBAA.getState().isValidState()) + getState() ^= CBAA.getState(); + return true; + }; + + if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this)) + return indicatePessimisticFixpoint(); + + return StateBefore == getState() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; + } +}; + +/// The call site kernel info abstract attribute, basically, what can we say +/// about a call site with regards to the KernelInfoState. For now this simply +/// forwards the information from the callee. +struct AAKernelInfoCallSite : AAKernelInfo { + AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A) + : AAKernelInfo(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAKernelInfo::initialize(A); + + CallBase &CB = cast(getAssociatedValue()); + Function *Callee = getAssociatedFunction(); + + // Helper to lookup an assumption string. + auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) { + return Fn && hasAssumption(*Fn, AssumptionStr); + }; + + // First weed out calls we do not care about, that is readonly/readnone + // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a + // parallel region or anything else we are looking for. + if (!CB.mayWriteToMemory() || isa(CB)) { + indicateOptimisticFixpoint(); + return; + } + + // Next we check if we know the callee. If it is a known OpenMP function + // we will handle them explicitly in the switch below. 
If it is not, we + // will use an AAKernelInfo object on the callee to gather information and + // merge that into the current state. The latter happens in the updateImpl. + auto &OMPInfoCache = static_cast(A.getInfoCache()); + const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee); + if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { + // Unknown caller or declarations are not analyzable, we give up. + if (!Callee || !A.isFunctionIPOAmendable(*Callee)) { + + // Unknown callees might contain parallel regions, except if they have + // an appropriate assumption attached. + if (!(HasAssumption(Callee, "omp_no_openmp") || + HasAssumption(Callee, "omp_no_parallelism"))) + ReachedUnknownParallelRegions.insert(&CB); + + // We have updated the state for this unknown call properly, there won't + // be any change so we indicate a fixpoint. + indicateOptimisticFixpoint(); + } + // If the callee is known and can be used in IPO, we will update the state + // based on the callee state in updateImpl. + return; + } + + const unsigned int WrapperFunctionArgNo = 6; + RuntimeFunction RF = It->getSecond(); + switch (RF) { + case OMPRTL___kmpc_target_init: + KernelInitCB = &CB; + break; + case OMPRTL___kmpc_target_deinit: + KernelDeinitCB = &CB; + break; + case OMPRTL___kmpc_parallel_51: + if (auto *ParallelRegion = dyn_cast( + CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) { + ReachedKnownParallelRegions.insert(ParallelRegion); + break; + } + // The condition above should usually get the parallel region function + // pointer and record it. In the off chance it doesn't we assume the + // worst. + ReachedUnknownParallelRegions.insert(&CB); + break; + case OMPRTL___kmpc_omp_task: + // We do not look into tasks right now, just give up. + ReachedUnknownParallelRegions.insert(&CB); + break; + default: + break; + } + // All other OpenMP runtime calls will not reach parallel regions so they + // can be safely ignored for now. Since it is a known OpenMP runtime call we + // have now modeled all effects and there is no need for any update. + indicateOptimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); + if (getState() == FnAA.getState()) + return ChangeStatus::UNCHANGED; + getState() = FnAA.getState(); + return ChangeStatus::CHANGED; + } +}; + } // namespace +void OpenMPOpt::registerAAs(bool IsModulePass) { + if (SCC.empty()) + + return; + if (IsModulePass) { + // Ensure we create the AAKernelInfo AAs first and without triggering an + // update. This will make sure we register all value simplification + // callbacks before any other AA has the chance to create an AAValueSimplify + // or similar. + for (Function *Kernel : OMPInfoCache.Kernels) + A.getOrCreateAAFor( + IRPosition::function(*Kernel), /* QueryingAA */ nullptr, + DepClassTy::NONE, /* ForceUpdate */ false, + /* UpdateAfterInit */ false); + } + + // Create CallSite AA for all Getters. 
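+ // For each ICV getter we visit all regular call sites and seed an abstract
+ // attribute at the call-site-function position so the ICV value at that
+ // call can be tracked.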
+ for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { + auto ICVInfo = OMPInfoCache.ICVs[static_cast(Idx)]; + + auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; + + auto CreateAA = [&](Use &U, Function &Caller) { + CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); + if (!CI) + return false; + + auto &CB = cast(*CI); + + IRPosition CBPos = IRPosition::callsite_function(CB); + A.getOrCreateAAFor(CBPos); + return false; + }; + + GetterRFI.foreachUse(SCC, CreateAA); + } + auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + auto CreateAA = [&](Use &U, Function &F) { + A.getOrCreateAAFor(IRPosition::function(F)); + return false; + }; + GlobalizationRFI.foreachUse(SCC, CreateAA); + + // Create an ExecutionDomain AA for every function and a HeapToStack AA for + // every function if there is a device kernel. + for (auto *F : SCC) { + if (!F->isDeclaration()) + A.getOrCreateAAFor(IRPosition::function(*F)); + if (isOpenMPDevice(M)) + A.getOrCreateAAFor(IRPosition::function(*F)); + } +} + const char AAICVTracker::ID = 0; +const char AAKernelInfo::ID = 0; const char AAExecutionDomain::ID = 0; const char AAHeapToShared::ID = 0; @@ -2615,6 +3310,28 @@ return *AA; } +AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP, + Attributor &A) { + AAKernelInfo *AA = nullptr; + switch (IRP.getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_RETURNED: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + llvm_unreachable("KernelInfo can only be created for function position!"); + case IRPosition::IRP_CALL_SITE: + AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A); + break; + case IRPosition::IRP_FUNCTION: + AA = new (A.Allocator) AAKernelInfoFunction(IRP, A); + break; + } + + return *AA; +} + PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { if (!containsOpenMP(M)) return PreservedAnalyses::all(); diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -0,0 +1,1924 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --include-generated-funcs +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void p0(void); +;; void p1(void); +;; int unknown(void); +;; void unknown_pure(void) __attribute__((pure)); +;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp"))); +;; +;; int G; +;; void no_parallel_region_in_here(void) { +;; #pragma omp single +;; G = 0; +;; } +;; +;; void no_state_machine_needed() { +;; #pragma omp target teams +;; no_parallel_region_in_here(); +;; } +;; +;; void simple_state_machine() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; no_parallel_region_in_here(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_interprocedural_after(void); +;; void simple_state_machine_interprocedural_before(void) { +;; #pragma omp parallel +;; { p0(); } +;; } +;; void simple_state_machine_interprocedural() { +;; #pragma omp target teams +;; { +;; simple_state_machine_interprocedural_before(); +;; no_parallel_region_in_here(); +;; #pragma omp parallel +;; { p1(); } +;; simple_state_machine_interprocedural_after(); +;; } +;; } +;; void simple_state_machine_interprocedural_after(void) { 
+;; #pragma omp parallel +;; { p0(); } +;; } +;; +;; void simple_state_machine_with_fallback() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_no_openmp_attr() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown_no_openmp(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; void simple_state_machine_pure() { +;; #pragma omp target teams +;; { +;; #pragma omp parallel +;; { p0(); } +;; unknown_pure(); +;; #pragma omp parallel +;; { p1(); } +;; } +;; } +;; +;; int omp_get_thread_num(); +;; void simple_state_machine_interprocedural_nested_recursive_after(int); +;; void simple_state_machine_interprocedural_nested_recursive_after_after(void); +;; void simple_state_machine_interprocedural_nested_recursive() { +;; #pragma omp target teams +;; { +;; simple_state_machine_interprocedural_nested_recursive_after( +;; omp_get_thread_num()); +;; } +;; } +;; +;; void simple_state_machine_interprocedural_nested_recursive_after(int a) { +;; if (a == 0) +;; return; +;; simple_state_machine_interprocedural_nested_recursive_after(a - 1); +;; simple_state_machine_interprocedural_nested_recursive_after_after(); +;; } +;; void simple_state_machine_interprocedural_nested_recursive_after_after(void) { +;; #pragma omp parallel +;; { p0(); } +;; } +;; +;; __attribute__((weak)) void weak_callee_empty(void) {} +;; void no_state_machine_weak_callee() { +;; #pragma omp target teams +;; { weak_callee_empty(); } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* undef +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode = weak constant i8 1 +@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode = weak constant i8 1 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@G = external global i32, align 4 +@V = external global i1, align 4 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@llvm.compiler.used = appending global [8 x i8*] [i8* @__omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode, i8* 
@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode, i8* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode], section "llvm.metadata" + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; No user code state machine is build because we do not need one. +define weak void @__omp_offloading_2c_389eb_no_state_machine_needed_l14() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Verify we will not store a constant true here even though initially all call sites pass `i1 true` for the second-to-last argument. +define internal i32 @__kmpc_target_init(%struct.ident_t*, i1, i1 %use_generic_state_machine, i1) { + store i1 %use_generic_state_machine, i1* @V + %call = call i32 @unknown() + ret i32 %call +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @no_parallel_region_in_here() #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @no_parallel_region_in_here() #1 { +entry: + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = call i32 @__kmpc_single(%struct.ident_t* @2, i32 %0) + %2 = icmp ne i32 %1, 0 + br i1 %2, label %omp_if.then, label %omp_if.end + +omp_if.then: ; preds = %entry + store i32 0, i32* @G, align 4 + call void @__kmpc_end_single(%struct.ident_t* @2, i32 %0) + br label %omp_if.end + +omp_if.end: ; preds = %omp_if.then, %entry + call void @__kmpc_barrier(%struct.ident_t* @3, i32 %0) + ret void +} + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. No fallback and only one pointer comparison are needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_l19() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %2, i64 0) + call void @no_parallel_region_in_here() #8 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent +declare void @p0() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent +declare void @p1() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. No fallback and only two pointer comparison are needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @simple_state_machine_interprocedural_before() #8 + call void @no_parallel_region_in_here() #8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %2, i64 0) + call void @simple_state_machine_interprocedural_after() #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_before() #1 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_after() #1 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. A fallback indirect call and only two pointer comparison are needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %2, i64 0) + %3 = call i32 @unknown() #8 + %4 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** %4, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent +declare i32 @unknown() #3 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__8_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__8(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__9(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** %2, i64 0) + call void @unknown_no_openmp() #9 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__10(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__10_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__10(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent +declare void @unknown_no_openmp() #5 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__11(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__11_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__11(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_pure_l72() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__12(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** %2, i64 0) + call void @unknown_pure() #10 + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** %3, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__13(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__13_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__13(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind readonly willreturn +declare void @unknown_pure() #6 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__14(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p1() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__14_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__14(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; The second-to-last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A user code state machine is built because we do need one. No fallback and no pointer comparison is needed. +define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__15(i32* noalias %.global_tid., i32* noalias %.bound_tid.)
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %call = call i32 @omp_get_thread_num() + call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %call) #8 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_nested_recursive_after(i32 %a) #1 { +entry: + %a.addr = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %0 = load i32, i32* %a.addr, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %1 = load i32, i32* %a.addr, align 4 + %sub = sub nsw i32 %1, 1 + call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %sub) #8 + call void @simple_state_machine_interprocedural_nested_recursive_after_after() #8 + br label %return + +return: ; preds = %if.end, %if.then + ret void +} + +; Function Attrs: convergent +declare i32 @omp_get_thread_num() #3 + +; The second-to-last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine. +; A fairly generic user code state machine is built because we do not know anything about the weak callee. +define weak void @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106() #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__16(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @weak_callee_empty() #8 + ret void +} + +; Function Attrs: convergent nounwind +define weak hidden void @weak_callee_empty() #1 { +entry: + ret void +} + +; Function Attrs: convergent nounwind +declare void @__kmpc_end_single(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent nounwind +declare i32 @__kmpc_single(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #7 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__17(i32* noalias %.global_tid., i32* noalias %.bound_tid.)
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__17_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__17(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__18(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__18_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__18(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +; Function Attrs: convergent nounwind +define hidden void @simple_state_machine_interprocedural_nested_recursive_after_after() #1 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** %1, i64 0) + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__19(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @p0() #8 + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__19_wrapper(i16 zeroext %0, i32 %1) #4 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__19(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { nounwind } +attributes #3 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #4 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #5 = { convergent "frame-pointer"="all" "llvm.assume"="omp_no_openmp" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #6 = { convergent nounwind readonly willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #7 = { convergent nounwind } +attributes #8 = { convergent } +attributes #9 = { convergent "llvm.assume"="omp_no_openmp" } +attributes #10 = { convergent nounwind readonly willreturn } + +!omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7} +!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} +!llvm.module.flags = !{!16, !17, !18, !20, !21} +!llvm.ident = !{!19} + +!0 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural", i32 35, i32 2} +!1 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_no_openmp_attr", i32 61, i32 4} +!2 = !{i32 0, i32 44, i32 231915, !"no_state_machine_needed", i32 14, i32 0} +!3 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_with_fallback", i32 50, i32 3} +!4 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_pure", i32 72, i32 5} +!5 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural_nested_recursive", i32 86, i32 6} +!6 = !{i32 0, i32 44, i32 231915, !"no_state_machine_weak_callee", i32 106, i32 7} +!7 = !{i32 0, i32 44, i32 231915, !"simple_state_machine", i32 19, i32 1} +!8 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_needed_l14, !"kernel", i32 1} +!9 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_l19, !"kernel", i32 1} +!10 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35, !"kernel", i32 1} +!11 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50, !"kernel", i32 1} +!12 = !{void ()* 
@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61, !"kernel", i32 1} +!13 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72, !"kernel", i32 1} +!14 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86, !"kernel", i32 1} +!15 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106, !"kernel", i32 1} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{i32 7, !"PIC Level", i32 2} +!18 = !{i32 7, !"frame-pointer", i32 2} +!19 = !{!"clang version 13.0.0"} +!20 = !{i32 7, !"openmp", i32 50} +!21 = !{i32 7, !"openmp-device", i32 50} +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_needed_l14 +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1:[0-9]+]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@__kmpc_target_init +; CHECK-SAME: (%struct.ident_t* noalias nocapture nofree nonnull readnone align 8 dereferenceable(24) [[TMP0:%.*]], i1 [[TMP1:%.*]], i1 [[USE_GENERIC_STATE_MACHINE:%.*]], i1 [[TMP2:%.*]]) { +; CHECK-NEXT: store i1 false, i1* @V, align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @unknown() +; CHECK-NEXT: ret i32 [[CALL]] +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7:[0-9]+]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; 
CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; CHECK: omp_if.then: +; CHECK-NEXT: store i32 0, i32* @G, align 4 +; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]] +; CHECK-NEXT: br label [[OMP_IF_END]] +; CHECK: omp_if.end: +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; CHECK: omp_if.then: +; CHECK-NEXT: store i32 0, i32* @G, align 4 +; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[OMP_IF_END]] +; CHECK: omp_if.end: +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_l19 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__2_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__1(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function 
Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8:[0-9]+]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] 
= icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__5_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] +; CHECK: worker_state_machine.parallel_region.execute5: +; CHECK-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check6: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__4(i32* noundef nonnull align 4 
dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR7]] +; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; CHECK-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree 
[[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label 
[[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__7_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__8_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.fallback.execute: +; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__6(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define 
{{[^@]+}}@__omp_outlined__6 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @unknown() #[[ATTR8]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP4]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__9(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; 
CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper +; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; CHECK: worker_state_machine.parallel_region.execute2: +; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.check3: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__12 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: 
[[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p1() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; 
CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.check: +; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; CHECK: worker_state_machine.parallel_region.execute: +; CHECK-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.check1: +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__15(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__15 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR2]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR7]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized +; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR7]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR7]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR8]] +; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR8]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define 
{{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.fallback.execute: +; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__16(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__16 +; CHECK-SAME: (i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @weak_callee_empty() #[[ATTR7]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@weak_callee_empty +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; 
CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent nounwind +; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: call void @p0() #[[ATTR8]] +; CHECK-NEXT: ret void +; +; +; CHECK: Function Attrs: convergent norecurse nounwind +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll @@ -0,0 +1,224 @@ +; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s +target triple = "nvptx64" + +; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unkown parallel regions] (bad) +; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: State machine fallback caused by this call. 
If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism") +; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism") +; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Generic-mode kernel is executed with a customized state machine [1 known parallel regions] (good) + +;; void unknown(void); +;; void known(void) { +;; #pragma omp parallel +;; { +;; unknown(); +;; } +;; } +;; +;; void test_fallback(void) { +;; #pragma omp target teams +;; { +;; unknown(); +;; known(); +;; unknown(); +;; } +;; } +;; +;; void test_no_fallback(void) { +;; #pragma omp target teams +;; { +;; known(); +;; known(); +;; known(); +;; } +;; } + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant [82 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_fallback;11;1;;\00", align 1 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([82 x i8], [82 x i8]* @2, i32 0, i32 0) }, align 8 +@4 = private unnamed_addr constant [114 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1 +@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([114 x i8], [114 x i8]* @4, i32 0, i32 0) }, align 8 +@__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1 +@6 = private unnamed_addr constant [116 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1 +@7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([116 x i8], [116 x i8]* @6, i32 0, i32 0) }, align 8 +@8 = private unnamed_addr constant [85 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_no_fallback;20;1;;\00", align 1 +@9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([85 x i8], [85 x i8]* @8, i32 0, i32 0) }, align 8 +@10 = private unnamed_addr constant [117 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1 +@11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([117 x i8], [117 x i8]* @10, i32 0, i32 0) }, align 8 +@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1 +@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1 +@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8 +@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata" + 
+; Function Attrs: convergent norecurse nounwind +define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 { +entry: + %captured_vars_addrs.i.i = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18 + %exec_user_code = icmp eq i32 %0, -1, !dbg !18 + br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18 + +common.ret: ; preds = %entry, %user_code.entry + ret void, !dbg !19 + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @3) #3 + call void @unknown() #6, !dbg !20 + %2 = bitcast [0 x i8*]* %captured_vars_addrs.i.i to i8* + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i.i, i64 0, i64 0, !dbg !23 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26 + call void @unknown() #6, !dbg !27 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28 + br label %common.ret +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr + +; Function Attrs: convergent +declare void @unknown() local_unnamed_addr #1 + +; Function Attrs: nounwind +define hidden void @known() local_unnamed_addr #2 !dbg !29 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @13) + %1 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0, !dbg !30 + call void @__kmpc_parallel_51(%struct.ident_t* nonnull @13, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** nonnull %1, i64 0) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 { +entry: + %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33 + %exec_user_code = icmp eq i32 %0, -1, !dbg !33 + br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33 + +common.ret: ; preds = %entry, %user_code.entry + ret void, !dbg !34 + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @9) #3 + %2 = bitcast [0 x i8*]* %captured_vars_addrs.i2.i to i8* + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i2.i, i64 0, i64 0, !dbg !35 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void 
(i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !35 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !39 + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %5, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !40 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !42 + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46 + br label %common.ret +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias nocapture nofree readnone %.global_tid., i32* noalias nocapture nofree readnone %.bound_tid.) #0 !dbg !47 { +entry: + call void @unknown() #6, !dbg !48 + ret void, !dbg !49 +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 !dbg !50 { +entry: + %global_args = alloca i8**, align 8 + call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #3, !dbg !51 + call void @unknown() #6, !dbg !52 + ret void, !dbg !51 +} + +declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #3 = { nounwind } +attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #5 = { argmemonly nofree nosync nounwind willreturn } +attributes #6 = { convergent nounwind } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!3, !4} +!nvvm.annotations = !{!5, !6} +!llvm.module.flags = !{!7, !8, !9, 
!10, !11, !12, !13} +!llvm.ident = !{!14} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "custom_state_machines_remarks.c", directory: "/data/src/llvm-project") +!2 = !{} +!3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} +!4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} +!5 = !{void ()* @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} +!6 = !{void ()* @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} +!7 = !{i32 7, !"Dwarf Version", i32 2} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 7, !"openmp", i32 50} +!11 = !{i32 7, !"openmp-device", i32 50} +!12 = !{i32 7, !"PIC Level", i32 2} +!13 = !{i32 7, !"frame-pointer", i32 2} +!14 = !{!"clang version 13.0.0"} +!15 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_fallback_l11", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!16 = !DIFile(filename: "llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c", directory: "/data/src/llvm-project") +!17 = !DISubroutineType(types: !2) +!18 = !DILocation(line: 11, column: 1, scope: !15) +!19 = !DILocation(line: 0, scope: !15) +!20 = !DILocation(line: 13, column: 5, scope: !21, inlinedAt: !22) +!21 = distinct !DISubprogram(name: "__omp_outlined__", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!22 = distinct !DILocation(line: 11, column: 1, scope: !15) +!23 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !25) +!24 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!25 = distinct !DILocation(line: 14, column: 5, scope: !21, inlinedAt: !22) +!26 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !25) +!27 = !DILocation(line: 15, column: 5, scope: !21, inlinedAt: !22) +!28 = !DILocation(line: 11, column: 25, scope: !15) +!29 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!30 = !DILocation(line: 4, column: 1, scope: !29) +!31 = !DILocation(line: 8, column: 1, scope: !29) +!32 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_no_fallback_l20", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!33 = !DILocation(line: 20, column: 1, scope: !32) +!34 = !DILocation(line: 0, scope: !32) +!35 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !36) +!36 = distinct !DILocation(line: 22, column: 5, scope: !37, inlinedAt: !38) +!37 = distinct !DISubprogram(name: "__omp_outlined__1", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!38 = distinct !DILocation(line: 20, column: 1, scope: !32) +!39 = 
!DILocation(line: 8, column: 1, scope: !24, inlinedAt: !36) +!40 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !41) +!41 = distinct !DILocation(line: 23, column: 5, scope: !37, inlinedAt: !38) +!42 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !41) +!43 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !44) +!44 = distinct !DILocation(line: 24, column: 5, scope: !37, inlinedAt: !38) +!45 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !44) +!46 = !DILocation(line: 20, column: 25, scope: !32) +!47 = distinct !DISubprogram(name: "__omp_outlined__2", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!48 = !DILocation(line: 6, column: 5, scope: !47) +!49 = !DILocation(line: 7, column: 3, scope: !47) +!50 = distinct !DISubprogram(linkageName: "__omp_outlined__2_wrapper", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!51 = !DILocation(line: 4, column: 1, scope: !50) +!52 = !DILocation(line: 6, column: 5, scope: !47, inlinedAt: !53) +!53 = distinct !DILocation(line: 4, column: 1, scope: !50) diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -7,15 +7,19 @@ ; CHECK: remark: globalization_remarks.c:5:7: Could not move globalized variable to the stack. Variable is potentially captured. ; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. +%struct.ident_t = type { i32, i32, i32, i32, i8* } + @S = external local_unnamed_addr global i8* define void @foo() { entry: + %c = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true) %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !10 %x_on_stack = bitcast i8* %0 to i32* %1 = bitcast i32* %x_on_stack to i8* call void @share(i8* %1) call void @__kmpc_free_shared(i8* %0) + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) ret void } @@ -29,6 +33,8 @@ declare void @__kmpc_free_shared(i8*) +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1); +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll --- a/llvm/test/Transforms/OpenMP/remove_globalization.ll +++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -7,25 +7,34 @@ ; CHECK-REMARKS: remark: remove_globalization.c:4:2: Could not move globalized variable to the stack. Variable is potentially captured. Mark as noescape to override. ; CHECK-REMARKS: remark: remove_globalization.c:2:2: Moving globalized variable to the stack. ; CHECK-REMARKS: remark: remove_globalization.c:6:2: Moving globalized variable to the stack. +; CHECK-REMARKS: remark: remove_globalization.c:4:2: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. 
@S = external local_unnamed_addr global i8* +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + define void @kernel() { ; CHECK-LABEL: define {{[^@]+}}@kernel() { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @foo() -; CHECK-NEXT: call void @bar() +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 false, i1 true) +; CHECK-NEXT: call void @foo() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: call void @bar() #[[ATTR0]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true) ; CHECK-NEXT: ret void -; entry: + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 true, i1 true) call void @foo() call void @bar() + call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true) ret void } define internal void @foo() { ; CHECK-LABEL: define {{[^@]+}}@foo -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: ret void @@ -41,8 +50,8 @@ ; CHECK-LABEL: define {{[^@]+}}@bar ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG6:![0-9]+]] -; CHECK-NEXT: call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]]) #[[ATTR0]] ; CHECK-NEXT: ret void ; @@ -54,13 +63,18 @@ } define internal void @use(i8* %x) { +; CHECK-LABEL: define {{[^@]+}}@use +; CHECK-SAME: (i8* noalias nocapture nofree readnone [[X:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; entry: ret void } define internal void @share(i8* %x) { ; CHECK-LABEL: define {{[^@]+}}@share -; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: store i8* [[X]], i8** @S, align 8 ; CHECK-NEXT: ret void @@ -71,6 +85,12 @@ } define void @unused() { +; CHECK-LABEL: define {{[^@]+}}@unused() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; CHECK-NEXT: call void @use(i8* noalias readnone undef) +; CHECK-NEXT: ret void +; entry: %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !14 call void @use(i8* %0) diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -20,18 +20,22 @@ ; CHECK: call void @__kmpc_free_shared({{.*}}) define dso_local void @foo() { entry: + %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) %x = call i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* %0 = bitcast i32* %x_on_stack to i8* call void @use(i8* %0) call void @__kmpc_free_shared(i8* %x) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) ret void } define void @bar() { + %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) call void @baz() call void @qux() call void @negative_qux_spmd() + call void @__kmpc_target_deinit(%struct.ident_t* 
@1, i1 false, i1 true) ret void } @@ -104,6 +108,8 @@ declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} !nvvm.annotations = !{!7, !8} diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -8,25 +8,36 @@ @0 = private unnamed_addr constant [1 x i8] c"\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8 + +; CHECK-NOT: [openmp-opt] Basic block @kernel entry is executed by a single thread. +; CHECK: [openmp-opt] Basic block @kernel if.then is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread. define void @kernel() { - call void @__kmpc_kernel_prepare_parallel(i8* null) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false) + %cmp = icmp eq i32 %call, -1 + br i1 %cmp, label %if.then, label %if.else +if.then: call void @nvptx() call void @amdgcn() + br label %if.end +if.else: + br label %if.end +if.end: + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) ret void } ; REMARKS: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible. ; REMARKS-NOT: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible. -; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread. -; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread. -; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @nvptx entry is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @nvptx if.then is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @nvptx if.end is executed by a single thread. ; Function Attrs: noinline define internal void @nvptx() { entry: - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false) - %cmp = icmp eq i32 %call, -1 - br i1 %cmp, label %if.then, label %if.end + br i1 true, label %if.then, label %if.end if.then: call void @foo() @@ -39,15 +50,13 @@ ret void } -; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread. -; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread. -; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @amdgcn entry is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread. +; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread. 
; Function Attrs: noinline define internal void @amdgcn() { entry: - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) - %cmp = icmp eq i32 %call, -1 - br i1 %cmp, label %if.then, label %if.end + br i1 false, label %if.then, label %if.end if.then: call void @foo() @@ -95,6 +104,7 @@ declare void @__kmpc_kernel_prepare_parallel(i8*) declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) attributes #0 = { cold noinline }