Index: llvm/include/llvm/Transforms/IPO/Attributor.h
===================================================================
--- llvm/include/llvm/Transforms/IPO/Attributor.h
+++ llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -110,6 +110,7 @@
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -389,6 +390,13 @@
 bool isAssumedThreadLocalObject(Attributor &A, Value &Obj,
                                 const AbstractAttribute &QueryingAA);

+/// Return true if \p Loc is potentially affected by a barrier at position
+/// \p CtxI.
+bool isPotentiallyAffectedByBarrier(Attributor &A,
+                                    std::optional Loc,
+                                    const AbstractAttribute &QueryingAA,
+                                    Instruction *CtxI);
+
 } // namespace AA

 template <>
@@ -1920,7 +1928,8 @@
   bool isAssumedDead(const Instruction &I, const AbstractAttribute *QueryingAA,
                      const AAIsDead *LivenessAA, bool &UsedAssumedInformation,
                      bool CheckBBLivenessOnly = false,
-                     DepClassTy DepClass = DepClassTy::OPTIONAL);
+                     DepClassTy DepClass = DepClassTy::OPTIONAL,
+                     bool CheckForDeadStore = false);

   /// Return true if \p U is assumed dead.
   ///
@@ -3322,6 +3331,10 @@
   /// Helper function specific for intrinsics which are potentially volatile.
   static bool isNoSyncIntrinsic(const Instruction *I);

+  /// Helper function to determine if \p CB is an aligned (GPU) barrier.
+  /// Aligned barriers have to be executed by all threads.
+  static bool isAlignedBarrier(const CallBase &CB);
+
   /// Create an abstract attribute view for the position \p IRP.
   static AANoSync &createForPosition(const IRPosition &IRP, Attributor &A);

@@ -3616,9 +3629,6 @@
   /// Returns true if the underlying value is known dead.
   virtual bool isKnownDead() const = 0;

-  /// Returns true if \p BB is assumed dead.
-  virtual bool isAssumedDead(const BasicBlock *BB) const = 0;
-
   /// Returns true if \p BB is known dead.
   virtual bool isKnownDead(const BasicBlock *BB) const = 0;

@@ -3657,6 +3667,9 @@
     return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F);
   }

+  /// Returns true if \p BB is assumed dead.
+  virtual bool isAssumedDead(const BasicBlock *BB) const = 0;
+
   /// Return if the edge from \p From BB to \p To BB is assumed dead.
   /// This is specifically useful in AAReachability.
   virtual bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const {
@@ -4986,6 +4999,47 @@
   using Base = StateWrapper;
   AAExecutionDomain(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

+  /// Summary about the execution domain of a block or instruction.
+  struct ExecutionDomainTy {
+    using BarriersSetTy = SmallPtrSet;
+    using AssumesSetTy = SmallPtrSet;
+
+    ExecutionDomainTy() {}
+    ~ExecutionDomainTy() {
+      // Cleanup has to happen by the user.
+    }
+
+    void addAssumeInst(Attributor &A, AssumeInst &AI) {
+      if (!IsReachedFromAlignedBarrierOnly || EncounteredNonLocalSideEffect)
+        return;
+      if (!EncounteredAssumes)
+        EncounteredAssumes = new (A.Allocator) AssumesSetTy();
+      EncounteredAssumes->insert(&AI);
+    }
+
+    void addAlignedBarrier(Attributor &A, CallBase &CB) {
+      if (!IsReachedFromAlignedBarrierOnly || EncounteredNonLocalSideEffect)
+        return;
+      if (!AlignedBarriers)
+        AlignedBarriers = new (A.Allocator) BarriersSetTy();
+      AlignedBarriers->insert(&CB);
+    }
+
+    void clearAssumeInstAndAlignedBarriers() {
+      if (EncounteredAssumes)
+        EncounteredAssumes->clear();
+      if (AlignedBarriers)
+        AlignedBarriers->clear();
+    }
+
+    bool IsExecutedByInitialThreadOnly = true;
+    bool IsReachedFromAlignedBarrierOnly = true;
+    bool IsReachingAlignedBarrierOnly = true;
+    bool EncounteredNonLocalSideEffect = false;
+    BarriersSetTy *AlignedBarriers = nullptr;
+    AssumesSetTy *EncounteredAssumes = nullptr;
+  };
+
   /// Create an abstract attribute view for the position \p IRP.
   static AAExecutionDomain &createForPosition(const IRPosition &IRP,
                                               Attributor &A);
@@ -4997,11 +5051,17 @@
   const char *getIdAddr() const override { return &ID; }

   /// Check if an instruction is executed only by the initial thread.
-  virtual bool isExecutedByInitialThreadOnly(const Instruction &) const = 0;
+  bool isExecutedByInitialThreadOnly(const Instruction &I) const {
+    return isExecutedByInitialThreadOnly(*I.getParent());
+  }

   /// Check if a basic block is executed only by the initial thread.
   virtual bool isExecutedByInitialThreadOnly(const BasicBlock &) const = 0;

+  virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const = 0;
+  virtual ExecutionDomainTy getExecutionDomain(const CallBase &) const = 0;
+  virtual ExecutionDomainTy getFunctionExecutionDomain() const = 0;
+
   /// This function should return true if the type of the \p AA is
   /// AAExecutionDomain.
   static bool classof(const AbstractAttribute *AA) {
Index: llvm/lib/Transforms/IPO/Attributor.cpp
===================================================================
--- llvm/lib/Transforms/IPO/Attributor.cpp
+++ llvm/lib/Transforms/IPO/Attributor.cpp
@@ -782,6 +782,38 @@
   return false;
 }

+bool AA::isPotentiallyAffectedByBarrier(Attributor &A,
+                                        std::optional Loc,
+                                        const AbstractAttribute &QueryingAA,
+                                        Instruction *CtxI) {
+  if (!Loc || !Loc->Ptr) {
+    LLVM_DEBUG(
+        dbgs() << "[AA] Access to unknown location; -> requires barriers\n");
+    return true;
+  }
+
+  SmallSetVector Objects;
+  bool UsedAssumedInformation = false;
+  if (!AA::getAssumedUnderlyingObjects(A, *Loc->Ptr, Objects, QueryingAA, CtxI,
+                                       UsedAssumedInformation,
+                                       AA::Intraprocedural)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "[AA] Failed to acquire underlying objects; -> requires barriers\n");
+    return true;
+  }
+
+  for (Value *Obj : Objects) {
+    if (AA::isAssumedThreadLocalObject(A, *Obj, QueryingAA))
+      continue;
+
+    LLVM_DEBUG(dbgs() << "[AA] Access to '" << *Obj << "' requires barrier\n");
+    return true;
+  }
+  return false;
+}
+
+
 /// Return true if \p New is equal or worse than \p Old.
 static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
   if (!Old.isIntAttribute())
@@ -1347,7 +1379,8 @@
                             const AbstractAttribute *QueryingAA,
                             const AAIsDead *FnLivenessAA,
                             bool &UsedAssumedInformation,
-                            bool CheckBBLivenessOnly, DepClassTy DepClass) {
+                            bool CheckBBLivenessOnly, DepClassTy DepClass,
+                            bool CheckForDeadStore) {
   const IRPosition::CallBaseContext *CBCtx = QueryingAA ?
QueryingAA->getCallBaseContext() : nullptr; @@ -1389,6 +1422,14 @@ return true; } + if (CheckForDeadStore && isa(I) && IsDeadAA.isRemovableStore()) { + if (QueryingAA) + recordDependence(IsDeadAA, *QueryingAA, DepClass); + if (!IsDeadAA.isKnownDead()) + UsedAssumedInformation = true; + return true; + } + return false; } Index: llvm/lib/Transforms/IPO/AttributorAttributes.cpp =================================================================== --- llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -50,6 +50,8 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -2238,6 +2240,20 @@ /// ------------------------ NoSync Function Attribute ------------------------- +bool AANoSync::isAlignedBarrier(const CallBase &CB) { + switch (CB.getIntrinsicID()) { + case Intrinsic::nvvm_barrier0: + case Intrinsic::nvvm_barrier0_and: + case Intrinsic::nvvm_barrier0_or: + case Intrinsic::nvvm_barrier0_popc: + return true; + // TODO: Check for amdgcn_s_barrier executed in a uniform/aligned way. + default: + break; + } + return hasAssumption(CB, KnownAssumptionString("ompx_aligned_barrier")); +} + bool AANoSync::isNonRelaxedAtomic(const Instruction *I) { if (!I->isAtomic()) return false; Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -32,6 +33,7 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/Assumptions.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/GlobalValue.h" @@ -51,6 +53,7 @@ #include #include +#include using namespace llvm; using namespace omp; @@ -817,8 +820,6 @@ if (remarksEnabled()) analysisGlobalization(); - - Changed |= eliminateBarriers(); } else { if (PrintICVValues) printICVs(); @@ -841,8 +842,6 @@ Changed = true; } } - - Changed |= eliminateBarriers(); } return Changed; @@ -1408,223 +1407,6 @@ return Changed; } - /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels. - /// TODO: Make this an AA and expand it to work across blocks and functions. 
- bool eliminateBarriers() { - bool Changed = false; - - if (DisableOpenMPOptBarrierElimination) - return /*Changed=*/false; - - if (OMPInfoCache.Kernels.empty()) - return /*Changed=*/false; - - enum ImplicitBarrierType { IBT_ENTRY, IBT_EXIT }; - - class BarrierInfo { - Instruction *I; - enum ImplicitBarrierType Type; - - public: - BarrierInfo(enum ImplicitBarrierType Type) : I(nullptr), Type(Type) {} - BarrierInfo(Instruction &I) : I(&I) {} - - bool isImplicit() { return !I; } - - bool isImplicitEntry() { return isImplicit() && Type == IBT_ENTRY; } - - bool isImplicitExit() { return isImplicit() && Type == IBT_EXIT; } - - Instruction *getInstruction() { return I; } - }; - - for (Function *Kernel : OMPInfoCache.Kernels) { - for (BasicBlock &BB : *Kernel) { - SmallVector BarriersInBlock; - SmallPtrSet BarriersToBeDeleted; - - // Add the kernel entry implicit barrier. - if (&Kernel->getEntryBlock() == &BB) - BarriersInBlock.push_back(IBT_ENTRY); - - // Find implicit and explicit aligned barriers in the same basic block. - for (Instruction &I : BB) { - if (isa(I)) { - // Add the implicit barrier when exiting the kernel. - BarriersInBlock.push_back(IBT_EXIT); - continue; - } - CallBase *CB = dyn_cast(&I); - if (!CB) - continue; - - auto IsAlignBarrierCB = [&](CallBase &CB) { - switch (CB.getIntrinsicID()) { - case Intrinsic::nvvm_barrier0: - case Intrinsic::nvvm_barrier0_and: - case Intrinsic::nvvm_barrier0_or: - case Intrinsic::nvvm_barrier0_popc: - return true; - default: - break; - } - return hasAssumption(CB, - KnownAssumptionString("ompx_aligned_barrier")); - }; - - if (IsAlignBarrierCB(*CB)) { - // Add an explicit aligned barrier. - BarriersInBlock.push_back(I); - } - } - - if (BarriersInBlock.size() <= 1) - continue; - - // A barrier in a barrier pair is removeable if all instructions - // between the barriers in the pair are side-effect free modulo the - // barrier operation. - auto IsBarrierRemoveable = [&Kernel]( - BarrierInfo *StartBI, BarrierInfo *EndBI, - SmallVector &Assumptions) { - assert( - !StartBI->isImplicitExit() && - "Expected start barrier to be other than a kernel exit barrier"); - assert( - !EndBI->isImplicitEntry() && - "Expected end barrier to be other than a kernel entry barrier"); - // If StarBI instructions is null then this the implicit - // kernel entry barrier, so iterate from the first instruction in the - // entry block. - Instruction *I = (StartBI->isImplicitEntry()) - ? &Kernel->getEntryBlock().front() - : StartBI->getInstruction()->getNextNode(); - assert(I && "Expected non-null start instruction"); - Instruction *E = (EndBI->isImplicitExit()) - ? I->getParent()->getTerminator() - : EndBI->getInstruction(); - assert(E && "Expected non-null end instruction"); - - for (; I != E; I = I->getNextNode()) { - if (!I->mayHaveSideEffects() && !I->mayReadFromMemory()) - continue; - - auto IsPotentiallyAffectedByBarrier = - [](std::optional Loc) { - const Value *Obj = (Loc && Loc->Ptr) - ? 
getUnderlyingObject(Loc->Ptr) - : nullptr; - if (!Obj) { - LLVM_DEBUG( - dbgs() - << "Access to unknown location requires barriers\n"); - return true; - } - if (isa(Obj)) - return false; - if (isa(Obj)) - return false; - if (auto *GV = dyn_cast(Obj)) { - if (GV->isConstant()) - return false; - if (GV->isThreadLocal()) - return false; - if (GV->getAddressSpace() == (int)AddressSpace::Local) - return false; - if (GV->getAddressSpace() == (int)AddressSpace::Constant) - return false; - } - LLVM_DEBUG(dbgs() << "Access to '" << *Obj - << "' requires barriers\n"); - return true; - }; - - if (MemIntrinsic *MI = dyn_cast(I)) { - std::optional Loc = - MemoryLocation::getForDest(MI); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - if (MemTransferInst *MTI = dyn_cast(I)) { - std::optional Loc = - MemoryLocation::getForSource(MTI); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - } - continue; - } - - if (auto *AI = dyn_cast(I)) { - Assumptions.push_back(AI); - continue; - } - - if (auto *LI = dyn_cast(I)) - if (LI->hasMetadata(LLVMContext::MD_invariant_load)) - continue; - - std::optional Loc = MemoryLocation::getOrNone(I); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - } - - return true; - }; - - // Iterate barrier pairs and remove an explicit barrier if analysis - // deems it removeable. - for (auto *It = BarriersInBlock.begin(), - *End = BarriersInBlock.end() - 1; - It != End; ++It) { - - BarrierInfo *StartBI = It; - BarrierInfo *EndBI = (It + 1); - - // Cannot remove when both are implicit barriers, continue. - if (StartBI->isImplicit() && EndBI->isImplicit()) - continue; - - SmallVector Assumptions; - if (!IsBarrierRemoveable(StartBI, EndBI, Assumptions)) - continue; - - assert(!(StartBI->isImplicit() && EndBI->isImplicit()) && - "Expected at least one explicit barrier to remove."); - - for (auto *Assumption : Assumptions) - Assumption->eraseFromParent(); - - // Remove an explicit barrier, check first, then second. 
- if (!StartBI->isImplicit()) { - LLVM_DEBUG(dbgs() << "Remove start barrier " - << *StartBI->getInstruction() << "\n"); - BarriersToBeDeleted.insert(StartBI->getInstruction()); - } else { - LLVM_DEBUG(dbgs() << "Remove end barrier " - << *EndBI->getInstruction() << "\n"); - BarriersToBeDeleted.insert(EndBI->getInstruction()); - } - } - - if (BarriersToBeDeleted.empty()) - continue; - - Changed = true; - for (Instruction *I : BarriersToBeDeleted) { - ++NumBarriersEliminated; - auto Remark = [&](OptimizationRemark OR) { - return OR << "Redundant barrier eliminated."; - }; - - if (EnableVerboseRemarks) - emitRemark(I, "OMP190", Remark); - I->eraseFromParent(); - } - } - } - - return Changed; - } - void analysisGlobalization() { auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; @@ -2768,77 +2550,154 @@ AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) : AAExecutionDomain(IRP, A) {} + ~AAExecutionDomainFunction() { + for (auto &It : BEDMap) { + if (It.second.EncounteredAssumes) + It.second.EncounteredAssumes->~SmallPtrSet(); + if (It.second.AlignedBarriers) + It.second.AlignedBarriers->~SmallPtrSet(); + } + for (auto &It : CEDMap) { + if (It.second.EncounteredAssumes) + It.second.EncounteredAssumes->~SmallPtrSet(); + if (It.second.AlignedBarriers) + It.second.AlignedBarriers->~SmallPtrSet(); + } + } + + void initialize(Attributor &A) override { + if (getAnchorScope()->isDeclaration()) { + indicatePessimisticFixpoint(); + return; + } + RPOT = new ReversePostOrderTraversal(getAnchorScope()); + } + const std::string getAsStr() const override { - return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) + - "/" + std::to_string(NumBBs) + " BBs thread 0 only."; + // TODO: Print useful summary. + const auto &ED = BEDMap.lookup(nullptr); + return "[AAExecutionDomain] " + + std::to_string(ED.IsReachingAlignedBarrierOnly); } /// See AbstractAttribute::trackStatistics(). void trackStatistics() const override {} - void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - for (const auto &BB : *F) - SingleThreadedBBs.insert(&BB); - NumBBs = SingleThreadedBBs.size(); - } - ChangeStatus manifest(Attributor &A) override { LLVM_DEBUG({ - for (const BasicBlock *BB : SingleThreadedBBs) + for (const BasicBlock &BB : *getAnchorScope()) { + if (!isExecutedByInitialThreadOnly(BB)) + continue; dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " " - << BB->getName() << " is executed by a single thread.\n"; + << BB.getName() << " is executed by a single thread.\n"; + } }); - return ChangeStatus::UNCHANGED; - } - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus Changed = ChangeStatus::UNCHANGED; - /// Check if an instruction is executed by a single thread. - bool isExecutedByInitialThreadOnly(const Instruction &I) const override { - return isExecutedByInitialThreadOnly(*I.getParent()); - } + if (DisableOpenMPOptBarrierElimination) + return Changed; - bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { - return isValidState() && SingleThreadedBBs.contains(&BB); + SmallPtrSet DeletedBarriers; + auto HandleAlignedBarrier = [&](CallBase *CB) { + const ExecutionDomainTy &ED = CEDMap[CB]; + if (!ED.IsReachedFromAlignedBarrierOnly || + ED.EncounteredNonLocalSideEffect) + return; + + // We can remove this barrier, if it is one, or all aligned barriers + // reaching the kernel end. 
In the latter case we can transitively work
+      // our way back until we find a barrier that guards a side-effect if we
+      // are dealing with the kernel end here.
+      if (CB) {
+        DeletedBarriers.insert(CB);
+        A.deleteAfterManifest(*CB);
+        ++NumBarriersEliminated;
+        Changed = ChangeStatus::CHANGED;
+      } else if (ED.AlignedBarriers) {
+        NumBarriersEliminated += ED.AlignedBarriers->size();
+        Changed = ChangeStatus::CHANGED;
+        SmallVector Worklist(ED.AlignedBarriers->begin(),
+                             ED.AlignedBarriers->end());
+        SmallSetVector Visited;
+        while (!Worklist.empty()) {
+          CallBase *LastCB = Worklist.pop_back_val();
+          if (!Visited.insert(LastCB))
+            continue;
+          if (!DeletedBarriers.count(LastCB)) {
+            A.deleteAfterManifest(*LastCB);
+            continue;
+          }
+          // The final aligned barrier (LastCB) reaching the kernel end was
+          // removed already. This means we can go one step further and remove
+          // the barriers encountered last before (LastCB).
+          const ExecutionDomainTy &LastED = CEDMap[LastCB];
+          if (!LastED.AlignedBarriers)
+            continue;
+          Worklist.append(LastED.AlignedBarriers->begin(),
+                          LastED.AlignedBarriers->end());
+        }
+      }
+
+      // If we actually eliminated a barrier, we need to eliminate the
+      // associated llvm.assumes as well to avoid creating UB.
+      if (ED.EncounteredAssumes && (CB || ED.AlignedBarriers))
+        for (auto *AssumeCB : *ED.EncounteredAssumes)
+          A.deleteAfterManifest(*AssumeCB);
+    };
+
+    for (auto *CB : AlignedBarriers)
+      HandleAlignedBarrier(CB);
+
+    auto &OMPInfoCache = static_cast(A.getInfoCache());
+    // Handle the "kernel end barrier" for kernels too.
+    if (OMPInfoCache.Kernels.count(getAnchorScope()))
+      HandleAlignedBarrier(nullptr);
+
+    return Changed;
   }

-  /// Set of basic blocks that are executed by a single thread.
-  SmallSetVector SingleThreadedBBs;
+  void
+  mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
+                                           const ExecutionDomainTy &PredED);

-  /// Total number of basic blocks in this function.
- long unsigned NumBBs = 0; -}; + void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, + const ExecutionDomainTy &PredED, + bool InitialEdgeOnly = false); -ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { - Function *F = getAnchorScope(); - ReversePostOrderTraversal RPOT(F); - auto NumSingleThreadedBBs = SingleThreadedBBs.size(); + void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED); - bool AllCallSitesKnown; - auto PredForCallSite = [&](AbstractCallSite ACS) { - const auto &ExecutionDomainAA = A.getAAFor( - *this, IRPosition::function(*ACS.getInstruction()->getFunction()), - DepClassTy::REQUIRED); - return ACS.isDirectCall() && - ExecutionDomainAA.isExecutedByInitialThreadOnly( - *ACS.getInstruction()); - }; + ChangeStatus updateImpl(Attributor &A) override; - if (!A.checkForAllCallSites(PredForCallSite, *this, - /* RequiresAllCallSites */ true, - AllCallSitesKnown)) - SingleThreadedBBs.remove(&F->getEntryBlock()); + bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { + if (!isValidState()) + return false; + return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly; + } - auto &OMPInfoCache = static_cast(A.getInfoCache()); - auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return BEDMap.lookup(&BB); + } + ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return CEDMap.lookup(&CB); + } + ExecutionDomainTy getFunctionExecutionDomain() const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return BEDMap.lookup(nullptr); + } // Check if the edge into the successor block contains a condition that only // lets the main thread execute it. - auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { + bool isInitialThreadOnly(Attributor &A, BranchInst *Edge, + BasicBlock &SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; - if (Edge->getSuccessor(0) != SuccessorBB) + if (Edge->getSuccessor(0) != &SuccessorBB) return false; auto *Cmp = dyn_cast(Edge->getCondition()); @@ -2852,6 +2711,8 @@ // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) if (C->isAllOnesValue()) { auto *CB = dyn_cast(Cmp->getOperand(0)); + auto &OMPInfoCache = static_cast(A.getInfoCache()); + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; if (!CB) return false; @@ -2875,30 +2736,336 @@ return false; }; - // Merge all the predecessor states into the current basic block. A basic - // block is executed by a single thread if all of its predecessors are. - auto MergePredecessorStates = [&](BasicBlock *BB) { - if (pred_empty(BB)) - return SingleThreadedBBs.contains(BB); - - bool IsInitialThread = true; - for (BasicBlock *PredBB : predecessors(BB)) { - if (!IsInitialThreadOnly(dyn_cast(PredBB->getTerminator()), - BB)) - IsInitialThread &= SingleThreadedBBs.contains(PredBB); + /// Mapping containing information per block. 
+ DenseMap BEDMap; + DenseMap CEDMap; + SmallSetVector AlignedBarriers; + + ReversePostOrderTraversal *RPOT; +}; + +void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( + Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) { + if (PredED.EncounteredAssumes && !PredED.EncounteredAssumes->empty()) { + for (auto *EA : *PredED.EncounteredAssumes) + ED.addAssumeInst(A, *EA); + } + + if (PredED.AlignedBarriers && !PredED.AlignedBarriers->empty()) { + for (auto *AB : *PredED.AlignedBarriers) + ED.addAlignedBarrier(A, *AB); + } +} + +void AAExecutionDomainFunction::mergeInPredecessor( + Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, + bool InitialEdgeOnly) { + ED.IsExecutedByInitialThreadOnly = + InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && + ED.IsExecutedByInitialThreadOnly); + + ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly && + PredED.IsReachedFromAlignedBarrierOnly; + ED.EncounteredNonLocalSideEffect = + ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect; + if (ED.IsReachedFromAlignedBarrierOnly) + mergeInPredecessorBarriersAndAssumptions(A, ED, PredED); + else + ED.clearAssumeInstAndAlignedBarriers(); +} + +void AAExecutionDomainFunction::handleEntryBB(Attributor &A, + ExecutionDomainTy &EntryBBED) { + SmallVector PredExecDomains; + auto PredForCallSite = [&](AbstractCallSite ACS) { + const auto &EDAA = A.getAAFor( + *this, IRPosition::function(*ACS.getInstruction()->getFunction()), + DepClassTy::OPTIONAL); + if (!EDAA.getState().isValidState()) + return false; + PredExecDomains.emplace_back( + EDAA.getExecutionDomain(*cast(ACS.getInstruction()))); + return true; + }; + + bool AllCallSitesKnown; + if (A.checkForAllCallSites(PredForCallSite, *this, + /* RequiresAllCallSites */ true, + AllCallSitesKnown)) { + for (const auto &PredED : PredExecDomains) + mergeInPredecessor(A, EntryBBED, PredED); + + } else { + // We could not find all predecessors, so this is either a kernel or a + // function with external linkage (or with some other weird uses). + auto &OMPInfoCache = static_cast(A.getInfoCache()); + if (OMPInfoCache.Kernels.count(getAnchorScope())) { + EntryBBED.IsExecutedByInitialThreadOnly = false; + EntryBBED.IsReachedFromAlignedBarrierOnly = true; + EntryBBED.EncounteredNonLocalSideEffect = false; + } else { + EntryBBED.IsExecutedByInitialThreadOnly = false; + EntryBBED.IsReachedFromAlignedBarrierOnly = false; + EntryBBED.EncounteredNonLocalSideEffect = true; } + } + + auto &FnED = BEDMap[nullptr]; + FnED.IsReachingAlignedBarrierOnly &= + EntryBBED.IsReachedFromAlignedBarrierOnly; +} + +ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { + + bool Changed = false; - return IsInitialThread; + // Helper to deal with an aligned barrier encountered during the forward + // traversal. \p CB is the aligned barrier, \p ED is the execution domain when + // it was encountered. + auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) { + if (CB) + Changed |= AlignedBarriers.insert(CB); + // First, update the barrier ED kept in the separate CEDMap. + auto &CallED = CEDMap[CB]; + mergeInPredecessor(A, CallED, ED); + // Next adjust the ED we use for the traversal. + ED.EncounteredNonLocalSideEffect = false; + ED.IsReachedFromAlignedBarrierOnly = true; + // Aligned barrier collection has to come last. 
+    ED.clearAssumeInstAndAlignedBarriers();
+    ED.addAlignedBarrier(A, *CB);
+  };
+
+  auto &LivenessAA =
+      A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL);
+
+  // Set \p R to \p V and report true if that changed \p R.
+  auto SetAndRecord = [&](bool &R, bool V) {
+    bool Eq = (R == V);
+    R = V;
+    return !Eq;
+  };
+
+  auto &OMPInfoCache = static_cast(A.getInfoCache());
+
+  Function *F = getAnchorScope();
+  BasicBlock &EntryBB = F->getEntryBlock();
+
+  SmallVector SyncInstWorklist;
+  for (auto &RIt : *RPOT) {
+    BasicBlock &BB = *RIt;
+
+    ExecutionDomainTy ED;
+    // Propagate "incoming edges" into information about this block.
+    if (&BB == &EntryBB) {
+      handleEntryBB(A, ED);
+    } else {
+      // For live non-entry blocks we only propagate information via live edges.
+      if (LivenessAA.isAssumedDead(&BB))
+        continue;
+
+      for (auto *PredBB : predecessors(&BB)) {
+        if (LivenessAA.isEdgeDead(PredBB, &BB))
+          continue;
+        bool InitialEdgeOnly = isInitialThreadOnly(
+            A, dyn_cast(PredBB->getTerminator()), BB);
+        mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
+      }
+    }
+
+    // Now we traverse the block, accumulate effects in ED and attach
+    // information to calls.
+    for (Instruction &I : BB) {
+      bool UsedAssumedInformation;
+      if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation,
+                          /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,
+                          /* CheckForDeadStore */ true))
+        continue;
+
+      // Assumes and "assume-like" (dbg, lifetime, ...) are handled first;
+      // the former are collected, the latter are ignored.
+      if (auto *II = dyn_cast(&I)) {
+        if (auto *AI = dyn_cast_or_null(II)) {
+          ED.addAssumeInst(A, *AI);
+          continue;
+        }
+        // TODO: Should we also collect and delete lifetime markers?
+        if (II->isAssumeLikeIntrinsic())
+          continue;
+      }
+
+      auto *CB = dyn_cast(&I);
+      bool IsNoSync = AA::isNoSyncInst(A, I, *this);
+      bool IsAlignedBarrier =
+          !IsNoSync && CB && AANoSync::isAlignedBarrier(*CB);
+
+      // Next we check for calls. Aligned barriers are handled explicitly;
+      // everything else is kept for the backward traversal and will also
+      // affect our state.
+      if (CB) {
+        if (IsAlignedBarrier) {
+          HandleAlignedBarrier(CB, ED);
+          continue;
+        }
+
+        // Check the pointer(s) of a memory intrinsic explicitly.
+        if (MemIntrinsic *MI = dyn_cast(&I)) {
+          auto Loc = MemoryLocation::getForDest(MI);
+          if (!ED.EncounteredNonLocalSideEffect &&
+              AA::isPotentiallyAffectedByBarrier(A, Loc, *this, &I))
+            ED.EncounteredNonLocalSideEffect = true;
+          if (MemTransferInst *MTI = dyn_cast(&I)) {
+            auto Loc = MemoryLocation::getForSource(MTI);
+            if (!ED.EncounteredNonLocalSideEffect &&
+                AA::isPotentiallyAffectedByBarrier(A, Loc, *this, &I))
+              ED.EncounteredNonLocalSideEffect = true;
+          }
+          if (!IsNoSync) {
+            ED.IsReachedFromAlignedBarrierOnly = false;
+            SyncInstWorklist.push_back(&I);
+          }
+          continue;
+        }
+
+        // Record how we entered the call, then accumulate the effect of the
+        // call in ED for potential use by the callee.
+        auto &CallED = CEDMap[CB];
+        mergeInPredecessor(A, CallED, ED);
+
+        // If we have a sync-definition we can check if it starts/ends in an
+        // aligned barrier. If we are unsure we assume any sync breaks
+        // alignment.
+ Function *Callee = CB->getCalledFunction(); + if (!IsNoSync && Callee && !Callee->isDeclaration()) { + const auto &EDAA = A.getAAFor( + *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); + if (EDAA.getState().isValidState()) { + const auto &CalleeED = EDAA.getFunctionExecutionDomain(); + ED.IsReachedFromAlignedBarrierOnly = + CalleeED.IsReachedFromAlignedBarrierOnly; + if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly) + ED.EncounteredNonLocalSideEffect |= + CalleeED.EncounteredNonLocalSideEffect; + else + ED.EncounteredNonLocalSideEffect = + CalleeED.EncounteredNonLocalSideEffect; + if (!CalleeED.IsReachingAlignedBarrierOnly) + SyncInstWorklist.push_back(&I); + if (CalleeED.IsReachedFromAlignedBarrierOnly) + mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED); + continue; + } + } + ED.IsReachedFromAlignedBarrierOnly = + IsNoSync && ED.IsReachedFromAlignedBarrierOnly; + ED.EncounteredNonLocalSideEffect |= true; + if (!IsNoSync) + SyncInstWorklist.push_back(&I); + } + + if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) + continue; + + // If we have a callee we try to use fine-grained information to + // determine local side-effects. + if (CB) { + const auto &MemAA = A.getAAFor( + *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL); + + auto AccessPred = [&](const Instruction *I, const Value *Ptr, + AAMemoryLocation::AccessKind, + AAMemoryLocation::MemoryLocationsKind) { + return !AA::isPotentiallyAffectedByBarrier( + A, MemoryLocation::getAfter(Ptr), *this, + const_cast(I)); + }; + if (MemAA.getState().isValidState() && + MemAA.checkForAllAccessesToMemoryKind( + AccessPred, AAMemoryLocation::ALL_LOCATIONS)) + continue; + } + + if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I)) + continue; + + if (auto *LI = dyn_cast(&I)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) + continue; + + auto Loc = MemoryLocation::getOrNone(&I); + if (!ED.EncounteredNonLocalSideEffect && + AA::isPotentiallyAffectedByBarrier(A, Loc, *this, &I)) + ED.EncounteredNonLocalSideEffect = true; + } + + if (!isa(BB.getTerminator()) && + !BB.getTerminator()->getNumSuccessors()) { + + auto &FnED = BEDMap[nullptr]; + mergeInPredecessor(A, FnED, ED); + + if (OMPInfoCache.Kernels.count(F)) + HandleAlignedBarrier(nullptr, ED); + } + + ExecutionDomainTy &StoredED = BEDMap[&BB]; + ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly; + + // Check if we computed anything different as part of the forward + // traversal. We do not take assumptions and aligned barriers into account + // as they do not influence our state. Backward traversal values are + // handled later on. + if (ED.IsExecutedByInitialThreadOnly != + StoredED.IsExecutedByInitialThreadOnly || + ED.IsReachedFromAlignedBarrierOnly != + StoredED.IsReachedFromAlignedBarrierOnly || + ED.EncounteredNonLocalSideEffect != + StoredED.EncounteredNonLocalSideEffect) + Changed = true; + + // Update the state with the new value. + StoredED = std::move(ED); } - return (NumSingleThreadedBBs == SingleThreadedBBs.size()) - ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; + // Propagate (non-aligned) sync instruction effects backwards until the + // entry is hit or an aligned barrier. 
+ SmallSetVector Visited; + while (!SyncInstWorklist.empty()) { + Instruction *SyncInst = SyncInstWorklist.pop_back_val(); + Instruction *CurInst = SyncInst; + bool HitAlignedBarrier = false; + while ((CurInst = CurInst->getPrevNode())) { + auto *CB = dyn_cast(CurInst); + if (!CB) + continue; + auto &CallED = CEDMap[CB]; + if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + HitAlignedBarrier = AlignedBarriers.count(CB); + if (HitAlignedBarrier) + break; + } + if (HitAlignedBarrier) + continue; + BasicBlock *SyncBB = SyncInst->getParent(); + for (auto *PredBB : predecessors(SyncBB)) { + if (LivenessAA.isEdgeDead(PredBB, SyncBB)) + continue; + if (!Visited.insert(PredBB)) + continue; + SyncInstWorklist.push_back(PredBB->getTerminator()); + auto &PredED = BEDMap[PredBB]; + if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + } + if (SyncBB != &EntryBB) + continue; + auto &FnED = BEDMap[nullptr]; + if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + } + + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } /// Try to replace memory allocation calls called by a single thread with a @@ -2981,8 +3148,12 @@ Attributor::SimplifictionCallbackTy SCB = [](const IRPosition &, const AbstractAttribute *, bool &) -> std::optional { return nullptr; }; + + Function *F = getAnchorScope(); for (User *U : RFI.Declaration->users()) if (CallBase *CB = dyn_cast(U)) { + if (CB->getFunction() != F) + continue; MallocCalls.insert(CB); A.registerSimplificationCallback(IRPosition::callsite_returned(*CB), SCB); @@ -3087,12 +3258,16 @@ // Only consider malloc calls executed by a single thread with a constant. for (User *U : RFI.Declaration->users()) { - const auto &ED = A.getAAFor( - *this, IRPosition::function(*F), DepClassTy::REQUIRED); - if (CallBase *CB = dyn_cast(U)) + if (CallBase *CB = dyn_cast(U)) { + if (!MallocCalls.count(CB)) + continue; + + const auto &ED = A.getAAFor( + *this, IRPosition::function(*F), DepClassTy::REQUIRED); if (!isa(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) MallocCalls.remove(CB); + } } findPotentialRemovedFreeCalls(A); Index: llvm/test/Transforms/OpenMP/always_inline_device.ll =================================================================== --- llvm/test/Transforms/OpenMP/always_inline_device.ll +++ llvm/test/Transforms/OpenMP/always_inline_device.ll @@ -11,7 +11,7 @@ ; Function Attrs: convergent norecurse nounwind define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 { ; CHECK: Function Attrs: convergent norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_c0934fc2_foo_l4( +; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false) ; CHECK-NEXT: [[THREAD_ID_IN_BLOCK:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -58,7 +58,7 @@ ; Function Attrs: convergent nounwind define hidden void @bar() #1 { ; CHECK: Function Attrs: alwaysinline convergent nounwind -; CHECK-LABEL: define {{[^@]+}}@bar( +; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/OpenMP/barrier_removal.ll =================================================================== --- llvm/test/Transforms/OpenMP/barrier_removal.ll +++ llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature 
--check-globals -; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s +; RUN: opt < %s -S -passes=openmp-opt | FileCheck %s --check-prefixes=CHECK,MODULE +; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s --check-prefixes=CHECK,CGSCC +target triple = "amdgcn-amd-amdhsa" declare void @useI32(i32) declare void @unknown() @@ -102,7 +104,6 @@ ; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(4)* @GC2 to i32*), align 4 ; CHECK-NEXT: [[ARGC:%.*]] = addrspacecast i32 addrspace(4)* [[ARG]] to i32* ; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[ARGC]], align 4 -; CHECK-NEXT: call void @aligned_barrier() ; CHECK-NEXT: [[D:%.*]] = add i32 42, [[B]] ; CHECK-NEXT: [[E:%.*]] = add i32 [[D]], [[C]] ; CHECK-NEXT: call void @useI32(i32 [[E]]) @@ -164,7 +165,6 @@ ; CHECK-NEXT: [[A:%.*]] = load i32, i32* @PG1, align 4 ; CHECK-NEXT: store i32 [[A]], i32* [[LOC]], align 4 ; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(5)* @PG2 to i32*), align 4 -; CHECK-NEXT: call void @aligned_barrier() ; CHECK-NEXT: [[ARGC:%.*]] = addrspacecast i32 addrspace(5)* [[ARG]] to i32* ; CHECK-NEXT: store i32 [[B]], i32* [[ARGC]], align 4 ; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[LOC]], align 4 @@ -228,8 +228,625 @@ ret void } -!llvm.module.flags = !{!12,!13} -!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11} +define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: ret void +; + call void @llvm.nvvm.barrier0() + call void @aligned_barrier() + br i1 %c0, label %t0, label %f0 +t0: + call void @aligned_barrier() + br label %t0b +t0b: + call void @aligned_barrier() + br label %m +f0: + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void @aligned_barrier() + br label %m +m: + call void @aligned_barrier() + ret void +} + +define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; CHECK-NEXT: store i32 4, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: store i32 4, i32* [[P]], align 4 +; CHECK-NEXT: call void @llvm.nvvm.barrier0() +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: store i32 4, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.nvvm.barrier0() + store i32 4, i32* %p + call void @aligned_barrier() + br i1 %c0, label %t0, label %f0 +t0: + call void @aligned_barrier() + br label %t0b +t0b: + call void @aligned_barrier() + br label %m +f0: + call void @aligned_barrier() + store i32 4, i32* %p + call void @llvm.nvvm.barrier0() + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void 
@aligned_barrier() + br label %m +m: + store i32 4, i32* %p + call void @aligned_barrier() + ret void +} + +define void @multiple_blocks_non_kernel_1(i1 %c0, i1 %c1) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_1 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: call void @llvm.nvvm.barrier0() +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: ret void +; + call void @llvm.nvvm.barrier0() + call void @aligned_barrier() + br i1 %c0, label %t0, label %f0 +t0: + call void @aligned_barrier() + br label %t0b +t0b: + call void @aligned_barrier() + br label %m +f0: + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void @aligned_barrier() + br label %m +m: + call void @aligned_barrier() + ret void +} + +define void @multiple_blocks_non_kernel_2(i1 %c0, i1 %c1) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_2 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: ret void +; + br i1 %c0, label %t0, label %f0 +t0: + call void @aligned_barrier() + br label %t0b +t0b: + call void @aligned_barrier() + br label %m +f0: + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void @aligned_barrier() + br label %m +m: + call void @aligned_barrier() + ret void +} + +define void @multiple_blocks_non_kernel_3(i1 %c0, i1 %c1) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_3 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + br i1 %c0, label %t0, label %f0 +t0: + br label %t0b +t0b: + br label %m +f0: + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void @aligned_barrier() + br label %m +m: + call void @aligned_barrier() + ret void +} + +define void @multiple_blocks_non_kernel_effects_1(i1 %c0, i1 %c1, i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_effects_1 +; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] +; CHECK: t0: 
+; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: br label [[T0B:%.*]] +; CHECK: t0b: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: f0: +; CHECK-NEXT: store i32 2, i32* [[P]], align 4 +; CHECK-NEXT: br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: t1: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br label [[M]] +; CHECK: f1: +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: store i32 3, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + call void @aligned_barrier() + br i1 %c0, label %t0, label %f0 +t0: + call void @aligned_barrier() + store i32 1, i32* %p + br label %t0b +t0b: + call void @aligned_barrier() + br label %m +f0: + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + store i32 2, i32* %p + br i1 %c1, label %t1, label %f1 +t1: + call void @aligned_barrier() + br label %m +f1: + call void @aligned_barrier() + br label %m +m: + call void @aligned_barrier() + store i32 3, i32* %p + call void @aligned_barrier() + ret void +} + +define internal void @write_then_barrier0(i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@write_then_barrier0 +; MODULE-SAME: (i32* [[P:%.*]]) { +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@write_then_barrier0 +; CGSCC-SAME: (i32* [[P:%.*]]) { +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: ret void +; + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define internal void @barrier_then_write0(i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@barrier_then_write0 +; MODULE-SAME: (i32* [[P:%.*]]) { +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write0 +; CGSCC-SAME: (i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + ret void +} +define internal void @barrier_then_write_then_barrier0(i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0 +; MODULE-SAME: (i32* [[P:%.*]]) { +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0 +; CGSCC-SAME: (i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0 +; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; MODULE-NEXT: call void @barrier_then_write_then_barrier0(i32* [[P]]) +; MODULE-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; MODULE: t03: +; MODULE-NEXT: call void @barrier_then_write0(i32* [[P]]) +; MODULE-NEXT: br label [[T0B3:%.*]] +; MODULE: t0b3: +; MODULE-NEXT: br label [[M3:%.*]] +; MODULE: f03: +; MODULE-NEXT: call void @barrier_then_write0(i32* [[P]]) +; MODULE-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; MODULE: t13: +; MODULE-NEXT: br label [[M3]] +; 
MODULE: f13: +; MODULE-NEXT: br label [[M3]] +; MODULE: m3: +; MODULE-NEXT: call void @write_then_barrier0(i32* [[P]]) +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0 +; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @barrier_then_write_then_barrier0(i32* [[P]]) +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; CGSCC: t03: +; CGSCC-NEXT: call void @barrier_then_write0(i32* [[P]]) +; CGSCC-NEXT: br label [[T0B3:%.*]] +; CGSCC: t0b3: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3:%.*]] +; CGSCC: f03: +; CGSCC-NEXT: call void @barrier_then_write0(i32* [[P]]) +; CGSCC-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; CGSCC: t13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: f13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: m3: +; CGSCC-NEXT: call void @write_then_barrier0(i32* [[P]]) +; CGSCC-NEXT: ret void +; + call void @barrier_then_write_then_barrier0(i32* %p) + call void @aligned_barrier() + br i1 %c0, label %t03, label %f03 +t03: + call void @barrier_then_write0(i32* %p) + br label %t0b3 +t0b3: + call void @aligned_barrier() + br label %m3 +f03: + call void @aligned_barrier() + call void @barrier_then_write0(i32* %p) + br i1 %c1, label %t13, label %f13 +t13: + call void @aligned_barrier() + br label %m3 +f13: + call void @aligned_barrier() + br label %m3 +m3: + call void @aligned_barrier() + call void @write_then_barrier0(i32* %p) + ret void +} +define internal void @write_then_barrier1(i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@write_then_barrier1 +; CHECK-SAME: (i32* [[P:%.*]]) { +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define internal void @barrier_then_write1(i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@barrier_then_write1 +; MODULE-SAME: (i32* [[P:%.*]]) { +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write1 +; CGSCC-SAME: (i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + ret void +} +define internal void @barrier_then_write_then_barrier1(i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier1 +; CHECK-SAME: (i32* [[P:%.*]]) { +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define void @multiple_blocks_functions_non_kernel_effects_1(i1 %c0, i1 %c1, i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_1 +; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; MODULE-NEXT: call void @barrier_then_write_then_barrier1(i32* [[P]]) +; MODULE-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; MODULE: t03: +; MODULE-NEXT: call void @barrier_then_write1(i32* [[P]]) +; MODULE-NEXT: br label [[T0B3:%.*]] +; MODULE: t0b3: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br label [[M3:%.*]] +; MODULE: f03: +; MODULE-NEXT: call void @barrier_then_write1(i32* [[P]]) +; 
MODULE-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; MODULE: t13: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br label [[M3]] +; MODULE: f13: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br label [[M3]] +; MODULE: m3: +; MODULE-NEXT: call void @write_then_barrier1(i32* [[P]]) +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_1 +; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @barrier_then_write_then_barrier1(i32* [[P]]) +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; CGSCC: t03: +; CGSCC-NEXT: call void @barrier_then_write1(i32* [[P]]) +; CGSCC-NEXT: br label [[T0B3:%.*]] +; CGSCC: t0b3: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3:%.*]] +; CGSCC: f03: +; CGSCC-NEXT: call void @barrier_then_write1(i32* [[P]]) +; CGSCC-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; CGSCC: t13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: f13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: m3: +; CGSCC-NEXT: call void @write_then_barrier1(i32* [[P]]) +; CGSCC-NEXT: ret void +; + call void @barrier_then_write_then_barrier1(i32* %p) + call void @aligned_barrier() + br i1 %c0, label %t03, label %f03 +t03: + call void @barrier_then_write1(i32* %p) + br label %t0b3 +t0b3: + call void @aligned_barrier() + br label %m3 +f03: + call void @aligned_barrier() + call void @barrier_then_write1(i32* %p) + br i1 %c1, label %t13, label %f13 +t13: + call void @aligned_barrier() + br label %m3 +f13: + call void @aligned_barrier() + br label %m3 +m3: + call void @aligned_barrier() + call void @write_then_barrier1(i32* %p) + ret void +} + +define internal void @write_then_barrier2(i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@write_then_barrier2 +; CHECK-SAME: (i32* [[P:%.*]]) { +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define internal void @barrier_then_write2(i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@barrier_then_write2 +; CHECK-SAME: (i32* [[P:%.*]]) { +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + ret void +} +define internal void @barrier_then_write_then_barrier2(i32* %p) { +; CHECK-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier2 +; CHECK-SAME: (i32* [[P:%.*]]) { +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + store i32 0, i32* %p + call void @aligned_barrier() + ret void +} +define void @multiple_blocks_functions_non_kernel_effects_2(i1 %c0, i1 %c1, i32* %p) { +; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_2 +; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; MODULE-NEXT: call void @barrier_then_write_then_barrier2(i32* [[P]]) +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; MODULE: t03: +; MODULE-NEXT: call void @barrier_then_write2(i32* [[P]]) +; MODULE-NEXT: br label [[T0B3:%.*]] +; MODULE: t0b3: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br 
label [[M3:%.*]] +; MODULE: f03: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: call void @barrier_then_write2(i32* [[P]]) +; MODULE-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; MODULE: t13: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br label [[M3]] +; MODULE: f13: +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: br label [[M3]] +; MODULE: m3: +; MODULE-NEXT: call void @write_then_barrier2(i32* [[P]]) +; MODULE-NEXT: store i32 0, i32* [[P]], align 4 +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_2 +; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i32* [[P:%.*]]) { +; CGSCC-NEXT: call void @barrier_then_write_then_barrier2(i32* [[P]]) +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]] +; CGSCC: t03: +; CGSCC-NEXT: call void @barrier_then_write2(i32* [[P]]) +; CGSCC-NEXT: br label [[T0B3:%.*]] +; CGSCC: t0b3: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3:%.*]] +; CGSCC: f03: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: call void @barrier_then_write2(i32* [[P]]) +; CGSCC-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]] +; CGSCC: t13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: f13: +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: br label [[M3]] +; CGSCC: m3: +; CGSCC-NEXT: call void @write_then_barrier2(i32* [[P]]) +; CGSCC-NEXT: store i32 0, i32* [[P]], align 4 +; CGSCC-NEXT: ret void +; + call void @barrier_then_write_then_barrier2(i32* %p) + call void @aligned_barrier() + store i32 0, i32* %p + br i1 %c0, label %t03, label %f03 +t03: + call void @barrier_then_write2(i32* %p) + br label %t0b3 +t0b3: + call void @aligned_barrier() + br label %m3 +f03: + call void @aligned_barrier() + call void @barrier_then_write2(i32* %p) + br i1 %c1, label %t13, label %f13 +t13: + call void @aligned_barrier() + br label %m3 +f13: + call void @aligned_barrier() + br label %m3 +m3: + call void @aligned_barrier() + call void @write_then_barrier2(i32* %p) + store i32 0, i32* %p + ret void +} + +!llvm.module.flags = !{!16,!15} +!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14} !0 = !{void ()* @pos_empty_1, !"kernel", i32 1} !1 = !{void ()* @pos_empty_2, !"kernel", i32 1} @@ -243,16 +860,19 @@ !9 = !{void ()* @pos_priv_mem, !"kernel", i32 1} !10 = !{void ()* @neg_mem, !"kernel", i32 1} !11 = !{void ()* @pos_multiple, !"kernel", i32 1} -!12 = !{i32 7, !"openmp", i32 50} -!13 = !{i32 7, !"openmp-device", i32 50} +!12 = !{void (i1,i1)* @multiple_blocks_kernel_1, !"kernel", i32 1} +!13 = !{void (i1,i1,i32*)* @multiple_blocks_kernel_2, !"kernel", i32 1} +!14 = !{void (i1,i1,i32*)* @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} +!15 = !{i32 7, !"openmp", i32 50} +!16 = !{i32 7, !"openmp-device", i32 50} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_aligned_barrier" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } ;. 
-; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META2:![0-9]+]] = !{void ()* @pos_empty_1, !"kernel", i32 1} ; CHECK: [[META3:![0-9]+]] = !{void ()* @pos_empty_2, !"kernel", i32 1} ; CHECK: [[META4:![0-9]+]] = !{void ()* @pos_empty_3, !"kernel", i32 1} @@ -265,4 +885,7 @@ ; CHECK: [[META11:![0-9]+]] = !{void ()* @pos_priv_mem, !"kernel", i32 1} ; CHECK: [[META12:![0-9]+]] = !{void ()* @neg_mem, !"kernel", i32 1} ; CHECK: [[META13:![0-9]+]] = !{void ()* @pos_multiple, !"kernel", i32 1} +; CHECK: [[META14:![0-9]+]] = !{void (i1, i1)* @multiple_blocks_kernel_1, !"kernel", i32 1} +; CHECK: [[META15:![0-9]+]] = !{void (i1, i1, i32*)* @multiple_blocks_kernel_2, !"kernel", i32 1} +; CHECK: [[META16:![0-9]+]] = !{void (i1, i1, i32*)* @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} ;. Index: llvm/test/Transforms/OpenMP/deduplication_target.ll =================================================================== --- llvm/test/Transforms/OpenMP/deduplication_target.ll +++ llvm/test/Transforms/OpenMP/deduplication_target.ll @@ -18,7 +18,6 @@ ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_50_a3e09bf8_foo_l2 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 Index: llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll =================================================================== --- llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll +++ llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature ; RUN: opt < %s -S -passes=openmp-opt | FileCheck %s --check-prefix=MODULE ; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s --check-prefix=CGSCC @@ -7,12 +7,13 @@ declare noalias ptr @alloc() define internal i32 @nblist() { -; MODULE-LABEL: define {{[^@]+}}@nblist( +; MODULE-LABEL: define {{[^@]+}}@nblist +; MODULE-SAME: () #[[ATTR0:[0-9]+]] { ; MODULE-NEXT: [[TMP1:%.*]] = call ptr @alloc() -; MODULE-NEXT: call fastcc void @rec.internalized(ptr [[TMP1]], i64 0) +; MODULE-NEXT: call fastcc void @rec.internalized(ptr [[TMP1]], i64 0) #[[ATTR2:[0-9]+]] ; MODULE-NEXT: ret i32 0 ; -; CGSCC-LABEL: define {{[^@]+}}@nblist( +; CGSCC-LABEL: define {{[^@]+}}@nblist() { ; CGSCC-NEXT: [[TMP1:%.*]] = call ptr @alloc() ; CGSCC-NEXT: call fastcc void @rec(ptr [[TMP1]], i64 0) ; CGSCC-NEXT: ret i32 0 @@ -22,24 +23,23 @@ ret i32 0 } +; MODULE-LABEL: define {{[^@]+}}@rec.internalized + define fastcc void @rec(ptr %0, i64 %1) { -; MODULE-LABEL: define {{[^@]+}}@rec( -; MODULE-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0:%.*]], i64 [[TMP1:%.*]] +; MODULE-LABEL: define {{[^@]+}}@rec +; MODULE-SAME: (ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +; MODULE-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[TMP1]] ; MODULE-NEXT: store i32 0, ptr [[TMP3]], align 4 ; MODULE-NEXT: call fastcc void @rec(ptr [[TMP0]], i64 0) ; MODULE-NEXT: 
ret void ; -; CGSCC-LABEL: define {{[^@]+}}@rec( -; CGSCC-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0:%.*]], i64 [[TMP1:%.*]] +; CGSCC-LABEL: define {{[^@]+}}@rec +; CGSCC-SAME: (ptr nocapture writeonly [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; CGSCC-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[TMP1]] ; CGSCC-NEXT: store i32 0, ptr [[TMP3]], align 4 -; CGSCC-NEXT: call fastcc void @rec(ptr [[TMP0]], i64 0) +; CGSCC-NEXT: call fastcc void @rec(ptr nocapture writeonly [[TMP0]], i64 0) #[[ATTR1:[0-9]+]] ; CGSCC-NEXT: ret void ; -; CHECK-LABEL: define {{[^@]+}}@rec( -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0:%.*]], i64 [[TMP1:%.*]] -; CHECK-NEXT: store i32 0, ptr [[TMP3]], align 4 -; CHECK-NEXT: call fastcc void @rec(ptr [[TMP0]], i64 0) -; CHECK-NEXT: ret void %3 = getelementptr i32, ptr %0, i64 %1 store i32 0, ptr %3, align 4 call fastcc void @rec(ptr %0, i64 0) Index: llvm/test/Transforms/OpenMP/remove_globalization.ll =================================================================== --- llvm/test/Transforms/OpenMP/remove_globalization.ll +++ llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -70,17 +70,17 @@ ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 1 +; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@foo ; CHECK-DISABLED-SAME: () #[[ATTR0]] { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 1 +; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-DISABLED-NEXT: ret void ; entry: - %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12 + %0 = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !12 call void @use(i8* %0) call void @__kmpc_free_shared(i8* %0, i64 4) ret void @@ -88,23 +88,23 @@ define internal void @bar() { ; CHECK-LABEL: define {{[^@]+}}@bar -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]] -; CHECK-NEXT: call void @share(i8* nofree [[TMP0]]) #[[ATTR1]], !dbg [[DBG8]] -; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) #[[ATTR0]] +; CHECK-NEXT: [[TMP0:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: call void @share(i8* nofree [[TMP0]]) #[[ATTR0]], !dbg [[DBG8]] +; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) #[[ATTR5]] ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@bar -; CHECK-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-DISABLED-SAME: () #[[ATTR0]] { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]] -; CHECK-DISABLED-NEXT: call void @share(i8* nofree [[TMP0]]) #[[ATTR1]], !dbg [[DBG8]] -; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) #[[ATTR0]] +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-DISABLED-NEXT: call void @share(i8* nofree [[TMP0]]) #[[ATTR0]], !dbg [[DBG8]] +; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) #[[ATTR5]] ; CHECK-DISABLED-NEXT: ret void ; entry: - %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !13 + %0 = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !13 call void @share(i8* %0), 
!dbg !13 call void @__kmpc_free_shared(i8* %0, i64 4) ret void @@ -112,12 +112,12 @@ define internal void @use(i8* %x) { ; CHECK-LABEL: define {{[^@]+}}@use -; CHECK-SAME: (i8* [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (i8* [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@use -; CHECK-DISABLED-SAME: (i8* [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-DISABLED-SAME: (i8* [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: ret void ; @@ -127,13 +127,13 @@ define internal void @share(i8* %x) { ; CHECK-LABEL: define {{[^@]+}}@share -; CHECK-SAME: (i8* nofree [[X:%.*]]) #[[ATTR3:[0-9]+]] { +; CHECK-SAME: (i8* nofree [[X:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: store i8* [[X]], i8** @S, align 8 ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@share -; CHECK-DISABLED-SAME: (i8* nofree [[X:%.*]]) #[[ATTR3:[0-9]+]] { +; CHECK-DISABLED-SAME: (i8* nofree [[X:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: store i8* [[X]], i8** @S, align 8 ; CHECK-DISABLED-NEXT: ret void @@ -146,19 +146,17 @@ define void @unused() { ; CHECK-LABEL: define {{[^@]+}}@unused() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 1 -; CHECK-NEXT: call void @use(i8* undef) +; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@unused() { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 4), !dbg [[DBG11:![0-9]+]] -; CHECK-DISABLED-NEXT: call void @use(i8* [[TMP0]]) -; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5]], !dbg [[DBG11:![0-9]+]] +; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]], i64 4) #[[ATTR5]] ; CHECK-DISABLED-NEXT: ret void ; entry: - %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !14 + %0 = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !14 call void @use(i8* %0) call void @__kmpc_free_shared(i8* %0, i64 4) ret void @@ -166,9 +164,9 @@ define internal void @convert_and_move_alloca() { ; CHECK-LABEL: define {{[^@]+}}@convert_and_move_alloca -; CHECK-SAME: () #[[ATTR1]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 1 +; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-NEXT: [[IV_PTR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: br label [[INITLOOP:%.*]] ; CHECK: initloop: @@ -186,9 +184,9 @@ ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@convert_and_move_alloca -; CHECK-DISABLED-SAME: () #[[ATTR1]] { +; CHECK-DISABLED-SAME: () #[[ATTR0]] { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 1 +; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-DISABLED-NEXT: [[IV_PTR:%.*]] = alloca i32, align 4 ; CHECK-DISABLED-NEXT: br label [[INITLOOP:%.*]] ; CHECK-DISABLED: initloop: @@ -217,7 +215,7 @@ br label %loopbody loopbody: - %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !16 + %0 = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !16 call void @use(i8* %0) call void @__kmpc_free_shared(i8* %0, i64 4) %iv = load i32, i32* %iv_ptr @@ -263,19 +261,19 @@ !15 = !DILocation(line: 8, column: 2, scope: !9) !16 = !DILocation(line: 10, column: 2, scope: !9) ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind } -; CHECK: attributes #[[ATTR1]] = { nosync nounwind } -; CHECK: attributes #[[ATTR2]] = { nounwind memory(none) } -; CHECK: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind memory(write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind allocsize(0) } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR0]] = { nosync nounwind } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) } +; CHECK: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind memory(write) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind allocsize(0) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR5]] = { nounwind } ;. -; CHECK-DISABLED: attributes #[[ATTR0]] = { nounwind } -; CHECK-DISABLED: attributes #[[ATTR1]] = { nosync nounwind } -; CHECK-DISABLED: attributes #[[ATTR2]] = { nounwind memory(none) } -; CHECK-DISABLED: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind memory(write) } -; CHECK-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind allocsize(0) } -; CHECK-DISABLED: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK-DISABLED: attributes #[[ATTR0]] = { nosync nounwind } +; CHECK-DISABLED: attributes #[[ATTR1]] = { nounwind memory(none) } +; CHECK-DISABLED: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind memory(write) } +; CHECK-DISABLED: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind allocsize(0) } +; CHECK-DISABLED: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK-DISABLED: attributes #[[ATTR5]] = { nounwind } ;. ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) ; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c") Index: llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll =================================================================== --- llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll +++ llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll @@ -6,7 +6,7 @@ ; __kmpc functions define void @__kmpc_noinline() noinline nounwind { ; CHECK: Function Attrs: nounwind -; CHECK-LABEL: define {{[^@]+}}@__kmpc_noinline( +; CHECK-LABEL: @__kmpc_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -16,7 +16,7 @@ ; omp_X functions define void @omp_noinline() noinline nounwind { ; CHECK: Function Attrs: nounwind -; CHECK-LABEL: define {{[^@]+}}@omp_noinline( +; CHECK-LABEL: @omp_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -26,7 +26,7 @@ ; _OMP namespace define void @_ZN4ompx_noinline() noinline nounwind { ; CHECK: Function Attrs: nounwind -; CHECK-LABEL: define {{[^@]+}}@_ZN4ompx_noinline( +; CHECK-LABEL: @_ZN4ompx_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -38,7 +38,7 @@ define void @__kmpc_noinline_optnone() noinline optnone nounwind { ; CHECK: Function Attrs: noinline nounwind optnone -; CHECK-LABEL: define {{[^@]+}}@__kmpc_noinline_optnone( +; CHECK-LABEL: @__kmpc_noinline_optnone( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -47,7 +47,7 @@ } define void @omp_noinline_optnone() noinline optnone nounwind { ; CHECK: Function Attrs: noinline nounwind optnone -; CHECK-LABEL: define 
{{[^@]+}}@omp_noinline_optnone( +; CHECK-LABEL: @omp_noinline_optnone( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -57,7 +57,7 @@ ; _OMP namespace define void @_ZN4ompx_noinline_optnone() noinline optnone nounwind { ; CHECK: Function Attrs: noinline nounwind optnone -; CHECK-LABEL: define {{[^@]+}}@_ZN4ompx_noinline_optnone( +; CHECK-LABEL: @_ZN4ompx_noinline_optnone( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -66,7 +66,7 @@ } define void @a___kmpc_noinline() noinline nounwind { ; CHECK: Function Attrs: noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@a___kmpc_noinline( +; CHECK-LABEL: @a___kmpc_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -75,7 +75,7 @@ } define void @a_omp_noinline() noinline nounwind { ; CHECK: Function Attrs: noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@a_omp_noinline( +; CHECK-LABEL: @a_omp_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -84,7 +84,7 @@ } define void @a__ZN4ompx_noinline() noinline nounwind { ; CHECK: Function Attrs: noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@a__ZN4ompx_noinline( +; CHECK-LABEL: @a__ZN4ompx_noinline( ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/OpenMP/replace_globalization.ll =================================================================== --- llvm/test/Transforms/OpenMP/replace_globalization.ll +++ llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -150,7 +150,7 @@ ; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false) ; CHECK-NEXT: [[X:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: call void @unknown_no_openmp() -; CHECK-NEXT: call void @use.internalized(i8* nofree [[X]]) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree [[X]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR6]] ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; CHECK-NEXT: ret void @@ -163,14 +163,14 @@ ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[MASTER1:%.*]], label [[EXIT:%.*]] ; CHECK: master1: -; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x_shared, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x_shared, i32 0, i32 0) to i8*)) #[[ATTR3]] ; CHECK-NEXT: br label [[NEXT:%.*]] ; CHECK: next: ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[B0:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[B0]], label [[MASTER2:%.*]], label [[EXIT]] ; CHECK: master2: -; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y_shared, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y_shared, i32 0, i32 0) to i8*)) #[[ATTR3]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) @@ -185,7 +185,7 @@ ; CHECK-NEXT: br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]] ; CHECK: master3: ; CHECK-NEXT: [[Z:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG10:![0-9]+]] -; 
CHECK-NEXT: call void @use.internalized(i8* nofree [[Z]]) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree [[Z]]) #[[ATTR3]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[Z]], i64 24) #[[ATTR6]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -193,7 +193,7 @@ ; CHECK-NEXT: ret void ; ; -; CHECK: Function Attrs: nofree norecurse nounwind memory(write) +; CHECK: Function Attrs: nofree norecurse nosync nounwind memory(write) ; CHECK-LABEL: define {{[^@]+}}@use.internalized ; CHECK-SAME: (i8* nofree [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: @@ -208,7 +208,7 @@ ; CHECK-NEXT: ret void ; ; -; CHECK: Function Attrs: nosync nounwind allocsize(0) memory(read) +; CHECK: Function Attrs: norecurse nosync nounwind allocsize(0) memory(read) ; CHECK-LABEL: define {{[^@]+}}@__kmpc_alloc_shared ; CHECK-SAME: (i64 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[L:%.*]] = load i32, i32* @offset, align 4 @@ -222,9 +222,9 @@ ; ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } -; CHECK: attributes #[[ATTR1]] = { nofree norecurse nounwind memory(write) } -; CHECK: attributes #[[ATTR2]] = { nosync nounwind allocsize(0) memory(read) } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind } +; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) } +; CHECK: attributes #[[ATTR2]] = { norecurse nosync nounwind allocsize(0) memory(read) } +; CHECK: attributes #[[ATTR3]] = { nosync nounwind } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } ; CHECK: attributes #[[ATTR6]] = { nounwind } Index: llvm/test/Transforms/OpenMP/spmdization.ll =================================================================== --- llvm/test/Transforms/OpenMP/spmdization.ll +++ llvm/test/Transforms/OpenMP/spmdization.ll @@ -1998,10 +1998,9 @@ ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR4]] -; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct.kmp_task_t_with_privates* -; AMDGPU-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] -; AMDGPU-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP5]], i64 0) +; AMDGPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] +; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; AMDGPU-NEXT: br label [[COMMON_RET]] ; @@ -2053,10 +2052,9 @@ ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] ; NVPTX-NEXT: 
[[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR4]] -; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct.kmp_task_t_with_privates* -; NVPTX-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] -; NVPTX-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP5]], i64 0) +; NVPTX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] +; NVPTX-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; NVPTX-NEXT: br label [[COMMON_RET]] ; @@ -2109,10 +2107,9 @@ ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct.kmp_task_t_with_privates* -; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP5]], i64 0) +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] ; @@ -2164,10 +2161,9 @@ ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct.kmp_task_t_with_privates* -; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) 
#[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP5]], i64 0) +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] ; @@ -2515,9 +2511,6 @@ ; AMDGPU: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} ; AMDGPU: [[LOOP28]] = distinct !{!28, !23, !24} ; AMDGPU: [[LOOP29]] = distinct !{!29, !23, !24} -; AMDGPU: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; AMDGPU: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; AMDGPU: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} ;. ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -2549,9 +2542,6 @@ ; NVPTX: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} ; NVPTX: [[LOOP28]] = distinct !{!28, !23, !24} ; NVPTX: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; NVPTX: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; NVPTX: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} ;. ; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -2583,9 +2573,6 @@ ; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} ; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} ; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; AMDGPU-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; AMDGPU-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; AMDGPU-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} ;. ; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -2617,7 +2604,4 @@ ; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} ; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} ; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; NVPTX-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; NVPTX-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} ;. 
Index: llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll =================================================================== --- llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll +++ llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll @@ -163,7 +163,7 @@ ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: call void @generic_helper() #[[ATTR5]] +; CHECK-NEXT: call void @generic_helper() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2) ; CHECK-NEXT: ret void ; CHECK: worker.exit: @@ -176,7 +176,7 @@ ; CHECK-DISABLE-SPMDIZATION-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-DISABLE-SPMDIZATION-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK-DISABLE-SPMDIZATION: user_code.entry: -; CHECK-DISABLE-SPMDIZATION-NEXT: call void @generic_helper() #[[ATTR5]] +; CHECK-DISABLE-SPMDIZATION-NEXT: call void @generic_helper() #[[ATTR6:[0-9]+]] ; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1) ; CHECK-DISABLE-SPMDIZATION-NEXT: ret void ; CHECK-DISABLE-SPMDIZATION: worker.exit: @@ -202,7 +202,7 @@ ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: call void @leaf() #[[ATTR5]] +; CHECK-NEXT: call void @leaf() #[[ATTR6]] ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) @@ -212,7 +212,7 @@ ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-DISABLE-SPMDIZATION-NEXT: entry: ; CHECK-DISABLE-SPMDIZATION-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR5]] +; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR6]] ; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) #[[ATTR2:[0-9]+]] ; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* @__omp_outlined___wrapper.ID, i8** [[TMP1]], i64 0) @@ -234,7 +234,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 ; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] +; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] ; CHECK-NEXT: ret void ; ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@__omp_outlined__ @@ -242,7 +242,7 @@ ; CHECK-DISABLE-SPMDIZATION-NEXT: entry: ; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 ; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLE-SPMDIZATION-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] +; 
CHECK-DISABLE-SPMDIZATION-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] ; CHECK-DISABLE-SPMDIZATION-NEXT: ret void ; entry: @@ -336,13 +336,13 @@ ; CHECK-LABEL: define {{[^@]+}}@generic_helper ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @leaf() #[[ATTR5]] +; CHECK-NEXT: call void @leaf() #[[ATTR6]] ; CHECK-NEXT: ret void ; ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@generic_helper ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR4]] { ; CHECK-DISABLE-SPMDIZATION-NEXT: entry: -; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR5]] +; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR6]] ; CHECK-DISABLE-SPMDIZATION-NEXT: ret void ; entry: @@ -379,17 +379,19 @@ ; CHECK: attributes #[[ATTR1]] = { convergent noinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } ; CHECK: attributes #[[ATTR2]] = { nounwind } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } -; CHECK: attributes #[[ATTR4]] = { convergent noinline nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +; CHECK: attributes #[[ATTR4]] = { convergent noinline nosync nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } ; CHECK: attributes #[[ATTR5]] = { convergent nounwind } -; CHECK: attributes #[[ATTR6]] = { convergent } +; CHECK: attributes #[[ATTR6]] = { convergent nosync nounwind } +; CHECK: attributes #[[ATTR7]] = { convergent } ;. ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR1]] = { convergent noinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR2]] = { nounwind } ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } -; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR4]] = { convergent noinline nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR4]] = { convergent noinline nosync nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR5]] = { convergent nounwind } -; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR6]] = { convergent } +; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR6]] = { convergent nosync nounwind } +; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR7]] = { convergent } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} Index: llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll =================================================================== --- llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll +++ llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll @@ -13,53 +13,30 @@ ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4 ;. define void @kernel() "kernel" { -; TUNIT: Function Attrs: norecurse -; TUNIT-LABEL: define {{[^@]+}}@kernel -; TUNIT-SAME: () #[[ATTR0:[0-9]+]] { -; TUNIT-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* undef, i8 1, i1 false) -; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 -; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; TUNIT: if.then: -; TUNIT-NEXT: store i32 1, i32 addrspace(3)* @G, align 4 -; TUNIT-NEXT: br label [[IF_MERGE:%.*]] -; TUNIT: if.else: -; TUNIT-NEXT: call void @barrier() #[[ATTR4:[0-9]+]] -; TUNIT-NEXT: [[L:%.*]] = load i32, i32 addrspace(3)* @G, align 4 -; TUNIT-NEXT: call void @use1(i32 [[L]]) #[[ATTR4]] -; TUNIT-NEXT: br label [[IF_MERGE]] -; TUNIT: if.merge: -; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]] -; TUNIT: if.then2: -; TUNIT-NEXT: store i32 2, i32 addrspace(3)* @G, align 4 -; TUNIT-NEXT: call void @barrier() #[[ATTR4]] -; TUNIT-NEXT: br label [[IF_END]] -; TUNIT: if.end: -; TUNIT-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* undef, i8 1) -; TUNIT-NEXT: ret void ; -; CGSCC: Function Attrs: norecurse -; CGSCC-LABEL: define {{[^@]+}}@kernel -; CGSCC-SAME: () #[[ATTR0:[0-9]+]] { -; CGSCC-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* undef, i8 1, i1 false) -; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 -; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CGSCC: if.then: -; CGSCC-NEXT: store i32 1, i32 addrspace(3)* @G, align 4 -; CGSCC-NEXT: br label [[IF_MERGE:%.*]] -; CGSCC: if.else: -; CGSCC-NEXT: call void @barrier() -; CGSCC-NEXT: [[L:%.*]] = load i32, i32 addrspace(3)* @G, align 4 -; CGSCC-NEXT: call void @use1(i32 [[L]]) -; CGSCC-NEXT: br label [[IF_MERGE]] -; CGSCC: if.merge: -; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]] -; CGSCC: if.then2: -; CGSCC-NEXT: store i32 2, i32 addrspace(3)* @G, align 4 -; CGSCC-NEXT: call void @barrier() -; CGSCC-NEXT: br label [[IF_END]] -; CGSCC: if.end: -; CGSCC-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* undef, i8 1) -; CGSCC-NEXT: ret void +; CHECK: Function Attrs: norecurse +; CHECK-LABEL: define {{[^@]+}}@kernel +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* undef, i8 1, i1 false) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 1, i32 addrspace(3)* @G, align 4 +; CHECK-NEXT: br label [[IF_MERGE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: call void @barrier() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[L:%.*]] = load i32, i32 addrspace(3)* @G, align 4 +; CHECK-NEXT: call void @use1(i32 [[L]]) #[[ATTR4]] +; CHECK-NEXT: br label [[IF_MERGE]] +; CHECK: if.merge: +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]] +; CHECK: if.then2: +; CHECK-NEXT: store i32 2, i32 addrspace(3)* @G, align 4 +; CHECK-NEXT: call void 
@barrier() #[[ATTR4]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* undef, i8 1) +; CHECK-NEXT: ret void ; %call = call i32 @__kmpc_target_init(%struct.ident_t* undef, i8 1, i1 false) %cmp = icmp eq i32 %call, -1 @@ -96,20 +73,16 @@ !2 = !{void ()* @kernel, !"kernel", i32 1} ;. -; TUNIT: attributes #[[ATTR0]] = { norecurse "kernel" } -; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind } -; TUNIT: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind } -; TUNIT: attributes #[[ATTR3:[0-9]+]] = { nocallback } -; TUNIT: attributes #[[ATTR4]] = { nounwind } -;. -; CGSCC: attributes #[[ATTR0]] = { norecurse "kernel" } -; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind } -; CGSCC: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind } -; CGSCC: attributes #[[ATTR3:[0-9]+]] = { nocallback } +; CHECK: attributes #[[ATTR0]] = { norecurse "kernel" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback } +; CHECK: attributes #[[ATTR4]] = { nounwind } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel, !"kernel", i32 1} ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} +; CGSCC: {{.*}} +; TUNIT: {{.*}}