Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -15,6 +15,7 @@ #ifndef LLVM_CODEGEN_TAILDUPLICATOR_H #define LLVM_CODEGEN_TAILDUPLICATOR_H +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -52,9 +53,12 @@ static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB, bool IgnoreFallthrough); - bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB, - MachineBasicBlock *ForcedLayoutPred); + bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB); + bool tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred, + llvm::function_ref *RemovalCallback = nullptr); private: typedef TargetInstrInfo::RegSubRegPair RegSubRegPair; @@ -87,7 +91,9 @@ SmallVectorImpl> &CopyInfos, SmallVectorImpl &Copies); - void removeDeadBlock(MachineBasicBlock *MBB); + void removeDeadBlock( + MachineBasicBlock *MBB, + llvm::function_ref *RemovalCallback); }; } // End llvm namespace Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -179,6 +179,16 @@ /// \brief End of blocks within the chain. iterator end() { return Blocks.end(); } + /// \brief Remove \p BB from this chain; returns true if it was present. + bool remove(MachineBasicBlock *BB) { + for (iterator I = begin(), E = end(); I != E; ++I) + if (*I == BB) { + Blocks.erase(I); + return true; + } + return false; + } + /// \brief Merge a block chain into this one. /// /// This routine merges a block chain into this one. 
It takes care of forming @@ -215,6 +225,16 @@ for (MachineBasicBlock *MBB : *this) MBB->dump(); } + + /// \brief Print the chain's block numbers on one line, e.g. "[#1, #2]". + /// + /// An empty chain prints "[]". + void dump_brief() { + dbgs() << "["; + for (iterator I = begin(), E = end(); I != E; ++I) + dbgs() << (I == begin() ? "#" : ", #") << (*I)->getNumber(); + dbgs() << "]\n"; + } #endif // NDEBUG /// \brief Count of predecessors of any block within the chain which have not @@ -242,7 +262,7 @@ const MachineBlockFrequencyInfo *MBFI; /// \brief A handle to the loop info. - const MachineLoopInfo *MLI; + MachineLoopInfo *MLI; /// \brief A handle to the target's instruction info. const TargetInstrInfo *TII; @@ -260,10 +280,20 @@ /// must be done inline. TailDuplicator TailDup; - /// \brief A set of blocks that are unavoidably execute, i.e. they dominate - /// all terminators of the MachineFunction. + /// \brief A set of blocks that are unavoidably executed. + /// + /// i.e. they dominate + /// all terminators of the MachineFunction. Also used within loops for blocks + /// that are unavoidable within the loop. SmallPtrSet UnavoidableBlocks; + /// \brief A set of delayed blocks for tail-duplication. + /// + /// These blocks form a second spine through a loop/function, and so + /// predecessors within this set do not need to be able to be placed. + /// This allows the tail-duplicated spine to grow beyond 2 blocks. + SmallPtrSet TailDupDelayBlocks; + /// \brief Allocator and owner of BlockChain structures. 
/// /// We build BlockChains lazily while processing the loop structure of @@ -309,7 +339,7 @@ void buildChain(MachineBasicBlock *BB, BlockChain &Chain, SmallVectorImpl &BlockWorkList, SmallVectorImpl &EHPadWorkList, - const BlockFilterSet *BlockFilter = nullptr); + BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, @@ -323,6 +353,8 @@ void buildCFGChains(MachineFunction &F); void optimizeBranches(MachineFunction &F); void alignBlocks(MachineFunction &F); + void computeLoopUnavoidableBlocks(MachineLoop &L); + void computeUnavoidableBlocks(MachineFunction &F); public: static char ID; // Pass identification, replacement for typeid @@ -473,11 +505,39 @@ else SuccProb = BranchProbability(SuccProbN, SuccProbD); - // If we outline optional branches, look whether Succ is unavoidable, i.e. - // dominates all terminators of the MachineFunction. If it does, other - // successors must be optional. Don't do this for cold branches. - if (OutlineOptionalBranches && SuccProb > HotProb.getCompl() && - UnavoidableBlocks.count(Succ) > 0) { + // Check if Succ is unavoidable, for outlining with tail-duplication, in + // addition to straight outlining. 
+ if (UnavoidableBlocks.count(Succ) > 0 && SuccProb > HotProb.getCompl()) { + auto CanTailDuplicateAllPreds = [&]() { + DEBUG(dbgs() << "Checking to see if block " << getBlockName(Succ) + << " can tail duplicate into all its predecessors.\n"); + bool IsSimple = TailDup.isSimpleBB(Succ); + + bool IgnoreFallthrough = true; + if (!TailDup.shouldTailDuplicate(*Succ->getParent(), IsSimple, *Succ, + IgnoreFallthrough)) { + DEBUG(dbgs() << "Skipping because it is " + << "not a candidate for duplication.\n"); + return false; + } + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors are either part + // of the second spine, or can be tail-duplicated into. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + // If Pred is part of the growing second spine, we don't need to be + // able to copy succ onto the end of it. + if (TailDupDelayBlocks.count(Pred) > 0) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + DEBUG(dbgs() << "Skipping because it can't be duplicated into block " + << getBlockName(Pred) << ".\n"); + return false; + } + } + return true; + }; auto HasShortOptionalBranch = [&]() { for (MachineBasicBlock *Pred : Succ->predecessors()) { // Check whether there is an unplaced optional branch. @@ -493,8 +553,25 @@ } return false; }; - if (!HasShortOptionalBranch()) - return Succ; + if (OutlineOptionalBranches) { + // Don't outline a small single block branch. + if (!HasShortOptionalBranch()) + return Succ; + } + if (TailDupPlacement && CanTailDuplicateAllPreds()) { + if (!HasShortOptionalBranch()) { + // Add blocks that were tail-duplicated into to the delay set so that + // the second spine can keep growing. 
+ for (MachineBasicBlock *Pred : Succ->predecessors()) { + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain + || TailDupDelayBlocks.count(Pred) > 0) + continue; + TailDupDelayBlocks.insert(Pred); + } + return Succ; + } + } } // Only consider successors which are either "hot", or wouldn't violate @@ -646,6 +723,7 @@ SmallVectorImpl &BlockWorkList, SmallVectorImpl &EHPadWorkList, const BlockFilterSet *BlockFilter = nullptr) { + BlockChain &Chain = *BlockToChain[MBB]; if (!UpdatedPreds.insert(&Chain).second) return; @@ -676,9 +754,9 @@ MachineBasicBlock *BB, BlockChain &Chain, SmallVectorImpl &BlockWorkList, SmallVectorImpl &EHPadWorkList, - const BlockFilterSet *BlockFilter) { - assert(BB); - assert(BlockToChain[BB] == &Chain); + BlockFilterSet *BlockFilter) { + assert(BB && "BB must not be null.\n"); + assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction &F = *BB->getParent(); MachineFunction::iterator PrevUnplacedBlockIt = F.begin(); @@ -687,32 +765,15 @@ BlockFilter); BB = *std::prev(Chain.end()); for (;;) { - assert(BB); - assert(BlockToChain[BB] == &Chain); - assert(*std::prev(Chain.end()) == BB); + assert(BB && "null block found at end of chain in loop."); + assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); + assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain."); + // Look for the best viable successor if there is one to place immediately // after this block. MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); - // Placing an actual successor may have changed tail duplication - // opportunities. Check for that now. - if (TailDupPlacement && BestSucc) { - DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" - << BestSucc->getNumber() << "\n"); - bool IsSimple = TailDup.isSimpleBB(BestSucc); - bool IgnoreFallthrough = true; - // Simple blocks should just fallthrough, so only worry about non-simple - // ones. 
- if (!IsSimple && TailDup.shouldTailDuplicate(F, IsSimple, - *BestSucc, IgnoreFallthrough)) { - SmallVector Cond; - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (!TII->AnalyzeBranch(*BestSucc, TBB, FBB, Cond)) - TailDup.tailDuplicateAndUpdate(F, IsSimple, BestSucc, BB); - } - } - // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. @@ -731,13 +792,93 @@ "layout successor until the CFG reduces\n"); } + // Placement may have changed tail duplication opportunities. + // Check for that now. + if (TailDupPlacement && BestSucc) { + DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" + << BestSucc->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BestSucc); + bool IgnoreFallthrough = true; + if (TailDup.shouldTailDuplicate(F, IsSimple, + *BestSucc, IgnoreFallthrough)) { + SmallVector Cond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + + // Update UnscheduledPredecessors to reflect tail-duplication. + for (MachineBasicBlock *Pred : BestSucc->predecessors()) { + // We're only looking for unscheduled predecessors that match the + // filter. 
+ if (!TII->AnalyzeBranch(*BestSucc, TBB, FBB, Cond)) { + BlockChain* PredChain = BlockToChain[Pred]; + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || PredChain == &Chain) + continue; + if (TailDup.canTailDuplicate(BestSucc, Pred)) { + for (MachineBasicBlock *NewSucc : BestSucc->successors()) { + if (BlockFilter && !BlockFilter->count(NewSucc)) + continue; + BlockChain *NewChain = BlockToChain[NewSucc]; + if (NewChain != &Chain && NewChain != PredChain) + BlockToChain[NewSucc]->UnscheduledPredecessors++; + } + } + } + } + bool TailBlockRemoved = false; + auto RemovalCallback = + [&](MachineBasicBlock *RemBB) { + TailBlockRemoved = true; + // Remove from the chain that owns it (not necessarily the outer Chain). + if (BlockToChain.count(RemBB)) { + BlockChain *RemChain = BlockToChain[RemBB]; + RemChain->remove(RemBB); + BlockToChain.erase(RemBB); + } + // Handle the unplaced block iterator + if (&(*PrevUnplacedBlockIt) == RemBB) { + PrevUnplacedBlockIt++; + } + // Handle the Work Lists. Choose the list while binding the reference: + // assigning to a bound reference would copy EHPadWorkList's contents + // over BlockWorkList instead of retargeting it. + auto &RemoveList = RemBB->isEHPad() ? EHPadWorkList : BlockWorkList; + for (auto it = RemoveList.begin(); it != RemoveList.end(); ++it) { + if (*it == RemBB) { + RemoveList.erase(it); + break; + } + } + // Handle the filter set + if (BlockFilter) { + BlockFilter->erase(RemBB); + } + // Remove it from loops. + MLI->removeBlock(RemBB); + DEBUG(dbgs() << "TailDuplicator deleted block: " + << getBlockName(RemBB) << "\n"); + }; + auto RemovalCallbackRef = + llvm::function_ref(RemovalCallback); + TailDup.tailDuplicateAndUpdate(F, IsSimple, BestSucc, BB, + &RemovalCallbackRef); + // If the chosen successor was duplicated into all its predecessors, + // don't bother laying it out, just go round the loop again with BB as + // the chain end. + if (TailBlockRemoved) + continue; + } + } + + // Place this block, updating the datastructures to reflect its placement. 
BlockChain &SuccChain = *BlockToChain[BestSucc]; + TailDupDelayBlocks.erase(BestSucc); // Zero out UnscheduledPredecessors for the successor we're about to merge in case // we selected a successor that didn't fit naturally into the CFG. SuccChain.UnscheduledPredecessors = 0; DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " << getBlockName(BestSucc) << "\n"); + DEBUG(dbgs() << "Loop header is " << getBlockName(LoopHeaderBB) << "\n"); markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, EHPadWorkList, BlockFilter); Chain.merge(BestSucc, &SuccChain); @@ -1145,6 +1286,67 @@ return LoopBlockSet; } + +/// \brief Finds unavoidable blocks within a loop. +/// +/// These blocks form the loop spine, and knowing which blocks they are allows +/// the loop-optional blocks to be outlined to the end of the loop, +/// unconditionally or if they can form a second tail-duped spine. +void MachineBlockPlacement::computeLoopUnavoidableBlocks(MachineLoop &L) { + SmallVector Exits; + L.getExitingBlocks(Exits); + // Find the nearest common dominator of all of the loop's exiting blocks. + MachineBasicBlock * Terminator = nullptr; + for (MachineBasicBlock *MBB : Exits) { + if (Terminator == nullptr) + Terminator = MBB; + else + Terminator = MDT->findNearestCommonDominator(Terminator, MBB); + } + + // Loop blocks dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + // If there are no exiting blocks in the loop, punt and assume that there are + // no unavoidable blocks. This will result in a linear layout. + if (Terminator == nullptr) + return; + for (MachineBasicBlock *MBB : L.getBlocks()) { + if (MDT->dominates(MBB, Terminator)) { + UnavoidableBlocks.insert(MBB); + } + } +} + + +/// \brief Finds unavoidable blocks for the entire function. +/// +/// These blocks form the spine, and knowing which blocks they are allows +/// the optional blocks to be outlined to the end of the function +/// unconditionally or if they can form a second tail-duped spine. 
+void MachineBlockPlacement::computeUnavoidableBlocks(MachineFunction &F) { + // Fold every block without successors (a function terminator) into the + // nearest common dominator of all such blocks. + MachineBasicBlock *ExitDom = nullptr; + for (MachineBasicBlock &MBB : F) { + if (MBB.succ_size() != 0) + continue; + ExitDom = ExitDom ? MDT->findNearestCommonDominator(ExitDom, &MBB) : &MBB; + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + // If no block of the function lacks successors, punt and assume that + // there are no unavoidable blocks. + // This will result in a linear layout. + if (!ExitDom) + return; + for (MachineBasicBlock &MBB : F) { + if (!MDT->dominates(&MBB, ExitDom)) + continue; + UnavoidableBlocks.insert(&MBB); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -1162,6 +1364,13 @@ SmallVector EHPadWorkList; BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); + // Find the unavoidable blocks within this loop. This allows partial outlining + // with tail duplication within a loop. + if (TailDupPlacement) { + computeLoopUnavoidableBlocks(L); + TailDupDelayBlocks.clear(); + } + // Check if we have profile data for this function. If yes, we will rotate // this loop by modeling costs more precisely which requires the profile data // for better layout. @@ -1268,31 +1477,17 @@ } } - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. - UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } - // Build any loop-based chains. 
for (MachineLoop *L : *MLI) buildLoopChains(F, *L); + // This must go after the loop chains, because the loop chains compute their + // own loop-relative UnavoidableBlocks + if (OutlineOptionalBranches || TailDupPlacement) { + computeUnavoidableBlocks(F); + TailDupDelayBlocks.clear(); + } + SmallVector BlockWorkList; SmallVector EHPadWorkList; Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -122,7 +122,8 @@ /// Tail duplicate the block and cleanup. bool TailDuplicator::tailDuplicateAndUpdate( MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, - MachineBasicBlock *ForcedLayoutPred) { + MachineBasicBlock *ForcedLayoutPred, + llvm::function_ref *RemovalCallback) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); @@ -147,7 +148,7 @@ // If it is dead, remove it. if (isDead) { NumInstrDups -= MBB->size(); - removeDeadBlock(MBB); + removeDeadBlock(MBB, RemovalCallback); ++NumDeadBlocks; } @@ -512,7 +513,7 @@ bool IgnoreFallthrough) { // IgnoreFallthrough is set when considering duplication during layout. // Because the ultimate layout may change, it is better to consider - // duplicating blocks that can't fall through. + // duplicating blocks that can fall through. if (TailBB.canFallThrough() && !IgnoreFallthrough) return false; @@ -731,6 +732,29 @@ return Changed; } +/// Returns true if PredBB has at most one successor and AnalyzeBranch succeeds +/// on it with an empty condition. TailBB is not inspected by these checks. +bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, + MachineBasicBlock *PredBB) { + // EH edges are ignored by AnalyzeBranch. 
+ if (PredBB->succ_size() > 1) { + DEBUG(dbgs() << "Predecessor doesn't unconditionally jump to tail.\n"); + return false; + } + + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector PredCond; + if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)) { + DEBUG(dbgs() << "Branch cannot be analyzed.\n"); + return false; + } + if (!PredCond.empty()) { + DEBUG(dbgs() << "PredCond is not empty.\n"); + return false; + } + return true; +} + /// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, @@ -756,19 +778,12 @@ PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; - assert(TailBB != PredBB && "Single-block loop should have been rejected earlier!"); - // EH edges are ignored by AnalyzeBranch. - if (PredBB->succ_size() > 1) - continue; - MachineBasicBlock *PredTBB, *PredFBB; - SmallVector PredCond; - if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)) - continue; - if (!PredCond.empty()) + if (!canTailDuplicate(TailBB, PredBB)) continue; + // Don't duplicate into a fall-through predecessor (at least for now). bool IsLayoutSuccessor = false; if (ForcedLayoutPred) @@ -789,8 +804,9 @@ if (RS && !TailBB->livein_empty()) { // Update PredBB livein. 
RS->enterBasicBlock(*PredBB); - if (!PredBB->empty()) + if (!PredBB->empty()) { RS->forward(std::prev(PredBB->end())); + } for (const auto &LI : TailBB->liveins()) { if (!RS->isRegUsed(LI.PhysReg, false)) // If a register is previously livein to the tail but it's not live @@ -822,6 +838,8 @@ appendCopies(PredBB, CopyInfos, Copies); // Simplify + MachineBasicBlock *PredTBB, *PredFBB; + SmallVector PredCond; TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true); NumInstrDups += TailBB->size() - 1; // subtract one for removed branch @@ -841,14 +859,16 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this - // block into the prior block. Don't do this when ForcedLayoutPred is - // non-null, as it can break layout to remove blocks. - MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); + // block into the prior block. + MachineBasicBlock *PrevBB = ForcedLayoutPred; + if (!PrevBB) + PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. - if (ForcedLayoutPred == nullptr && PrevBB->succ_size() == 1 && + if (PrevBB->succ_size() == 1 && + *PrevBB->succ_begin() == TailBB && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { @@ -952,10 +972,15 @@ /// Remove the specified dead machine basic block from the function, updating /// the CFG. -void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) { +void TailDuplicator::removeDeadBlock( + MachineBasicBlock *MBB, + llvm::function_ref *RemovalCallback) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); + if (RemovalCallback) + (*RemovalCallback)(MBB); + // Remove all successors. 
while (!MBB->succ_empty()) MBB->removeSuccessor(MBB->succ_end() - 1); Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll =================================================================== --- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -664,12 +664,12 @@ ; No realignment in the prologue. ; CHECK-NOT: and ; CHECK-NOT: 0xffffffffffffffe0 -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; Stack is realigned in a non-entry BB. ; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret define void @realign_conditional2(i1 %b) { @@ -687,15 +687,15 @@ ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp ; Stack is realigned in a non-entry BB. 
; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/AArch64/arm64-atomic.ll =================================================================== --- test/CodeGen/AArch64/arm64-atomic.ll +++ test/CodeGen/AArch64/arm64-atomic.ll @@ -9,10 +9,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -27,10 +27,12 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -41,15 +43,15 @@ ; CHECK-LABEL: val_compare_and_swap_rel: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stlxr 
[[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -64,10 +66,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val Index: test/CodeGen/AArch64/arm64-ccmp.ll =================================================================== --- test/CodeGen/AArch64/arm64-ccmp.ll +++ test/CodeGen/AArch64/arm64-ccmp.ll @@ -51,7 +51,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: b.gt +; CHECK: b.le define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 @@ -78,7 +78,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: tbz +; CHECK: tbnz define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 Index: test/CodeGen/AArch64/arm64-extload-knownzero.ll =================================================================== --- test/CodeGen/AArch64/arm64-extload-knownzero.ll +++ test/CodeGen/AArch64/arm64-extload-knownzero.ll @@ -12,7 +12,6 @@ %tmp2 = load i16, i16* %ptr, align 2 br label %bb2 bb2: -; CHECK: %bb2 ; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff ; CHECK: cmp [[REG]], #23 %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ] Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- 
test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -10,9 +10,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp w0, w1 -; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: b.lt [[PROLOGUE_LABEL:LBB[0-9_]+]] +; ENABLE: ret ; ; Prologue code. +; ENABLE: [[PROLOGUE_LABEL]]: ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16] ; CHECK-NEXT: add [[SAVE_SP]], sp, #16 @@ -37,7 +39,6 @@ ; CHECK-NEXT: add sp, sp, #32 ; ; With shrink-wrapping, exit block is a simple return. -; ENABLE: [[EXIT_LABEL]]: ; CHECK-NEXT: ret define i32 @foo(i32 %a, i32 %b) { %tmp = alloca i32, align 4 Index: test/CodeGen/AArch64/fcmp.ll =================================================================== --- test/CodeGen/AArch64/fcmp.ll +++ test/CodeGen/AArch64/fcmp.ll @@ -31,7 +31,7 @@ %tst4 = fcmp uge float %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) @@ -70,7 +70,7 @@ %tst4 = fcmp uge double %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) Index: test/CodeGen/AArch64/rm_redundant_cmp.ll =================================================================== --- test/CodeGen/AArch64/rm_redundant_cmp.ll +++ test/CodeGen/AArch64/rm_redundant_cmp.ll @@ -13,7 +13,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2 %1 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2 @@ -69,7 +69,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* 
getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2 %1 = load i16, i16* getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2 @@ -134,7 +134,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2 @@ -190,7 +190,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2 Index: test/CodeGen/AArch64/tbz-tbnz.ll =================================================================== --- test/CodeGen/AArch64/tbz-tbnz.ll +++ test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -46,7 +46,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -64,7 +64,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -82,7 +82,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; 
CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -100,7 +100,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -162,7 +162,7 @@ br i1 %tst4, label %if.then4, label %if.end ; CHECK: tst x0, x1, lsl #62 -; CHECK: b.lt +; CHECK: b.ge if.then4: call void @t() @@ -178,7 +178,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -247,7 +247,7 @@ ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -262,7 +262,7 @@ br i1 %cond, label %if.end, label %if.then ; CHECK-NOT: and -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -278,7 +278,7 @@ br i1 %cond1, label %if.then, label %if.end ; CHECK-NOT: movn -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -296,7 +296,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: lsl -; CHECK: tbnz w0, #2 +; CHECK: tbz w0, #2 then: call void @t() @@ -314,7 +314,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: lsr -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() @@ -331,7 +331,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: asr -; CHECK: tbnz w0, 
#31 +; CHECK: tbz w0, #31 then: call void @t() @@ -350,7 +350,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: ubfx -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -437,11 +437,12 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}} ; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: {{^}}[[SUCCESS]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: ; GCN: s_endpgm define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -9,9 +9,10 @@ ; GCN: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc -; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[0-9A-Za-z_]+]] +; GCN: s_endpgm +; GCN: [[SUCCESS]]: ; GCN: buffer_store_dword -; GCN: [[EXIT]]: ; GCN: s_endpgm define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -121,9 +121,10 @@ ; be selected for the SALU and then later moved to the VALU. 
; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: @@ -146,9 +147,10 @@ ; be selected for the SALU and then later moved to the VALU. ; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: @@ -231,9 +233,10 @@ ; SI-LABEL: {{^}}icmp_2_users: ; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]] +; SI: s_cbranch_scc0 [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: [[SUCCESS]]: ; SI: buffer_store_dword -; SI: [[LABEL]]: ; SI: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: @@ -255,9 +258,10 @@ ; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]] ; SI: s_and_b64 vcc, exec, [[MASK]] -; SI: s_cbranch_vccnz [[EXIT]] +; SI: s_cbranch_vccz [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: {{^}}[[SUCCESS]]: ; SI: buffer_store -; SI: {{^}}[[EXIT]]: ; SI: s_endpgm define void @icmp_users_different_blocks(i32 %cond, i32 addrspace(1)* %out) { bb: @@ -333,13 +337,14 @@ ; SI-LABEL: {{^}}divergent_inside_uniform: ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_scc0 [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; 
SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; SI: buffer_store_dword [[ONE]] -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: @@ -368,10 +373,11 @@ ; SI: buffer_store_dword [[ONE]] ; SI: s_or_b64 exec, exec, [[MASK]] ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; SI: s_cbranch_scc0 [[THREE:[A-Z0-9_]+]] +; SI: s_endpgm +; SI: [[THREE]]: ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; SI: buffer_store_dword [[TWO]] -; SI: [[EXIT]]: ; SI: s_endpgm define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-crash.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-crash.ll +++ test/CodeGen/AMDGPU/uniform-crash.ll @@ -3,9 +3,10 @@ ; GCN-LABEL: {{^}}icmp_2_users: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_cbranch_scc0 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_endpgm ; GCN: [[LABEL]]: -; GCN-NEXT: s_endpgm +; GCN: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -116,9 +116,13 @@ ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] ; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] +; SI: s_branch [[LABEL_PREHEADER:BB[0-9]+_[0-9]+]] + +; SI: [[LABEL_EXIT]]: +; SI: s_endpgm ; Initialize inner condition to false -; SI: ; BB#1: +; SI: [[LABEL_PREHEADER]]: ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] @@ -146,8 +150,6 @@ ; SI: BB#5 ; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: [[LABEL_EXIT]]: ; SI-NOT: 
[[COND_STATE]] ; SI: s_endpgm Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -82,12 +82,12 @@ ;CHECK-NEXT: ; %main_body ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: image_sample ;CHECK: %ELSE ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] ;CHECK: store ;CHECK: s_mov_b64 exec, [[SAVED]] -;CHECK: %IF -;CHECK: image_sample define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 Index: test/CodeGen/ARM/2013-05-05-IfConvertBug.ll =================================================================== --- test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -112,15 +112,17 @@ ; CHECK-NEXT: subs [[REG:r[0-9]+]], #120 ; CHECK-NEXT: cmp [[REG]], r1 ; CHECK-NOT: it lt -; CHECK-NEXT: bge [[LABEL:.+]] +; CHECK-NEXT: blt [[LABEL:.+]] ; Next BB +; CHECK: subs r0, r1, r0 +; CHECK-NEXT: bx lr +; Next BB +; CHECK: [[LABEL]]: ; CHECK-NOT: cmplt ; CHECK: cmp r0, #119 ; CHECK-NEXT: itt le ; CHECK-NEXT: addle r0, r1, #1 ; CHECK-NEXT: bxle lr -; Next BB -; CHECK: [[LABEL]]: ; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: bx lr Index: test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- test/CodeGen/ARM/arm-shrink-wrapping.ll +++ test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -23,9 +23,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp r0, r1 -; ENABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ENABLE: bx lr ; ; Prologue code. +; ENABLE: [[SUCCESS_LABEL]]: ; CHECK: push {r7, lr} ; CHECK-NEXT: mov r7, sp ;; @@ -33,8 +35,12 @@ ; After the prologue is set. 
; DISABLE: sub sp ; DISABLE: cmp r0, r1 -; DISABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; DISABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ARM-DISABLE: mov sp, r7 +; THUMB-DISABLE: add sp, +; DISABLE-NEXT: pop {r7, pc} ; +; DISABLE: [[SUCCESS_LABEL]]: ; Store %a in the alloca. ; ARM-ENABLE: push {r0} ; THUMB-ENABLE: str r0, [sp, #-4] @@ -50,9 +56,8 @@ ; THUMB-ENABLE-NEXT: add sp, #4 ; ENABLE-NEXT: pop{{(\.w)?}} {r7, lr} ; -; CHECK: [[EXIT_LABEL]]: -; -; Without shrink-wrapping, epilogue is in the exit block. +; Late stage tail-duplication removes the exit label with shrink-wrapping. +; Without shrink-wrapping, epilogue is before the return. ; Epilogue code. (What we pop does not matter.) ; ARM-DISABLE: mov sp, r7 ; THUMB-DISABLE: add sp, @@ -388,9 +393,9 @@ ; ; Next BB. ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; CHECK: add{{(\.w)?}} r4, r4, #1 +; ARM-DAG: subs [[IV]], [[IV]], #1 +; THUMB-DAG: subs [[IV]], #1 +; CHECK-DAG: add{{(\.w)?}} r4, r4, #1 ; CHECK: bne [[LOOP]] ; ; Next BB. 
Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -72,11 +72,11 @@ ; CHECK-ARMV7-NEXT: mov [[RES:r[0-9]+]], #1 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 ; CHECK-ARMV7-NEXT: bne [[TRY]] -; CHECK-ARMV7-NEXT: b [[END:.LBB[0-9_]+]] +; CHECK-ARMV7-NEXT: mov r0, [[RES]] +; CHECK-ARMV7-NEXT: bx lr ; CHECK-ARMV7-NEXT: [[FAIL]]: ; CHECK-ARMV7-NEXT: clrex ; CHECK-ARMV7-NEXT: mov [[RES]], #0 -; CHECK-ARMV7-NEXT: [[END]]: ; CHECK-ARMV7-NEXT: mov r0, [[RES]] ; CHECK-ARMV7-NEXT: bx lr Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -297,10 +297,10 @@ ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr Index: test/CodeGen/ARM/atomic-ops-v8.ll =================================================================== --- test/CodeGen/ARM/atomic-ops-v8.ll +++ test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ ; function there. 
; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. 
; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it should. -; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} Index: test/CodeGen/ARM/machine-cse-cmp.ll =================================================================== --- test/CodeGen/ARM/machine-cse-cmp.ll +++ test/CodeGen/ARM/machine-cse-cmp.ll @@ -52,7 +52,7 @@ ; CHECK-LABEL: f3: ; CHECK-NOT: sub ; CHECK: cmp -; CHECK: blt +; CHECK: bge %0 = load i32, i32* %offset, align 4 %cmp = icmp slt i32 %0, %size %s = sub nsw i32 %0, %size Index: test/CodeGen/Mips/llvm-ir/ashr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/ashr.ll +++ test/CodeGen/Mips/llvm-ir/ashr.ll @@ -96,20 +96,23 @@ ; M2: srav $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: nop + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] + ; M2: beqz $[[T1]], $[[EXIT]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] - ; M2: nop - ; M2: sra $2, $4, 31 
; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: sra $2, $4, 31 ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -180,20 +183,23 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: bnez $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: nop + ; M3: $[[EXIT:BB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: $[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] + ; M3: beqz $[[T3]], $[[EXIT]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: $[[BB0]]: - ; M3: beqz $[[T3]], $[[BB1:BB[0-9_]+]] - ; M3: nop - ; M3: dsra $2, $4, 63 ; M3: $[[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: dsra $2, $4, 63 ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/lshr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/lshr.ll +++ test/CodeGen/Mips/llvm-ir/lshr.ll @@ -94,20 +94,24 @@ ; M2: srlv $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $2, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $2, $zero, 0 - ; M2: move $2, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $2, $[[T0]] ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -171,20 +175,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: 
dsrlv $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: beqz $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: daddiu $2, $zero, 0 + ; M3: $[[EXIT:BB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: $[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: $[[BB0]]: - ; M3: bnez $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: bnez $[[T3]], $[[EXIT]] ; M3: daddiu $2, $zero, 0 - ; M3: move $2, $[[T1]] ; M3: $[[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $2, $[[T1]] ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/shl.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/shl.ll +++ test/CodeGen/Mips/llvm-ir/shl.ll @@ -110,20 +110,24 @@ ; M2: sllv $[[T0:[0-9]+]], $5, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $2, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $3, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: sllv $[[T2:[0-9]+]], $4, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: srl $[[T4:[0-9]+]], $5, 1 ; M2: srlv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $2, $[[T2]], $[[T3]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $3, $zero, 0 - ; M2: move $3, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $3, $[[T0]] ; 32R1-R5: sllv $[[T0:[0-9]+]], $4, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -187,20 +191,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsllv $[[T1:[0-9]+]], $5, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] ; M3: 
move $2, $[[T1]] + ; M3: beqz $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: daddiu $3, $zero, 0 + ; M3: $[[EXIT:BB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: $[[BB0]]: ; M3: dsllv $[[T4:[0-9]+]], $4, $7 ; M3: dsrl $[[T5:[0-9]+]], $5, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsrlv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $2, $[[T4]], $[[T7]] - ; M3: $[[BB0]]: - ; M3: bnez $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: bnez $[[T3]], $[[EXIT]] ; M3: daddiu $3, $zero, 0 - ; M3: move $3, $[[T1]] ; M3: $[[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $3, $[[T1]] ; GP64-NOT-R6: dsllv $[[T0:[0-9]+]], $4, $7 ; GP64-NOT-R6: dsrl $[[T1:[0-9]+]], $5, 1 Index: test/CodeGen/Mips/longbranch.ll =================================================================== --- test/CodeGen/Mips/longbranch.ll +++ test/CodeGen/Mips/longbranch.ll @@ -76,7 +76,7 @@ ; Check the MIPS64 version. ; N64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1))) -; N64: bnez $4, $[[BB0:BB[0-9_]+]] +; N64: beqz $4, $[[BB0:BB[0-9_]+]] ; N64: daddu $[[R1:[0-9]+]], $[[R0]], $25 ; Check for long branch expansion: @@ -93,13 +93,14 @@ ; N64-NEXT: daddiu $sp, $sp, 16 ; N64: $[[BB0]]: +; N64: jr $ra +; N64: nop +; N64: $[[BB2]]: ; N64: daddiu $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1))) ; N64: ld $[[R2:[0-9]+]], %got_disp(x)($[[GP]]) ; N64: addiu $[[R3:[0-9]+]], $zero, 1 -; N64: sw $[[R3]], 0($[[R2]]) -; N64: $[[BB2]]: ; N64: jr $ra -; N64: nop +; N64: sw $[[R3]], 0($[[R2]]) ; Check the microMIPS version. 
Index: test/CodeGen/PowerPC/bdzlr.ll =================================================================== --- test/CodeGen/PowerPC/bdzlr.ll +++ test/CodeGen/PowerPC/bdzlr.ll @@ -53,13 +53,15 @@ ; CHECK: @lua_xmove ; CHECK: bnelr -; CHECK: bnelr +; CHECK: beq +; CHECK: blr ; CHECK: bdzlr ; CHECK-NOT: blr ; CHECK-CRB: @lua_xmove ; CHECK-CRB: bclr 12, -; CHECK-CRB: bclr 12, +; CHECK-CRB: bc 4, +; CHECK-CRB: blr ; CHECK-CRB: bdzlr ; CHECK-CRB-NOT: blr } Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -4,9 +4,9 @@ target triple = "powerpc-apple-darwin8.7.0" ;CHECK-LABEL: foo: -;CHECK: b LBB0_16 -;CHECK: b LBB0_14 -;CHECK: b LBB0_14 +;CHECK: b LBB0_15 +;CHECK: b LBB0_13 +;CHECK: b LBB0_13 ;CHECK-NOT: b LBB define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { Index: test/CodeGen/PowerPC/sjlj.ll =================================================================== --- test/CodeGen/PowerPC/sjlj.ll +++ test/CodeGen/PowerPC/sjlj.ll @@ -74,24 +74,24 @@ ; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill ; CHECK-DAG: std 1, 16([[REGA]]) ; CHECK-DAG: std 2, 24([[REGA]]) -; CHECK: bcl 20, 31, .LBB1_5 +; CHECK: bcl 20, 31, .LBB1_2 ; CHECK: li 3, 1 -; CHECK: #EH_SjLj_Setup .LBB1_5 +; CHECK: #EH_SjLj_Setup .LBB1_2 ; CHECK: b .LBB1_1 -; CHECK: .LBB1_4: +; CHECK: .LBB1_2: +; CHECK: mflr [[REGL:[0-9]+]] +; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload +; CHECK: std [[REGL]], 8([[REG2]]) +; CHECK: li 3, 0 + +; CHECK: .LBB1_5: ; CHECK: lfd ; CHECK: lvx ; CHECK: ld ; CHECK: blr -; CHECK: .LBB1_5: -; CHECK: mflr [[REGL:[0-9]+]] -; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload -; CHECK: std [[REGL]], 8([[REG2]]) -; CHECK: li 3, 0 - ; CHECK-NOAV: @main ; CHECK-NOAV-NOT: stvx ; CHECK-NOAV: bcl Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- 
test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,4 +1,4 @@ -; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +; RUN: llc -O2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" @@ -7,19 +7,16 @@ ;CHECK: # %test1 ;CHECK-NEXT: andi. {{[0-9]+}}, 4, 1 ;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: # %test2 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 ;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 29, 29 -;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: bne 0, [[OPT3LABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 -;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit -;CHECK-NEXT: std 5, 0(3) -;CHECK-NEXT: std 6, 8(3) -;CHECK-NEXT: blr +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: b [[OPT4LABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[OPT1LABEL]] ;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 ;CHECK-NEXT: beq 0, [[TEST3LABEL]] @@ -30,7 +27,10 @@ ;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 ;CHECK-NEXT: beq 0, [[EXITLABEL]] ;CHECK-NEXT: [[OPT4LABEL]] -;CHECK: b [[EXITLABEL]] +;CHECK: [[EXITLABEL]]: # %exit +;CHECK-NEXT: std 5, 0(3) +;CHECK-NEXT: std 6, 8(3) +;CHECK-NEXT: blr define void @f(%struct.ptrpair* noalias nocapture sret %result, i32 %tag, i8* %source1, i8* %sink1) { entry: Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! 
Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! %entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. 
+; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. ; CHECK-LABEL: foo: Index: test/CodeGen/Thumb2/cbnz.ll =================================================================== --- test/CodeGen/Thumb2/cbnz.ll +++ test/CodeGen/Thumb2/cbnz.ll @@ -26,7 +26,7 @@ call void @x() call void @x() call void @x() - ; CHECK: cbnz + ; CHECK: cbz %q = icmp eq i32 %y, 0 br i1 %q, label %t2, label %f Index: test/CodeGen/Thumb2/ifcvt-compare.ll =================================================================== --- test/CodeGen/Thumb2/ifcvt-compare.ll +++ test/CodeGen/Thumb2/ifcvt-compare.ll @@ -4,7 +4,7 @@ define void @f0(i32 %x) optsize { ; CHECK-LABEL: f0: - ; CHECK: cbnz + ; CHECK: cbz %p = icmp eq i32 %x, 0 br i1 %p, label %t, label %f Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s ; Test memcpy, memmove, and memset intrinsics. 
Index: test/CodeGen/X86/2012-08-17-legalizer-crash.ll =================================================================== --- test/CodeGen/X86/2012-08-17-legalizer-crash.ll +++ test/CodeGen/X86/2012-08-17-legalizer-crash.ll @@ -26,5 +26,5 @@ ret void ; CHECK-LABEL: fn1: -; CHECK: jb +; CHECK: jae } Index: test/CodeGen/X86/atom-bypass-slow-division.ll =================================================================== --- test/CodeGen/X86/atom-bypass-slow-division.ll +++ test/CodeGen/X86/atom-bypass-slow-division.ll @@ -47,8 +47,8 @@ ; CHECK-LABEL: Test_use_div_and_idiv: ; CHECK: idivl ; CHECK: divb -; CHECK: divl ; CHECK: divb +; CHECK: divl ; CHECK: addl ; CHECK: ret %resultidiv = sdiv i32 %a, %b Index: test/CodeGen/X86/avx-splat.ll =================================================================== --- test/CodeGen/X86/avx-splat.ll +++ test/CodeGen/X86/avx-splat.ll @@ -62,8 +62,10 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ## implicit-def: %YMM0 ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne LBB4_2 -; CHECK-NEXT: ## BB#1: ## %load.i1247 +; CHECK-NEXT: je LBB4_1 +; CHECK-NEXT: ## BB#2: ## %__load_and_broadcast_32.exit1249 +; CHECK-NEXT: retq +; CHECK-NEXT: LBB4_1: ## %load.i1247 ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: andq $-32, %rsp @@ -71,7 +73,6 @@ ; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp -; CHECK-NEXT: LBB4_2: ## %__load_and_broadcast_32.exit1249 ; CHECK-NEXT: retq allocas: %udx495 = alloca [18 x [18 x float]], align 32 Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -69,13 +69,14 @@ ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vucomiss %xmm1, %xmm0 ; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jnp LBB3_2 +; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: ## BB#2: ## %return +; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## %if.end ; ALL-NEXT: seta %al ; 
ALL-NEXT: movzbl %al, %eax ; ALL-NEXT: leaq {{.*}}(%rip), %rcx ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: LBB3_2: ## %return ; ALL-NEXT: retq entry: %cmp = fcmp oeq float %p, 0.000000e+00 Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -306,7 +306,7 @@ define void @unnatural_cfg1() { ; Test that we can handle a loop with an inner unnatural loop at the end of ; a function. This is a gross CFG reduced out of the single source GCC. -; CHECK: unnatural_cfg1 +; CHECK-LABEL: unnatural_cfg1 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 @@ -344,7 +344,7 @@ ; Test that we can handle a loop with a nested natural loop *and* an unnatural ; loop. This was reduced from a crash on block placement when run over ; single-source GCC. -; CHECK: unnatural_cfg2 +; CHECK-LABEL: unnatural_cfg2 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 @@ -551,7 +551,7 @@ ; didn't correctly locate the fallthrough successor, assuming blindly that the ; first one was the fallthrough successor. As a result, we would add an ; erroneous jump to the landing pad thinking *that* was the default successor. -; CHECK: test_eh_lpad_successor +; CHECK-LABEL: test_eh_lpad_successor ; CHECK: %entry ; CHECK-NOT: jmp ; CHECK: %loop @@ -579,7 +579,7 @@ ; fallthrough simply won't occur. Make sure we don't crash trying to update ; terminators for such constructs. ; -; CHECK: test_eh_throw +; CHECK-LABEL: test_eh_throw ; CHECK: %entry ; CHECK: %cleanup @@ -601,7 +601,7 @@ ; attempt to merge onto the wrong end of the inner loop just because we find it ; first. This was reduced from a crasher in GCC's single source. 
; -; CHECK: test_unnatural_cfg_backwards_inner_loop +; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop ; CHECK: %entry ; CHECK: %loop2b ; CHECK: %loop1 @@ -641,7 +641,7 @@ ; fallthrough because that happens to always produce unanalyzable branches on ; x86. ; -; CHECK: unanalyzable_branch_to_loop_header +; CHECK-LABEL: unanalyzable_branch_to_loop_header ; CHECK: %entry ; CHECK: %loop ; CHECK: %exit @@ -665,7 +665,7 @@ ; This branch is now analyzable and hence the destination block becomes the ; hotter one. The right order is entry->bar->exit->foo. ; -; CHECK: unanalyzable_branch_to_best_succ +; CHECK-LABEL: unanalyzable_branch_to_best_succ ; CHECK: %entry ; CHECK: %bar ; CHECK: %exit @@ -691,12 +691,13 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the best free block in the CFG. ; -; CHECK: unanalyzable_branch_to_free_block +; CHECK-LABEL: unanalyzable_branch_to_free_block ; CHECK: %entry ; CHECK: %a ; CHECK: %b -; CHECK: %c ; CHECK: %exit +; CHECK: %c +; CHECK: retl entry: br i1 undef, label %a, label %b @@ -721,7 +722,7 @@ ; Ensure that we don't crash as we're building up many unanalyzable branches, ; blocks, and loops. ; -; CHECK: many_unanalyzable_branches +; CHECK-LABEL: many_unanalyzable_branches ; CHECK: %entry ; CHECK: %exit @@ -940,7 +941,7 @@ ; strange layouts that are siginificantly less efficient, often times maing ; it discontiguous. ; -; CHECK: @benchmark_heapsort +; CHECK-LABEL: @benchmark_heapsort ; CHECK: %entry ; First rotated loop top. 
; CHECK: .p2align Index: test/CodeGen/X86/cmovcmov.ll =================================================================== --- test/CodeGen/X86/cmovcmov.ll +++ test/CodeGen/X86/cmovcmov.ll @@ -192,7 +192,7 @@ ; CMOV-NEXT: retq ; NOCMOV: jne -; NOCMOV-NEXT: jp +; NOCMOV-NEXT: jnp define float @test_zext_fcmp_une(float %a, float %b) #0 { entry: %cmp = fcmp une float %a, %b @@ -214,7 +214,7 @@ ; CMOV-NEXT: retq ; NOCMOV: jne -; NOCMOV-NEXT: jp +; NOCMOV-NEXT: jnp define float @test_zext_fcmp_oeq(float %a, float %b) #0 { entry: %cmp = fcmp oeq float %a, %b Index: test/CodeGen/X86/critical-edge-split-2.ll =================================================================== --- test/CodeGen/X86/critical-edge-split-2.ll +++ test/CodeGen/X86/critical-edge-split-2.ll @@ -24,6 +24,7 @@ ; CHECK-LABEL: test1: ; CHECK: testb %dil, %dil -; CHECK: jne LBB0_2 +; CHECK: je LBB0_1 +; CHECK: retq +; CHECK: LBB0_1: ; CHECK: divl -; CHECK: LBB0_2: Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -56,11 +56,11 @@ ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: jp .LBB1_1 -; CHECK-NEXT: .LBB1_2: # %bb2 +; CHECK-NEXT: # BB#2: # %bb2 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %bb1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: retq entry: %mul = fmul double %x, %y Index: test/CodeGen/X86/ragreedy-bug.ll =================================================================== --- test/CodeGen/X86/ragreedy-bug.ll +++ test/CodeGen/X86/ragreedy-bug.ll @@ -6,13 +6,13 @@ ; CHECK: isupper.exit ; CHECK-NEXT: in Loop ; CHECK-NEXT: testl -; CHECK-NEXT: jne +; CHECK-NEXT: je +; CHECK: maskrune ; CHECK: isupper.exit ; CHECK-NEXT: in Loop ; CHECK-NEXT: testl ; CHECK-NEXT: je ; CHECK: maskrune -; CHECK: maskrune %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } %struct.Connector_struct = 
type { i16, i16, i8, i8, %struct.Connector_struct*, i8* } Index: test/CodeGen/X86/shrink-wrap-chkstk.ll =================================================================== --- test/CodeGen/X86/shrink-wrap-chkstk.ll +++ test/CodeGen/X86/shrink-wrap-chkstk.ll @@ -62,11 +62,12 @@ ; CHECK-LABEL: @use_eax_before_prologue@8: # @use_eax_before_prologue ; CHECK: movl %ecx, %eax ; CHECK: cmpl %edx, %eax -; CHECK: jge LBB1_2 +; CHECK: jl LBB1_1 +; CHECK: retl +; CHECK: LBB1_1 ; CHECK: pushl %eax ; CHECK: movl $4092, %eax ; CHECK: calll __chkstk ; CHECK: movl 4092(%esp), %eax ; CHECK: calll _doSomething -; CHECK: LBB1_2: ; CHECK: retl Index: test/CodeGen/X86/statepoint-invoke.ll =================================================================== --- test/CodeGen/X86/statepoint-invoke.ll +++ test/CodeGen/X86/statepoint-invoke.ll @@ -89,6 +89,7 @@ left.relocs: ; CHECK: movq (%rsp), ; CHECK: movq 8(%rsp), [[REGVAL2:%[a-z]+]] + ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}} %val1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13) %val2.relocated_left = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14) br label %normal_return @@ -104,13 +105,13 @@ right.relocs: ; CHECK: movq (%rsp), [[REGVAL2]] ; CHECK: movq + ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}} %val2.relocated_right = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 13, i32 13) %val3.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 14, i32 14) br label %normal_return normal_return: ; CHECK-LABEL: %normal_return - ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}} ; CHECK: retq %a1 = phi i64 addrspace(1)* [%val1.relocated, %left.relocs], [%val3.relocated, %right.relocs] %a2 = phi i64 addrspace(1)* [%val2.relocated_left, %left.relocs], [%val2.relocated_right, %right.relocs] Index: test/CodeGen/X86/twoaddr-coalesce-3.ll 
=================================================================== --- test/CodeGen/X86/twoaddr-coalesce-3.ll +++ test/CodeGen/X86/twoaddr-coalesce-3.ll @@ -19,7 +19,7 @@ ; Check that only one mov will be generated in the kernel loop. ; CHECK-LABEL: foo: -; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]] ; CHECK-NOT: mov @@ -56,7 +56,7 @@ ; Check that only two mov will be generated in the kernel loop. ; CHECK-LABEL: goo: -; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]] ; CHECK-NOT: mov Index: test/CodeGen/X86/x86-shrink-wrap-unwind.ll =================================================================== --- test/CodeGen/X86/x86-shrink-wrap-unwind.ll +++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll @@ -24,7 +24,9 @@ ; After the prologue is set. ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]] ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]] -; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]] +; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]] +; CHECK: popq +; CHECK-NEXT: retq ; ; Store %a in the alloca. ; CHECK: movl [[ARG0CPY]], 4(%rsp) @@ -33,14 +35,9 @@ ; Set the first argument to zero. ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: callq _doSomething -; -; CHECK: [[EXIT_LABEL]]: -; -; Without shrink-wrapping, epilogue is in the exit block. -; Epilogue code. (What we pop does not matter.) ; CHECK-NEXT: popq -; ; CHECK-NEXT: retq +; define i32 @framelessUnwind(i32 %a, i32 %b) #0 { %tmp = alloca i32, align 4 %tmp2 = icmp slt i32 %a, %b @@ -70,9 +67,11 @@ ; After the prologue is set. ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]] ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]] -; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]] +; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]] +; CHECK: retq ; ; Prologue code. 
+; CHECK-NEXT: [[SUCCESS_LABEL]] ; CHECK: pushq %rbp ; CHECK: movq %rsp, %rbp ; @@ -86,9 +85,8 @@ ; ; Epilogue code. (What we pop does not matter.) ; CHECK: popq %rbp -; -; CHECK: [[EXIT_LABEL]]: ; CHECK-NEXT: retq +; define i32 @frameUnwind(i32 %a, i32 %b) #1 { %tmp = alloca i32, align 4 %tmp2 = icmp slt i32 %a, %b @@ -116,10 +114,12 @@ ; After the prologue is set. ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]] ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]] -; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]] +; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]] +; CHECK: retq ; ; Prologue code. ; (What we push does not matter. It should be some random sratch register.) +; CHECK-NEXT: [[SUCCESS_LABEL]] ; CHECK: pushq ; ; Store %a in the alloca. @@ -132,8 +132,6 @@ ; ; Epilogue code. ; CHECK-NEXT: addq -; -; CHECK: [[EXIT_LABEL]]: ; CHECK-NEXT: retq define i32 @framelessnoUnwind(i32 %a, i32 %b) #2 { %tmp = alloca i32, align 4 Index: test/CodeGen/X86/x86-shrink-wrapping.ll =================================================================== --- test/CodeGen/X86/x86-shrink-wrapping.ll +++ test/CodeGen/X86/x86-shrink-wrapping.ll @@ -18,18 +18,24 @@ ; No prologue needed. ; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]] ; ENABLE-NEXT: cmpl %esi, [[ARG0CPY]] -; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]] +; ENABLE: retq ; ; Prologue code. ; (What we push does not matter. It should be some random sratch register.) +; ENABLE: [[SUCCESS_LABEL]]: ; CHECK: pushq ; ; Compare the arguments and jump to exit. ; After the prologue is set. ; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]] ; DISABLE-NEXT: cmpl %esi, [[ARG0CPY]] -; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]] +; DISABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]] ; +; DISABLE: popq +; DISABLE-NEXT: retq + +; DISABLE: [[SUCCESS_LABEL]]: ; Store %a in the alloca. ; CHECK: movl [[ARG0CPY]], 4(%rsp) ; Set the alloca address in the second argument. @@ -37,17 +43,11 @@ ; Set the first argument to zero. 
; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: callq _doSomething -; ; With shrink-wrapping, epilogue is just after the call. ; ENABLE-NEXT: addq $8, %rsp -; -; CHECK: [[EXIT_LABEL]]: -; -; Without shrink-wrapping, epilogue is in the exit block. -; Epilogue code. (What we pop does not matter.) ; DISABLE-NEXT: popq -; ; CHECK-NEXT: retq + define i32 @foo(i32 %a, i32 %b) { %tmp = alloca i32, align 4 %tmp2 = icmp slt i32 %a, %b