Index: include/llvm/Analysis/LoopInfoImpl.h =================================================================== --- include/llvm/Analysis/LoopInfoImpl.h +++ include/llvm/Analysis/LoopInfoImpl.h @@ -185,8 +185,13 @@ template void LoopBase:: addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase &LIB) { - assert((Blocks.empty() || LIB[getHeader()] == this) && - "Incorrect LI specified for this loop!"); +#ifndef NDEBUG + if (!Blocks.empty()) { + auto SameHeader = LIB[getHeader()]; + assert(contains(SameHeader) && getHeader() == SameHeader->getHeader() + && "Incorrect LI specified for this loop!"); + } +#endif assert(NewBB && "Cannot add a null basic block to the loop!"); assert(!LIB[NewBB] && "BasicBlock already in the loop!"); Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -306,10 +306,21 @@ /// must be done inline. TailDuplicator TailDup; - /// \brief A set of blocks that are unavoidably execute, i.e. they dominate - /// all terminators of the MachineFunction. + /// \brief A set of blocks that are unavoidably executed. + /// + /// i.e. they dominate + /// all terminators of the MachineFunction. Also used within loops for blocks + /// that are unavoidable within the loop. SmallPtrSet UnavoidableBlocks; + /// \brief A set of delayed blocks for tail-duplication. + /// + /// These blocks form a second spine through a loop/function, and so + /// predecessors within this set do not need to be able to placed. + /// This allows the tail-duplicated spine (or similar cfg) to grow beyond + /// 2 blocks. See the description of canTailDuplicateAllPreds. + SmallPtrSet TailDupDelayBlocks; + /// \brief Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -389,12 +400,25 @@ const BlockFilterSet &LoopBlockSet); void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet); - void collectMustExecuteBBs(); void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Compute the set of blocks that are unavoidable within a loop's sub-CFG + void computeLoopUnavoidableBlocks(MachineLoop &L); + /// Compute the set of blocks that are unavoidable within a function. + void computeUnavoidableBlocks(); + /// See if Succ can tail-duplicate into all un-placed, un-filtered + /// predecessors. Excludes predecessors in TailDupDelayBlocks. + bool canTailDuplicateAllPreds(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); + /// Add all un-filtered unplaced blocks that will be duplicated into to the + /// delay set. 
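+  /// For example, once canTailDuplicateAllPreds has established that Succ can
+  /// be duplicated into its remaining unplaced predecessors P1 and P2 (besides
+  /// BB), P1 and P2 are inserted into TailDupDelayBlocks so the duplicated
+  /// second spine can keep growing through them.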
+ void delayTailDuplicatedBlocks(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); -public: + public: static char ID; // Pass identification, replacement for typeid MachineBlockPlacement() : MachineFunctionPass(ID) { initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry()); @@ -551,6 +575,85 @@ return SuccProb; } +static bool hasSameSuccessors( + MachineBasicBlock &BB, SmallPtrSetImpl &Successors) { + if (BB.succ_size() != Successors.size()) + return false; + // We don't want to count self-loops + if (Successors.count(&BB)) + return false; + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. In +/// addition we keep a set of blocks that have been tail-duplicated into and +/// allow those blocks to be unplaced as well. This allows the creation of a +/// second (larger) spine and a short fallthrough spine. +/// We also identify blocks with the CFG that would have been produced by +/// tail-duplication and lay them out in the same manner. +bool MachineBlockPlacement::canTailDuplicateAllPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + DEBUG(dbgs() << "Checking to see if block " << getBlockName(Succ) + << " can tail duplicate into all its predecessors.\n"); + bool IsSimple = TailDup.isSimpleBB(Succ); + + if (!TailDup.shouldTailDuplicate(*Succ->getParent(), IsSimple, *Succ)) { + DEBUG(dbgs() << "Skipping because it is " + << "not a candidate for duplication.\n"); + return false; + } + // For CFG checking. + SmallPtrSet Successors(BB->succ_begin(), BB->succ_end()); + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors are either part + // of the second spine, or can be tail-duplicated into. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + // If Pred is part of the growing second spine, we don't need to be + // able to copy succ onto the end of it. + if (TailDupDelayBlocks.count(Pred) > 0) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + DEBUG(dbgs() << "Possibly skipping because it can't be duplicated into block " + << getBlockName(Pred) << ".\n"); + // Check for #Successors > 1 to make sure we aren't just outlining in the + // triangle case. + if (Successors.size() > 1 + && hasSameSuccessors(*Pred, Successors)) { + DEBUG(dbgs() << "Not skipping because it looks like a tail-duplicated block.\n"); + continue; + } else { + DEBUG(dbgs() << "Skipping because it can't be duplicated into block " + << getBlockName(Pred) << ".\n"); + } + return false; + } + } + return true; +} + +/// Add all un-filtered unplaced blocks that will be duplicated into to the +/// delay set. 
+void MachineBlockPlacement::delayTailDuplicatedBlocks( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + for (MachineBasicBlock *Pred : Succ->predecessors()) { + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain + || TailDupDelayBlocks.count(Pred) > 0) + continue; + DEBUG(dbgs() << "Delaying block: " << getBlockName(Pred) << ".\n"); + TailDupDelayBlocks.insert(Pred); + } +} + /// When the option OutlineOptionalBranches is on, this method /// checks if the fallthrough candidate block \p Succ (of block /// \p BB) also has other unscheduled predecessor blocks which @@ -564,12 +667,20 @@ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb) { - if (!OutlineOptionalBranches) + if (!OutlineOptionalBranches && !TailDupPlacement) return false; // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other // successors must be optional. Don't do this for cold branches. if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { + bool TailDupDelay; + if (OutlineOptionalBranches) + TailDupDelay = false; + else if (TailDupPlacement + && canTailDuplicateAllPreds(BB, Succ, Chain, BlockFilter)) + TailDupDelay = true; + else + return false; for (MachineBasicBlock *Pred : Succ->predecessors()) { // Check whether there is an unplaced optional branch. if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || @@ -582,9 +693,11 @@ if (Pred->size() < OutlineOptionalThreshold) return false; } + if (TailDupDelay) + delayTailDuplicatedBlocks(BB, Succ, Chain, BlockFilter); return true; - } else - return false; + } + return false; } // When profile is not present, return the StaticLikelyProb. @@ -808,7 +921,9 @@ BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); - // This heuristic is off by default. + // Full outlinining is off by default. + // Tail-duplication during layout, and outlining blocks that are + // tail-duplicated into is on by default. if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, HotProb)) return Succ; @@ -1046,6 +1161,7 @@ // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; + TailDupDelayBlocks.erase(BestSucc); // Zero out UnscheduledPredecessors for the successor we're about to merge in case // we selected a successor that didn't fit naturally into the CFG. SuccChain.UnscheduledPredecessors = 0; @@ -1465,6 +1581,80 @@ return LoopBlockSet; } + +/// \brief Finds unavoidable blocks within a loop. +/// +/// These blocks form the loop spine, and knowing which blocks they are allow +/// the loop-optional blocks to be outlined to the end of the loop, +/// unconditionally or if they can form a second tail-duped spine. +void MachineBlockPlacement::computeLoopUnavoidableBlocks(MachineLoop &L) { + SmallVector Exits; + L.getLoopLatches(Exits); + // Find the nearest common dominator of all of L's latches. 
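+  // The computation below is a left fold of findNearestCommonDominator,
+  // first over the latches and then (further down) over the non-header
+  // exiting blocks. A block dominates the resulting Dominator if and only
+  // if it dominates every latch and exit considered, so the final filter
+  // marks exactly the blocks executed on every iteration (each iteration
+  // must reach a latch) and on every exit from the loop.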
+ MachineBasicBlock *Dominator = nullptr; + for (MachineBasicBlock *MBB : Exits) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is a latch.\n"); + if (Dominator == nullptr) + Dominator = MBB; + else + Dominator = MDT->findNearestCommonDominator(Dominator, MBB); + } + + Exits.clear(); + L.getExitingBlocks(Exits); + for (MachineBasicBlock *MBB : Exits) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is a loop exit.\n"); + if (MBB == L.getHeader()) + continue; + if (Dominator == nullptr) + Dominator = MBB; + else + Dominator = MDT->findNearestCommonDominator(Dominator, MBB); + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + if (MDT->dominates(MBB, Dominator)) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is loop un-avoidable.\n"); + UnavoidableBlocks.insert(MBB); + } +} + + +/// \brief Finds unavoidable blocks for the entire function +/// +/// These blocks form the spine, and knowing which blocks they are allow +/// the optional blocks to be outlined to the end of the function +/// unconditionally or if they can form a second tail-duped spine. +void MachineBlockPlacement::computeUnavoidableBlocks() { + MachineBasicBlock * Terminator = nullptr; + for (MachineBasicBlock &MBB : *F) { + if (MBB.succ_size() == 0) { + if (Terminator == nullptr) + Terminator = &MBB; + else + Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); + } + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + // If there are no exit blocks from the function, punt and assume that there + // are no unavoidable blocks. This will result in a linear layout. + if (Terminator == nullptr) + return; + for (MachineBasicBlock &MBB : *F) + if (MDT->dominates(&MBB, Terminator)) { + DEBUG(dbgs() << "Block: " << getBlockName(&MBB) + << " is un-avoidable.\n"); + UnavoidableBlocks.insert(&MBB); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -1481,6 +1671,13 @@ assert(EHPadWorkList.empty()); BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); + // Find the unavoidable blocks within this loop. This allows partial outlining + // with tail duplication within a loop. + if (TailDupPlacement) { + computeLoopUnavoidableBlocks(L); + TailDupDelayBlocks.clear(); + } + // Check if we have profile data for this function. If yes, we will rotate // this loop by modeling costs more precisely which requires the profile data // for better layout. @@ -1559,31 +1756,6 @@ EHPadWorkList.clear(); } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : *F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. 
- UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : *F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } -} - void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. @@ -1615,9 +1787,6 @@ } } - // Turned on with OutlineOptionalBranches option - collectMustExecuteBBs(); - // Build any loop-based chains. for (MachineLoop *L : *MLI) buildLoopChains(*L); @@ -1625,6 +1794,13 @@ assert(BlockWorkList.empty()); assert(EHPadWorkList.empty()); + // This must go after the loop chains, because the loop chains compute their + // own loop-relative UnavoidableBlocks + if (OutlineOptionalBranches || TailDupPlacement) { + computeUnavoidableBlocks(); + TailDupDelayBlocks.clear(); + } + SmallPtrSet UpdatedPreds; for (MachineBasicBlock &MBB : *F) fillWorkLists(&MBB, UpdatedPreds); @@ -1963,15 +2139,14 @@ /*CommonHoist=*/false, *MBFI, *MBPI); - DEBUG(MF.dump()); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable(), MLI, /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. - DEBUG(MF.dump()); BlockToChain.clear(); // Must redo the dominator tree if blocks were changed. MDT->runOnMachineFunction(MF); + BlockToChain.clear(); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll =================================================================== --- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -664,12 +664,12 @@ ; No realignment in the prologue. ; CHECK-NOT: and ; CHECK-NOT: 0xffffffffffffffe0 -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; Stack is realigned in a non-entry BB. ; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret define void @realign_conditional2(i1 %b) { @@ -687,15 +687,15 @@ ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp ; Stack is realigned in a non-entry BB. 
; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/AArch64/arm64-atomic.ll =================================================================== --- test/CodeGen/AArch64/arm64-atomic.ll +++ test/CodeGen/AArch64/arm64-atomic.ll @@ -9,10 +9,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -27,10 +27,12 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -41,15 +43,15 @@ ; CHECK-LABEL: val_compare_and_swap_rel: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -64,10 +66,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val Index: test/CodeGen/AArch64/arm64-ccmp.ll =================================================================== --- test/CodeGen/AArch64/arm64-ccmp.ll +++ test/CodeGen/AArch64/arm64-ccmp.ll @@ -51,7 +51,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: b.gt +; CHECK: b.le define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 @@ -78,7 +78,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: tbz +; CHECK: tbnz define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- 
test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -10,9 +10,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp w0, w1 -; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: b.lt [[PROLOGUE_LABEL:LBB[0-9_]+]] +; ENABLE: ret ; ; Prologue code. +; ENABLE: [[PROLOGUE_LABEL]]: ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16] ; CHECK-NEXT: add [[SAVE_SP]], sp, #16 @@ -37,7 +39,6 @@ ; CHECK-NEXT: add sp, sp, #32 ; ; With shrink-wrapping, exit block is a simple return. -; ENABLE: [[EXIT_LABEL]]: ; CHECK-NEXT: ret define i32 @foo(i32 %a, i32 %b) { %tmp = alloca i32, align 4 Index: test/CodeGen/AArch64/branch-relax-bcc.ll =================================================================== --- test/CodeGen/AArch64/branch-relax-bcc.ll +++ test/CodeGen/AArch64/branch-relax-bcc.ll @@ -35,22 +35,20 @@ ; CHECK-LABEL: _block_split: ; CHECK: cmp w0, #5 -; CHECK-NEXT: b.eq [[LONG_BR_BB:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: b [[LOR_LHS_FALSE_BB:LBB[0-9]+_[0-9]+]] - -; CHECK: [[LONG_BR_BB]]: +; CHECK-NEXT: b.ne [[LOR_LHS_FALSE_BB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: b [[IF_THEN_BB:LBB[0-9]+_[0-9]+]] ; CHECK: [[LOR_LHS_FALSE_BB]]: ; CHECK: cmp w{{[0-9]+}}, #16 ; CHECK-NEXT: b.le [[IF_THEN_BB]] -; CHECK-NEXT: b [[IF_END_BB:LBB[0-9]+_[0-9]+]] -; CHECK: [[IF_THEN_BB]]: +; CHECK: ; %if.end +; CHECK: #0x7 +; CHECK: ret + +; CHECK: [[IF_THEN_BB]] ; CHECK: bl _foo ; CHECK-NOT: b L - -; CHECK: [[IF_END_BB]]: ; CHECK: #0x7 ; CHECK: ret define i32 @block_split(i32 %a, i32 %b) #0 { Index: test/CodeGen/AArch64/combine-comparisons-by-cse.ll =================================================================== --- test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -264,9 +264,9 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ ; CHECK: cmn -; CHECK: b.gt +; CHECK-NEXT: b.le ; CHECK: cmp -; CHECK: b.gt +; CHECK-NEXT: b.le entry: %0 = load i32, i32* @a, align 4 %cmp4 = icmp slt i32 %0, -1 Index: test/CodeGen/AArch64/fcmp.ll =================================================================== --- test/CodeGen/AArch64/fcmp.ll +++ test/CodeGen/AArch64/fcmp.ll @@ -31,7 +31,7 @@ %tst4 = fcmp uge float %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) @@ -70,7 +70,7 @@ %tst4 = fcmp uge double %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) Index: test/CodeGen/AArch64/rm_redundant_cmp.ll =================================================================== --- test/CodeGen/AArch64/rm_redundant_cmp.ll +++ test/CodeGen/AArch64/rm_redundant_cmp.ll @@ -13,7 +13,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2 %1 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2 @@ -69,7 +69,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2 %1 = load i16, i16* 
getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2 @@ -134,7 +134,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2 @@ -190,7 +190,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2 Index: test/CodeGen/AArch64/tbz-tbnz.ll =================================================================== --- test/CodeGen/AArch64/tbz-tbnz.ll +++ test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -46,7 +46,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -64,7 +64,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -82,7 +82,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -100,7 +100,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -162,7 +162,7 @@ br i1 %tst4, label %if.then4, label %if.end ; CHECK: tst x0, x1, lsl #62 -; CHECK: b.lt +; CHECK: b.ge if.then4: call void @t() @@ -178,7 +178,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -247,7 +247,7 @@ ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -262,7 +262,7 @@ br i1 %cond, label %if.end, label %if.then ; CHECK-NOT: and -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -278,7 +278,7 @@ br i1 %cond1, label %if.then, label %if.end ; CHECK-NOT: movn -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -296,7 +296,7 @@ br i1 %cond, label %then, label %end ; 
CHECK-NOT: lsl -; CHECK: tbnz w0, #2 +; CHECK: tbz w0, #2 then: call void @t() @@ -314,7 +314,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: lsr -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() @@ -331,7 +331,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: asr -; CHECK: tbnz w0, #31 +; CHECK: tbz w0, #31 then: call void @t() @@ -350,7 +350,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: ubfx -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -2,11 +2,11 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s ; GCN-LABEL: {{^}}test_loop: -; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: +; GCN: s_endpgm +; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}} ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] -; GCN: s_endpgm define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { entry: %cmp = icmp eq i32 %n, -1 Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll =================================================================== --- test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -6,6 +6,8 @@ ; GCN: v_cmp_ne_i32_e64 ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: s_endpgm define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() @@ -26,9 +28,12 @@ ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: -; GCN: v_cmp_ne_i32_e64 +; GCN: s_endpgm ; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: v_cmp_ne_i32_e64 +; GCN: s_endpgm + define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -437,11 +437,12 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}} ; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: {{^}}[[SUCCESS]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: ; GCN: s_endpgm define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 Index: test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf.ll +++ test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -93,13 +93,13 @@ ; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]] ; SI: s_cmp_gt_i32 -; SI-NEXT: s_cbranch_scc1 [[ENDPGM]] - -; SI: [[INFLOOP:BB[0-9]+_[0-9]+]] -; SI: s_branch [[INFLOOP]] +; SI-NEXT: s_cbranch_scc0 [[INFLOOP:BB[0-9]+_[0-9]+]] ; SI: [[ENDPGM]]: ; SI: s_endpgm + +; SI: [[INFLOOP]] +; SI: s_branch [[INFLOOP]] define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { entry: %cmp = icmp sgt i32 %c0, 0 Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -268,14 +268,16 @@ ; CHECK: [[PHIBB]]: ; CHECK: 
v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]] ; CHECK: s_and_b64 vcc, exec, vcc -; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]] +; CHECK: s_cbranch_vccnz [[BB3:BB[0-9]+_[0-9]+]] -; CHECK: ; BB#3: ; %bb10 +; CHECK: ; %end +; CHECK-NEXT: s_endpgm + +; CHECK: [[BB3]]: ; %bb10 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9 ; CHECK: buffer_store_dword - -; CHECK: [[ENDBB]]: ; CHECK-NEXT: s_endpgm + define amdgpu_ps void @phi_use_def_before_kill() #0 { bb: %tmp = fadd float undef, 1.000000e+00 Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -9,9 +9,10 @@ ; GCN: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc -; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[0-9A-Za-z_]+]] +; GCN: s_endpgm +; GCN: [[SUCCESS]]: ; GCN: buffer_store_dword -; GCN: [[EXIT]]: ; GCN: s_endpgm define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -121,9 +121,10 @@ ; be selected for the SALU and then later moved to the VALU. ; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: @@ -146,9 +147,10 @@ ; be selected for the SALU and then later moved to the VALU. 
; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: @@ -231,9 +233,10 @@ ; SI-LABEL: {{^}}icmp_2_users: ; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]] +; SI: s_cbranch_scc0 [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: [[SUCCESS]]: ; SI: buffer_store_dword -; SI: [[LABEL]]: ; SI: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: @@ -255,9 +258,10 @@ ; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]] ; SI: s_and_b64 vcc, exec, [[MASK]] -; SI: s_cbranch_vccnz [[EXIT]] +; SI: s_cbranch_vccz [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: {{^}}[[SUCCESS]]: ; SI: buffer_store -; SI: {{^}}[[EXIT]]: ; SI: s_endpgm define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: @@ -334,13 +338,14 @@ ; SI-LABEL: {{^}}divergent_inside_uniform: ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_scc0 [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; SI: buffer_store_dword [[ONE]] -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: @@ -369,10 +374,11 @@ ; SI: buffer_store_dword [[ONE]] ; SI: s_or_b64 exec, exec, [[MASK]] ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; SI: s_cbranch_scc0 [[THREE:[A-Z0-9_]+]] +; SI: s_endpgm +; SI: [[THREE]]: ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; SI: buffer_store_dword [[TWO]] -; SI: [[EXIT]]: ; SI: s_endpgm define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-crash.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-crash.ll +++ test/CodeGen/AMDGPU/uniform-crash.ll @@ -3,9 +3,10 @@ ; GCN-LABEL: {{^}}icmp_2_users: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_cbranch_scc0 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_endpgm ; GCN: [[LABEL]]: -; GCN-NEXT: s_endpgm +; GCN: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -116,9 +116,13 @@ ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] ; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] +; SI: s_branch [[LABEL_PREHEADER:BB[0-9]+_[0-9]+]] + +; SI: [[LABEL_EXIT]]: +; SI: s_endpgm ; Initialize inner condition to false -; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader +; SI: [[LABEL_PREHEADER]]: ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] @@ -146,8 +150,6 @@ ; SI: BB#5 ; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: 
[[LABEL_EXIT]]: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm Index: test/CodeGen/ARM/2013-05-05-IfConvertBug.ll =================================================================== --- test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -112,15 +112,17 @@ ; CHECK-NEXT: subs [[REG:r[0-9]+]], #120 ; CHECK-NEXT: cmp [[REG]], r1 ; CHECK-NOT: it lt -; CHECK-NEXT: bge [[LABEL:.+]] +; CHECK-NEXT: blt [[LABEL:.+]] ; Next BB +; CHECK: subs r0, r1, r0 +; CHECK-NEXT: bx lr +; Next BB +; CHECK: [[LABEL]]: ; CHECK-NOT: cmplt ; CHECK: cmp r0, #119 ; CHECK-NEXT: itt le ; CHECK-NEXT: addle r0, r1, #1 ; CHECK-NEXT: bxle lr -; Next BB -; CHECK: [[LABEL]]: ; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: bx lr Index: test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- test/CodeGen/ARM/arm-shrink-wrapping.ll +++ test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -23,9 +23,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp r0, r1 -; ENABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ENABLE: bx lr ; ; Prologue code. +; ENABLE: [[SUCCESS_LABEL]]: ; CHECK: push {r7, lr} ; CHECK-NEXT: mov r7, sp ;; @@ -33,8 +35,12 @@ ; After the prologue is set. ; DISABLE: sub sp ; DISABLE: cmp r0, r1 -; DISABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; DISABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ARM-DISABLE: mov sp, r7 +; THUMB-DISABLE: add sp, +; DISABLE-NEXT: pop {r7, pc} ; +; DISABLE: [[SUCCESS_LABEL]]: ; Store %a in the alloca. ; ARM-ENABLE: push {r0} ; THUMB-ENABLE: str r0, [sp, #-4] @@ -50,9 +56,8 @@ ; THUMB-ENABLE-NEXT: add sp, #4 ; ENABLE-NEXT: pop{{(\.w)?}} {r7, lr} ; -; CHECK: [[EXIT_LABEL]]: -; -; Without shrink-wrapping, epilogue is in the exit block. +; Late stage tail-duplication removes the exit label with shrink-wrapping. +; Without shrink-wrapping, epilogue is before the return. ; Epilogue code. (What we pop does not matter.) ; ARM-DISABLE: mov sp, r7 ; THUMB-DISABLE: add sp, @@ -388,9 +393,9 @@ ; ; Next BB. ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; CHECK: add{{(\.w)?}} r4, r4, #1 +; ARM-DAG: subs [[IV]], [[IV]], #1 +; THUMB-DAG: subs [[IV]], #1 +; CHECK-DAG: add{{(\.w)?}} r4, r4, #1 ; CHECK: bne [[LOOP]] ; ; Next BB. 
Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -72,11 +72,11 @@ ; CHECK-ARMV7-NEXT: mov [[RES:r[0-9]+]], #1 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 ; CHECK-ARMV7-NEXT: bne [[TRY]] -; CHECK-ARMV7-NEXT: b [[END:.LBB[0-9_]+]] +; CHECK-ARMV7-NEXT: mov r0, [[RES]] +; CHECK-ARMV7-NEXT: bx lr ; CHECK-ARMV7-NEXT: [[FAIL]]: ; CHECK-ARMV7-NEXT: clrex ; CHECK-ARMV7-NEXT: mov [[RES]], #0 -; CHECK-ARMV7-NEXT: [[END]]: ; CHECK-ARMV7-NEXT: mov r0, [[RES]] ; CHECK-ARMV7-NEXT: bx lr Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -297,10 +297,10 @@ ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr Index: test/CodeGen/ARM/atomic-ops-v8.ll =================================================================== --- test/CodeGen/ARM/atomic-ops-v8.ll +++ test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it should. -; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} Index: test/CodeGen/ARM/machine-cse-cmp.ll =================================================================== --- test/CodeGen/ARM/machine-cse-cmp.ll +++ test/CodeGen/ARM/machine-cse-cmp.ll @@ -52,7 +52,7 @@ ; CHECK-LABEL: f3: ; CHECK-NOT: sub ; CHECK: cmp -; CHECK: blt +; CHECK: bge %0 = load i32, i32* %offset, align 4 %cmp = icmp slt i32 %0, %size %s = sub nsw i32 %0, %size Index: test/CodeGen/Mips/llvm-ir/ashr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/ashr.ll +++ test/CodeGen/Mips/llvm-ir/ashr.ll @@ -83,20 +83,23 @@ ; M2: srav $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: nop + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] + ; M2: beqz $[[T1]], $[[EXIT]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] - ; M2: nop - ; M2: sra $2, $4, 31 ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: sra $2, $4, 31 ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -167,20 +170,23 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: bnez $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: nop + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] + ; M3: beqz $[[T3]], .[[EXIT]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: 
beqz $[[T3]], [[BB1:.LBB[0-9_]+]] - ; M3: nop - ; M3: dsra $2, $4, 63 - ; M3: [[BB1]]: + ; M3: .[[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: dsra $2, $4, 63 ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/lshr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/lshr.ll +++ test/CodeGen/Mips/llvm-ir/lshr.ll @@ -81,20 +81,24 @@ ; M2: srlv $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $2, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $2, $zero, 0 - ; M2: move $2, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $2, $[[T0]] ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -158,20 +162,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrlv $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: beqz $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: daddiu $2, $zero, 0 + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], .[[EXIT]] ; M3: daddiu $2, $zero, 0 - ; M3: move $2, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $2, $[[T1]] ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/shl.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/shl.ll +++ test/CodeGen/Mips/llvm-ir/shl.ll @@ -97,20 +97,24 @@ ; M2: sllv $[[T0:[0-9]+]], $5, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $2, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $3, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: sllv $[[T2:[0-9]+]], $4, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: srl $[[T4:[0-9]+]], $5, 1 ; M2: srlv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $2, $[[T2]], $[[T3]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $3, $zero, 0 - ; M2: move $3, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $3, $[[T0]] ; 32R1-R5: sllv $[[T0:[0-9]+]], $4, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -174,20 +178,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsllv $[[T1:[0-9]+]], $5, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $2, $[[T1]] + ; M3: beqz $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: daddiu $3, $zero, 0 + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsllv $[[T4:[0-9]+]], $4, $7 ; M3: dsrl $[[T5:[0-9]+]], $5, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsrlv 
$[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $2, $[[T4]], $[[T7]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], .[[EXIT]] ; M3: daddiu $3, $zero, 0 - ; M3: move $3, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $3, $[[T1]] ; GP64-NOT-R6: dsllv $[[T0:[0-9]+]], $4, $7 ; GP64-NOT-R6: dsrl $[[T1:[0-9]+]], $5, 1 Index: test/CodeGen/Mips/longbranch.ll =================================================================== --- test/CodeGen/Mips/longbranch.ll +++ test/CodeGen/Mips/longbranch.ll @@ -84,7 +84,7 @@ ; Check the MIPS64 version. ; N64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1))) -; N64: bnez $4, [[BB0:\.LBB[0-9_]+]] +; N64: beqz $4, .[[EXIT:LBB[0-9_]+]] ; N64: daddu $[[R1:[0-9]+]], $[[R0]], $25 ; Check for long branch expansion: @@ -100,14 +100,15 @@ ; N64-NEXT: jr $1 ; N64-NEXT: daddiu $sp, $sp, 16 -; N64: [[BB0]]: +; N64: [[EXIT]]: +; N64: jr $ra +; N64: nop +; N64: [[BB2]]: ; N64: daddiu $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1))) ; N64: ld $[[R2:[0-9]+]], %got_disp(x)($[[GP]]) ; N64: addiu $[[R3:[0-9]+]], $zero, 1 -; N64: sw $[[R3]], 0($[[R2]]) -; N64: [[BB2]]: ; N64: jr $ra -; N64: nop +; N64: sw $[[R3]], 0($[[R2]]) ; In MIPS64R6 JR is an alias to JALR with $rd=0. As everything else remains the ; same with the N64 prefix, we use -asm-show-inst in order to make sure that Index: test/CodeGen/PowerPC/bdzlr.ll =================================================================== --- test/CodeGen/PowerPC/bdzlr.ll +++ test/CodeGen/PowerPC/bdzlr.ll @@ -53,13 +53,15 @@ ; CHECK: @lua_xmove ; CHECK: bnelr -; CHECK: bnelr +; CHECK: beq +; CHECK: blr ; CHECK: bdzlr ; CHECK-NOT: blr ; CHECK-CRB: @lua_xmove ; CHECK-CRB: bclr 12, -; CHECK-CRB: bclr 12, +; CHECK-CRB: bc 4, +; CHECK-CRB: blr ; CHECK-CRB: bdzlr ; CHECK-CRB-NOT: blr } Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,25 +1,25 @@ -; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +; RUN: llc -O2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" ; Intended layout: -; The outlining flag produces the layout +; The chain-based outlining produces the layout ; test1 ; test2 ; test3 ; test4 -; exit ; optional1 ; optional2 ; optional3 ; optional4 +; exit ; Tail duplication puts test n+1 at the end of optional n ; so optional1 includes a copy of test2 at the end, and branches ; to test3 (at the top) or falls through to optional 2. -; The CHECK statements check for the whole string of tests and exit block, +; The CHECK statements check for the whole string of tests ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. -;CHECK-LABEL: f: +;CHECK-LABEL: straight_test: ; test1 may have been merged with entry ;CHECK: mr [[TAGREG:[0-9]+]], 3 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 @@ -33,8 +33,7 @@ ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit -;CHECK: blr +;CHECK-NEXT: b [[EXITLABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[OPT1LABEL]] ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 ;CHECK-NEXT: beq 0, [[TEST3LABEL]] @@ -45,9 +44,10 @@ ;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: beq 0, [[EXITLABEL]] ;CHECK-NEXT: [[OPT4LABEL]] -;CHECK: b [[EXITLABEL]] +;CHECK: [[EXITLABEL]]: # %exit +;CHECK: blr -define void @f(i32 %tag) { +define void @straight_test(i32 %tag) { entry: br label %test1 test1: @@ -94,7 +94,115 @@ ret void } +; Intended layout: +; The chain-based outlining produces the layout +; entry +; --- Begin loop --- +; for.latch +; for.check +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; --- End loop --- +; exit +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: loop_test: +;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4 +;CHECK: [[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch +;CHECK: addi +;CHECK: [[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check +;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) +;CHECK: # %test1 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, [[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}} +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: b [[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[LATCHLABEL]] +define void @loop_test(i32* %tags, i32 %count) { +entry: + br label %for.check +for.check: + %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch] + %done.count = icmp ugt i32 %count.loop, 0 + %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count + %tag = load i32, i32* %tag_ptr + %done.tag = icmp eq i32 %tag, 0 + %done = and i1 %done.count, %done.tag + br i1 %done, label %test1, label %exit +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %for.latch, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %for.latch +for.latch: + %count.sub = sub i32 %count.loop, 1 + br label %for.check +exit: + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() + Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! 
%entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/SystemZ/tdc-06.ll =================================================================== --- test/CodeGen/SystemZ/tdc-06.ll +++ test/CodeGen/SystemZ/tdc-06.ll @@ -26,25 +26,27 @@ nonzeroord: ; CHECK: lhi %r2, 2 ; CHECK: tcdb %f0, 48 -; CHECK: jl [[RET]] +; CHECK: je [[FINITE:.]] %abs = tail call double @llvm.fabs.f64(double %x) %testinf = fcmp oeq double %abs, 0x7FF0000000000000 br i1 %testinf, label %ret, label %finite, !prof !1 +ret: +; CHECK: [[RET]]: +; CHECK: br %r14 + %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] + ret i32 %res + finite: ; CHECK: lhi %r2, 3 ; CHECK: tcdb %f0, 831 ; CHECK: blr %r14 ; CHECK: lhi %r2, 4 +; CHECK: br %r14 %testnormal = fcmp uge double %abs, 0x10000000000000 %finres = select i1 %testnormal, i32 3, i32 4 br label %ret -ret: -; CHECK: [[RET]]: -; CHECK: br %r14 - %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] - ret i32 %res } !1 = !{!"branch_weights", i32 1, i32 1} Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. +; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. 
 ; CHECK-LABEL: foo:
Index: test/CodeGen/Thumb2/cbnz.ll
===================================================================
--- test/CodeGen/Thumb2/cbnz.ll
+++ test/CodeGen/Thumb2/cbnz.ll
@@ -26,7 +26,7 @@
   call void @x()
   call void @x()
   call void @x()
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %q = icmp eq i32 %y, 0
   br i1 %q, label %t2, label %f
Index: test/CodeGen/Thumb2/ifcvt-compare.ll
===================================================================
--- test/CodeGen/Thumb2/ifcvt-compare.ll
+++ test/CodeGen/Thumb2/ifcvt-compare.ll
@@ -4,7 +4,7 @@
 define void @f0(i32 %x) optsize {
   ; CHECK-LABEL: f0:
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %p = icmp eq i32 %x, 0
   br i1 %p, label %t, label %f
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s

 ; Test memcpy, memmove, and memset intrinsics.
Index: test/CodeGen/X86/2012-08-17-legalizer-crash.ll
===================================================================
--- test/CodeGen/X86/2012-08-17-legalizer-crash.ll
+++ test/CodeGen/X86/2012-08-17-legalizer-crash.ll
@@ -26,5 +26,5 @@
   ret void

 ; CHECK-LABEL: fn1:
-; CHECK: jb
+; CHECK: jae
 }
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -62,8 +62,10 @@
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: ## implicit-def: %YMM0
 ; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne LBB4_2
-; CHECK-NEXT: ## BB#1: ## %load.i1247
+; CHECK-NEXT: je LBB4_1
+; CHECK-NEXT: ## BB#2: ## %__load_and_broadcast_32.exit1249
+; CHECK-NEXT: retq
+; CHECK-NEXT: LBB4_1: ## %load.i1247
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: movq %rsp, %rbp
 ; CHECK-NEXT: andq $-32, %rsp
@@ -71,7 +73,6 @@
 ; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT: movq %rbp, %rsp
 ; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: LBB4_2: ## %__load_and_broadcast_32.exit1249
 ; CHECK-NEXT: retq
 allocas:
   %udx495 = alloca [18 x [18 x float]], align 32
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
 ; ALL-NEXT: jne LBB3_1
-; ALL-NEXT: jnp LBB3_2
+; ALL-NEXT: jp LBB3_1
+; ALL-NEXT: ## BB#2: ## %return
+; ALL-NEXT: retq
 ; ALL-NEXT: LBB3_1: ## %if.end
 ; ALL-NEXT: seta %al
 ; ALL-NEXT: movzbl %al, %eax
 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: LBB3_2: ## %return
 ; ALL-NEXT: retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -352,17 +352,22 @@
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; The tail-duplication outlining algorithm places
+; %loop.body3 and %loop.inner1.begin out-of-line at the end of the loop,
+; because %loop.body4 is unavoidable within the loop and short,
+; and %loop.inner1.begin has an alternate fallthrough of %loop.body3.
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
+; CHECK: %loop.body4
+; CHECK: %loop.inner2.begin
+; CHECK: %loop.inner2.begin
+; The loop.inner2.end block is folded
 ; CHECK: %loop.body3
 ; CHECK: %loop.inner1.begin
 ; The end block is folded with %loop.body3...
 ; CHECK-NOT: %loop.inner1.end
-; CHECK: %loop.body4
-; CHECK: %loop.inner2.begin
-; The loop.inner2.end block is folded
 ; CHECK: %loop.header
 ; CHECK: %bail
@@ -559,7 +564,7 @@
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -587,7 +592,7 @@
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
@@ -609,7 +614,7 @@
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -649,7 +654,7 @@
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -673,7 +678,7 @@
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -699,12 +704,13 @@
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
-; CHECK: %c
 ; CHECK: %exit
+; CHECK: %c
+; CHECK: retl

 entry:
   br i1 undef, label %a, label %b
@@ -729,7 +735,7 @@
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
@@ -948,7 +954,7 @@
 ; strange layouts that are siginificantly less efficient, often times maing
 ; it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
Index: test/CodeGen/X86/cmovcmov.ll
===================================================================
--- test/CodeGen/X86/cmovcmov.ll
+++ test/CodeGen/X86/cmovcmov.ll
@@ -192,7 +192,7 @@
 ; CMOV-NEXT: retq

 ; NOCMOV: jne
-; NOCMOV-NEXT: jp
+; NOCMOV-NEXT: jnp
 define float @test_zext_fcmp_une(float %a, float %b) #0 {
 entry:
   %cmp = fcmp une float %a, %b
@@ -214,7 +214,7 @@
 ; CMOV-NEXT: retq

 ; NOCMOV: jne
-; NOCMOV-NEXT: jp
+; NOCMOV-NEXT: jnp
 define float @test_zext_fcmp_oeq(float %a, float %b) #0 {
 entry:
   %cmp = fcmp oeq float %a, %b
Index: test/CodeGen/X86/critical-edge-split-2.ll
===================================================================
--- test/CodeGen/X86/critical-edge-split-2.ll
+++ test/CodeGen/X86/critical-edge-split-2.ll
@@ -24,6 +24,7 @@
 ; CHECK-LABEL: test1:
 ; CHECK: testb %dil, %dil
-; CHECK: jne LBB0_2
+; CHECK: je LBB0_1
+; CHECK: retq
+; CHECK: LBB0_1:
 ; CHECK: divl
-; CHECK: LBB0_2:
Index: test/CodeGen/X86/shrink-wrap-chkstk.ll
===================================================================
--- test/CodeGen/X86/shrink-wrap-chkstk.ll
+++ test/CodeGen/X86/shrink-wrap-chkstk.ll
@@ -62,11 +62,12 @@
 ; CHECK-LABEL: @use_eax_before_prologue@8: # @use_eax_before_prologue
 ; CHECK: movl %ecx, %eax
 ; CHECK: cmpl %edx, %eax
-; CHECK: jge LBB1_2
+; CHECK: jl LBB1_1
+; CHECK: retl
+; CHECK: LBB1_1
 ; CHECK: pushl %eax
 ; CHECK: movl $4092, %eax
 ; CHECK: calll __chkstk
 ; CHECK: movl 4092(%esp), %eax
 ; CHECK: calll _doSomething
-; CHECK: LBB1_2:
 ; CHECK: retl
Index: test/CodeGen/X86/twoaddr-coalesce-3.ll
===================================================================
--- test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov
Index: test/CodeGen/X86/x86-shrink-wrap-unwind.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -24,7 +24,9 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: popq
+; CHECK-NEXT: retq
 ;
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
@@ -33,14 +35,9 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; CHECK-NEXT: popq
-;
 ; CHECK-NEXT: retq
+;
 define i32 @framelessUnwind(i32 %a, i32 %b) #0 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -70,9 +67,11 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq %rbp
 ; CHECK: movq %rsp, %rbp
 ;
@@ -86,9 +85,8 @@
 ;
 ; Epilogue code. (What we pop does not matter.)
 ; CHECK: popq %rbp
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
+;
 define i32 @frameUnwind(i32 %a, i32 %b) #1 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -116,10 +114,12 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq
 ;
 ; Store %a in the alloca.
@@ -132,8 +132,6 @@
 ;
 ; Epilogue code.
 ; CHECK-NEXT: addq
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
 define i32 @framelessnoUnwind(i32 %a, i32 %b) #2 {
   %tmp = alloca i32, align 4
Index: test/CodeGen/X86/x86-shrink-wrapping.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrapping.ll
+++ test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -18,18 +18,24 @@
 ; No prologue needed.
 ; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; ENABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; ENABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; ENABLE: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; ENABLE: [[SUCCESS_LABEL]]:
 ; CHECK: pushq
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
 ; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; DISABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; DISABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
 ;
+; DISABLE: popq
+; DISABLE-NEXT: retq
+
+; DISABLE: [[SUCCESS_LABEL]]:
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
 ; Set the alloca address in the second argument.
@@ -37,17 +43,11 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
 ; With shrink-wrapping, epilogue is just after the call.
 ; ENABLE-NEXT: addq $8, %rsp
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; DISABLE-NEXT: popq
-;
 ; CHECK-NEXT: retq
+
 define i32 @foo(i32 %a, i32 %b) {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b