Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" @@ -266,6 +267,12 @@ /// \brief A typedef for a block filter set. typedef SmallSetVector BlockFilterSet; + /// Pair struct containing basic block and taildup profitability + struct BlockAndTailDupResult { + MachineBasicBlock * BB; + bool ShouldTailDup; + }; + /// \brief work lists of blocks that are ready to be laid out SmallVector BlockWorkList; SmallVector EHPadWorkList; @@ -293,9 +300,12 @@ /// \brief A handle to the target's lowering info. const TargetLoweringBase *TLI; - /// \brief A handle to the post dominator tree. + /// \brief A handle to the dominator tree. MachineDominatorTree *MDT; + /// \brief A handle to the post dominator tree. + MachinePostDominatorTree *MPDT; + /// \brief Duplicator used to duplicate tails during placement. 
/// /// Placement decisions can open up new tail duplication opportunities, but @@ -368,9 +378,9 @@ BlockChain &SuccChain, BranchProbability SuccProb, BranchProbability RealSuccProb, BlockChain &Chain, const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor(MachineBasicBlock *BB, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); MachineBasicBlock * selectBestCandidateBlock(BlockChain &Chain, SmallVectorImpl &WorkList); @@ -403,6 +413,15 @@ void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Check edge frequencies to see if tail duplication increases fallthroughs. + bool isProfitableToTailDup(MachineBasicBlock *BB, MachineBasicBlock *Succ, + bool SuccNoTailDup); + /// Returns true if a block can tail duplicate into all unplaced + /// predecessors. Filters based on loop. + bool canTailDuplicateUnplacedPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, const BlockFilterSet *BlockFilter); public: static char ID; // Pass identification, replacement for typeid @@ -416,6 +435,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + if (TailDupPlacement) + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -430,6 +451,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", "Branch Probability Basic Block Placement", false, false) @@ -561,6 +583,134 @@ return SuccProb; } +/// Check if a block should be tail duplicated. +/// \p BB Block to check. 
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + bool IsSimple = TailDup.isSimpleBB(BB); + + if (BB->succ_size() == 1) + return false; + return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. +/// \p SuccNoTailDup True if Succ would be chosen w/o considering duplication. +bool MachineBlockPlacement::isProfitableToTailDup( + MachineBasicBlock *BB, MachineBasicBlock *Succ, bool SuccNoTailDup) { + // We need to do a probability calculation to make sure this is profitable. + // First: does succ have a successor that post-dominates? This affects the + // calculation. The 2 relevant cases are: + // BB BB + // | | + // P| |P + // = C = C + // | /Q | /Q + // | / | / + // Succ Succ + // / \ | \ V + // U/ =V |U \ + // / \ = D + // D E | / + // | / + // |/ + // PDom + // In the second case, Placing Succ while duplicating it into C prevents the + // fallthrough of Succ into either D or PDom, because they now have C as an + // unplaced predecessor + MachineBasicBlock *PDom = nullptr; + auto BestSuccSucc = BranchProbability::getZero(); + for (MachineBasicBlock *SuccSucc : Succ->successors()) { + auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); + if (Prob > BestSuccSucc) + BestSuccSucc = Prob; + // MPDT is only computed when TailDupPlacement is enabled. + if (!PDom && MPDT && MPDT->dominates(SuccSucc, Succ)) + PDom = SuccSucc; + } + // If it doesn't have a post-dominating successor, here is the calculation: + // BB BB + // | | \ + // P| | = + // = C | C + // | /Q | | + // | / | | + // Succ Succ /| + // / \ | \/ | + // U/ =V = /= = + // / \ | / \| + // D E D E + // Cost in the first case is: P + V + // Cost in the second case is: Q + QV + PU + PV + if (PDom == nullptr || !Succ->isSuccessor(PDom)) { + BranchProbability P = 
(MBPI->getEdgeProbability(BB, Succ)); + BranchProbability Q = P.getCompl(); + BranchProbability U = BestSuccSucc; + BranchProbability V = U.getCompl(); + BranchProbability QV = Q * V; + uint64_t BaseCost = static_cast<uint64_t>(P.getNumerator()) + + static_cast<uint64_t>(V.getNumerator()); + uint64_t DupCost = static_cast<uint64_t>(Q.getNumerator()) + + static_cast<uint64_t>(QV.getNumerator()) + + static_cast<uint64_t>(P.getNumerator()); + return (BaseCost > DupCost); + } + BranchProbability U = MBPI->getEdgeProbability(Succ, PDom); + BranchProbability Q = MBPI->getEdgeProbability(BB, Succ).getCompl(); + // If there is a post-dominating successor, here is the calculation: + // BB BB + // | \ Q | \ Q + // |P \ | = + // = C |P C (+Succ) + // | / | | + // | / | | + // Succ Succ /| + // | \ V | \/ | + // |U \ |U /\ | + // = D = = =| + // | / |/ D + // | / | / + // |/ | / + // PDom PDom + // Branches have been marked with (=) + // The cost for taken branches in the first case is P + U + // The cost in the second case (assuming independence), given the layout: + // BB, Succ, (C+Succ), D, PDom + // is Q + P*U + P*V + Q*U == Q + P + Q*U. Subtracting P means we need to + // compare U vs Q + Q*U. + return (U > (Q + Q*U)); +} + + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. In +/// addition we keep a set of blocks that have been tail-duplicated into and +/// allow those blocks to be unplaced as well. This allows the creation of a +/// second (larger) spine and a short fallthrough spine. +/// We also identify blocks with the CFG that would have been produced by +/// tail-duplication and lay them out in the same manner. 
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + if (!shouldTailDuplicate(Succ)) + return false; + + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Skip BB itself, blocks outside BlockFilter, and blocks already in Chain; + // otherwise TailDup.canTailDuplicate(Succ, Pred) must hold for the pred. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) + return false; + } + return true; +} + /// When the option OutlineOptionalBranches is on, this method /// checks if the fallthrough candidate block \p Succ (of block /// \p BB) also has other unscheduled predecessor blocks which @@ -609,11 +759,11 @@ if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { /* See case 1 below for the cost analysis. For BB->Succ to * be taken with smaller cost, the following needs to hold: - * Prob(BB->Succ) > 2* Prob(BB->Pred) - * So the threshold T - * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, - * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified - * branch bias, we have + * Prob(BB->Succ) > 2 * Prob(BB->Pred) + * So the threshold T in the calculation below + * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) + * So T / (1 - T) = 2, Yielding T = 2/3 + * Also adding user specified branch bias, we have * T = (2/3)*(ProfileLikelyProb/50) * = (2*ProfileLikelyProb)/150) */ @@ -798,14 +948,15 @@ /// breaking CFG structure, but cave and break such structures in the case of /// very hot successor edges. /// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * +/// \returns The best successor block found, or null if none are viable, along +/// with a boolean indicating if tail duplication is necessary. 
+BlockAndTailDupResult MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - MachineBasicBlock *BestSucc = nullptr; + BlockAndTailDupResult BestSucc = { nullptr, false }; auto BestProb = BranchProbability::getZero(); SmallVector Successors; @@ -814,6 +965,7 @@ DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { + bool ShouldTailDup = false; auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); @@ -827,8 +979,17 @@ // Skip the edge \c BB->Succ if block \c Succ has a better layout // predecessor that yields lower global cost. if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, - Chain, BlockFilter)) - continue; + Chain, BlockFilter)) { + // If tail duplication would make Succ profitable, place it. + if (!(TailDupPlacement + && canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) + && isProfitableToTailDup(BB, Succ, false))) + continue; + ShouldTailDup = true; + } + + if (!ShouldTailDup) + ShouldTailDup = isProfitableToTailDup(BB, Succ, true); DEBUG( dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " @@ -836,13 +997,14 @@ << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) { + if (BestSucc.BB && BestProb >= SuccProb) { DEBUG(dbgs() << " Not the best candidate, continuing\n"); continue; } DEBUG(dbgs() << " Setting it as best candidate\n"); - BestSucc = Succ; + BestSucc.BB = Succ; + BestSucc.ShouldTailDup = ShouldTailDup; BestProb = SuccProb; } if (BestSucc) @@ -995,7 +1157,9 @@ // Look for the best viable successor if there is one to place immediately // after this block. 
- MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + auto Result = selectBestSuccessor(BB, Chain, BlockFilter); + MachineBasicBlock* BestSucc = Result.BB; + bool ShouldTailDup = Result.ShouldTailDup; // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1016,7 +1180,7 @@ // Placement may have changed tail duplication opportunities. // Check for that now. - if (TailDupPlacement && BestSucc) { + if (TailDupPlacement && BestSucc && ShouldTailDup) { // If the chosen successor was duplicated into all its predecessors, // don't bother laying it out, just go round the loop again with BB as // the chain end. @@ -1908,13 +2072,8 @@ DuplicatedToLPred = false; DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber() << "\n"); - bool IsSimple = TailDup.isSimpleBB(BB); - // Blocks with single successors don't create additional fallthrough - // opportunities. Don't duplicate them. TODO: When conditional exits are - // analyzable, allow them to be duplicated. - if (!IsSimple && BB->succ_size() == 1) - return false; - if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) + + if (!shouldTailDuplicate(BB)) return false; // This has to be a callback because none of it can be done after // BB is deleted. @@ -1967,6 +2126,7 @@ llvm::function_ref(RemovalCallback); SmallVector DuplicatedPreds; + bool IsSimple = TailDup.isSimpleBB(BB); TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, &RemovalCallbackRef); @@ -2007,12 +2167,14 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + MPDT = nullptr; // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. 
PreferredLoopExit = nullptr; if (TailDupPlacement) { + MPDT = &getAnalysis(); unsigned TailDupSize = TailDuplicatePlacementThreshold; if (MF.getFunction()->optForSize()) TailDupSize = 1; @@ -2043,6 +2205,8 @@ BlockToChain.clear(); // Must redo the dominator tree if blocks were changed. MDT->runOnMachineFunction(MF); + if (MPDT) + MPDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: test/CodeGen/AArch64/addsub.ll =================================================================== --- test/CodeGen/AArch64/addsub.ll +++ test/CodeGen/AArch64/addsub.ll @@ -140,12 +140,17 @@ test5: ; CHECK: cmn {{w[0-9]+}}, #444 -; CHECK: b.gt [[RET]] +; CHECK: b.le [[TEST6:.?LBB[0-9]+_[0-9]+]] %newval5 = add i32 %val, 4 store i32 %newval5, i32* @var_i32 %cmp_neg_uge = icmp sgt i32 %val2, -444 br i1 %cmp_neg_uge, label %ret, label %test6 +; CHECK: {{^}}[[RET]]: +; CHECK: ret +; CHECK: {{^}}[[TEST6]]: +; CHECK: ret + test6: %newval6 = add i32 %val, 5 store i32 %newval6, i32* @var_i32 Index: test/CodeGen/AArch64/arm64-atomic.ll =================================================================== --- test/CodeGen/AArch64/arm64-atomic.ll +++ test/CodeGen/AArch64/arm64-atomic.ll @@ -9,10 +9,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -27,10 +27,12 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: mov x0, 
x[[ADDR]] +; CHECK-NEXT: ret %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -41,15 +43,15 @@ ; CHECK-LABEL: val_compare_and_swap_rel: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -64,10 +66,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val Index: test/CodeGen/AArch64/arm64-ccmp.ll =================================================================== --- test/CodeGen/AArch64/arm64-ccmp.ll +++ test/CodeGen/AArch64/arm64-ccmp.ll @@ -108,10 +108,10 @@ ; CHECK: cmp w0, #1 ; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0 ; CHECK: ccmp [[DIVRES]], #16, #0, ge -; CHECK: b.gt [[BLOCK:LBB[0-9_]+]] -; CHECK: bl _foo -; CHECK: [[BLOCK]]: +; CHECK: b.le [[BLOCK:LBB[0-9_]+]] ; CHECK: orr w0, wzr, #0x7 +; CHECK: [[BLOCK]]: +; CHECK: bl _foo define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp sgt i32 %a, 0 @@ -135,7 +135,7 @@ ; CHECK: cmp ; CHECK-NOT: b. 
; CHECK: fccmp {{.*}}, #8, ge -; CHECK: b.lt +; CHECK: b.ge define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { entry: %cmp = icmp sgt i32 %a, 0 Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -346,19 +346,15 @@ ; CHECK-NEXT: sub w1, w1, #1 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]] ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]] -; DISABLE-NEXT: b [[IFEND_LABEL]] -; -; DISABLE: [[ELSE_LABEL]]: ; %if.else -; DISABLE: lsl w0, w1, #1 -; -; CHECK: [[IFEND_LABEL]]: +; CHECK-NEXT: [[IFEND_LABEL]]: ; Epilogue code. ; CHECK: add sp, sp, #16 ; CHECK-NEXT: ret ; -; ENABLE: [[ELSE_LABEL]]: ; %if.else -; ENABLE-NEXT: lsl w0, w1, #1 -; ENABLE_NEXT: ret +; CHECK: [[ELSE_LABEL]]: ; %if.else +; CHECK-NEXT: lsl w0, w1, #1 +; DISABLE-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 { entry: %ap = alloca i8*, align 8 Index: test/CodeGen/AArch64/compare-branch.ll =================================================================== --- test/CodeGen/AArch64/compare-branch.ll +++ test/CodeGen/AArch64/compare-branch.ll @@ -27,7 +27,7 @@ %val4 = load volatile i64, i64* @var64 %tst4 = icmp ne i64 %val4, 0 br i1 %tst4, label %end, label %test5, !prof !1 -; CHECK: cbnz {{x[0-9]+}}, .LBB +; CHECK: cbz {{x[0-9]+}}, .LBB test5: store volatile i64 %val4, i64* @var64 Index: test/CodeGen/AArch64/logical_shifted_reg.ll =================================================================== --- test/CodeGen/AArch64/logical_shifted_reg.ll +++ test/CodeGen/AArch64/logical_shifted_reg.ll @@ -210,7 +210,7 @@ test3: ; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}}, asr #12 -; CHECK: b.gt .L +; CHECK: b.le .L %asr_op = ashr i64 %val2, 12 %asr_and = and i64 %asr_op, %val1 %tst3 = icmp sgt i64 %asr_and, 0 Index: test/CodeGen/AArch64/tbz-tbnz.ll 
=================================================================== --- test/CodeGen/AArch64/tbz-tbnz.ll +++ test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -178,7 +178,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -247,7 +247,7 @@ ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -262,7 +262,7 @@ br i1 %cond, label %if.end, label %if.then ; CHECK-NOT: and -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -335,6 +335,12 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; 
GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm define void @expand_requires_expand(i32 %cond0) #0 { bb0: @@ -356,6 +362,12 @@ br label %bb3 bb3: +; These NOPs prevent tail-duplication-based outlining +; from firing, which defeats the need to expand the branches and this test. + call void asm sideeffect + "v_nop_e64", ""() #0 + call void asm sideeffect + "v_nop_e64", ""() #0 ret void } @@ -385,6 +397,7 @@ ; GCN-NEXT: [[ENDIF]]: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: @@ -402,6 +415,9 @@ br label %endif endif: + ; layout can remove the split branch if it can copy the return block. + ; This call makes the return block long enough that it doesn't get copied. + call void @llvm.amdgcn.s.sleep(i32 5); ret void } Index: test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -37,7 +37,10 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_ret_noloop: -; GCN: s_cbranch_scc1 +; GCN: s_cbranch_scc0 [[BODY:BB[0-9]+_[0-9]+]] +; GCN: s_endpgm + +; GCN: {{^}}[[BODY]]: ; GCN: s_endpgm ; GCN: .Lfunc_end1 define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -252,10 +252,12 @@ ; GCN: s_cmp_lt_i32 [[COND]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}} -; GCN: s_cbranch_vccnz [[EXIT]] -; GCN: buffer_store +; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm +; GCN: {{^}}[[BODY]]: +; GCN: buffer_store +; GCN: s_endpgm define void @icmp_users_different_blocks(i32 %cond0, i32 
%cond1, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -302,9 +304,10 @@ ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] -; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]] ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] +; GCN: s_endpgm +; GCN: {{^}}[[IF_UNIFORM_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { @@ -328,14 +331,13 @@ ; GCN-LABEL: {{^}}divergent_inside_uniform: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: [[IF_LABEL]]: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -; GCN: [[ENDIF_LABEL]]: -; GCN: s_endpgm define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: %u_cmp = icmp eq i32 %cond, 0 @@ -363,11 +365,11 @@ ; GCN: buffer_store_dword [[ONE]] ; GCN: s_or_b64 exec, exec, [[MASK]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: [[IF_UNIFORM]]: ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] -; GCN: [[EXIT]]: -; GCN: s_endpgm define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -398,16 +400,20 @@ ; GCN-LABEL: {{^}}cse_uniform_condition_different_blocks: ; GCN: s_load_dword [[COND:s[0-9]+]] ; GCN: s_cmp_lt_i32 [[COND]], 1 -; GCN: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3 +; GCN: s_cbranch_scc1 
[[FN:BB[0-9_]+]] ; GCN: BB#1: ; GCN-NOT: cmp ; GCN: buffer_load_dword ; GCN: buffer_store_dword -; GCN: s_cbranch_scc1 BB[[FNNUM]]_3 +; GCN: s_cbranch_scc0 [[BB7:BB[0-9_]+]] -; GCN: BB[[FNNUM]]_3: +; GCN: [[FN]]: ; GCN: s_endpgm + +; GCN: [[BB7]]: +; GCN: s_endpgm + define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 Index: test/CodeGen/ARM/arm-and-tst-peephole.ll =================================================================== --- test/CodeGen/ARM/arm-and-tst-peephole.ll +++ test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -49,9 +49,9 @@ ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: bne -; V8-NEXT: b -; The trailing space in the last line checks that the branch is unconditional +; V8-NEXT: beq +; V8-NEXT: %sw.epilog +; V8-NEXT: bx lr switch i32 %and, label %sw.epilog [ i32 1, label %sw.bb i32 3, label %sw.bb6 Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -320,10 +320,10 @@ ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr Index: test/CodeGen/ARM/atomic-ops-v8.ll =================================================================== --- test/CodeGen/ARM/atomic-ops-v8.ll +++ test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr Index: test/CodeGen/ARM/cmpxchg-weak.ll =================================================================== --- test/CodeGen/ARM/cmpxchg-weak.ll +++ test/CodeGen/ARM/cmpxchg-weak.ll @@ -13,14 +13,16 @@ ; CHECK-NEXT: dmb ish ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-NEXT: cmp [[SUCCESS]], #0 -; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: BB#2: -; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: [[LDFAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: bx lr +; CHECK-NEXT: [[SUCCESSBB]]: +; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr Index: test/CodeGen/ARM/machine-cse-cmp.ll =================================================================== 
--- test/CodeGen/ARM/machine-cse-cmp.ll +++ test/CodeGen/ARM/machine-cse-cmp.ll @@ -52,7 +52,7 @@ ; CHECK-LABEL: f3: ; CHECK-NOT: sub ; CHECK: cmp -; CHECK: blt +; CHECK: bge %0 = load i32, i32* %offset, align 4 %cmp = icmp slt i32 %0, %size %s = sub nsw i32 %0, %size Index: test/CodeGen/Mips/brconeq.ll =================================================================== --- test/CodeGen/Mips/brconeq.ll +++ test/CodeGen/Mips/brconeq.ll @@ -8,11 +8,11 @@ entry: %0 = load i32, i32* @i, align 4 %1 = load i32, i32* @j, align 4 - %cmp = icmp eq i32 %0, %1 + %cmp = icmp ne i32 %0, %1 ; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} ; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: - br i1 %cmp, label %if.end, label %if.then + br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry store i32 1, i32* @result, align 4 Index: test/CodeGen/Mips/brconeqk.ll =================================================================== --- test/CodeGen/Mips/brconeqk.ll +++ test/CodeGen/Mips/brconeqk.ll @@ -6,8 +6,8 @@ define void @test() nounwind { entry: %0 = load i32, i32* @i, align 4 - %cmp = icmp eq i32 %0, 10 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp ne i32 %0, 10 + br i1 %cmp, label %if.then, label %if.end ; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} ; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: Index: test/CodeGen/Mips/brcongt.ll =================================================================== --- test/CodeGen/Mips/brcongt.ll +++ test/CodeGen/Mips/brcongt.ll @@ -9,8 +9,8 @@ entry: %0 = load i32, i32* @i, align 4 %1 = load i32, i32* @j, align 4 - %cmp = icmp sgt i32 %0, %1 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp sle i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} ; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: Index: test/CodeGen/Mips/brconlt.ll =================================================================== --- test/CodeGen/Mips/brconlt.ll +++ test/CodeGen/Mips/brconlt.ll @@ -10,8 +10,8 @@ 
entry: %0 = load i32, i32* @j, align 4 %1 = load i32, i32* @i, align 4 - %cmp = icmp slt i32 %0, %1 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp sge i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} ; MM32R6: slt ${{[0-9]+}}, ${{[0-9]+}} Index: test/CodeGen/Mips/brconnez.ll =================================================================== --- test/CodeGen/Mips/brconnez.ll +++ test/CodeGen/Mips/brconnez.ll @@ -7,7 +7,7 @@ entry: %0 = load i32, i32* @j, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 ; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] ; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) @@ -21,4 +21,4 @@ ret void } - +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/Mips/llvm-ir/ashr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/ashr.ll +++ test/CodeGen/Mips/llvm-ir/ashr.ll @@ -91,12 +91,13 @@ ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $3, $[[T3]], $[[T2]] ; M2: $[[BB0]]: - ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] ; M2: nop - ; M2: sra $2, $4, 31 - ; M2: $[[BB1]]: ; M2: jr $ra ; M2: nop + ; M2: $[[BB1]]: + ; M2: jr $ra + ; M2: sra $2, $4, 31 ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -177,12 +178,13 @@ ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] ; M3: [[BB0]]: - ; M3: beqz $[[T3]], [[BB1:.LBB[0-9_]+]] + ; M3: bnez $[[T3]], [[BB1:.LBB[0-9_]+]] ; M3: nop - ; M3: dsra $2, $4, 63 - ; M3: [[BB1]]: ; M3: jr $ra ; M3: nop + ; M3: [[BB1]]: + ; M3: jr $ra + ; M3: dsra $2, $4, 63 ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/micromips-compact-branches.ll =================================================================== --- test/CodeGen/Mips/micromips-compact-branches.ll +++ 
test/CodeGen/Mips/micromips-compact-branches.ll @@ -6,7 +6,7 @@ %x = alloca i32, align 4 %0 = load i32, i32* %x, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 if.then: store i32 10, i32* %x, align 4 @@ -17,3 +17,4 @@ } ; CHECK: bnezc +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/PowerPC/misched-inorder-latency.ll =================================================================== --- test/CodeGen/PowerPC/misched-inorder-latency.ll +++ test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -17,7 +17,7 @@ %sum1 = add i32 %sumin, 1 %val1 = load i32, i32* %ptr %p = icmp eq i32 %sumin, 0 - br i1 %p, label %true, label %end + br i1 %p, label %true, label %end, !prof !1 true: %sum2 = add i32 %sum1, 1 %ptr2 = getelementptr i32, i32* %ptr, i32 1 @@ -53,3 +53,5 @@ ret i32 %valmerge } declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll @@ -0,0 +1,97 @@ +; RUN: llc -O2 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The code for tail-duplication during layout will produce the layout: +; test1 +; test2 +; body1 (with copy of test2) +; body2 +; exit + +;CHECK-LABEL: tail_dup_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[BODY1LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[BODY2LABEL]] +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} + +; The branch weights here hint that we shouldn't tail duplicate in this case. +;CHECK-LABEL: tail_dup_dont_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body1 +;CHECK: [[TEST2LABEL]]: # %test2 +;CHECK-NEXT: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body2 +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_dont_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp ne i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +!1 = !{!"branch_weights", i32 5, i32 3} Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -19,7 +19,7 @@ ; The CHECK statements check for the whole string of tests and exit block, ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. -;CHECK-LABEL: f: +;CHECK-LABEL: straight_test: ; test1 may have been merged with entry ;CHECK: mr [[TAGREG:[0-9]+]], 3 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 @@ -47,7 +47,7 @@ ;CHECK-NEXT: [[OPT4LABEL]] ;CHECK: b [[EXITLABEL]] -define void @f(i32 %tag) { +define void @straight_test(i32 %tag) { entry: br label %test1 test1: @@ -94,7 +94,57 @@ ret void } +; The block then2 is not unavoidable, but since it can be tail-duplicated, it +; should be placed as a fallthrough from test2 and copied. +; CHECK-LABEL: avoidable_test: +; CHECK: # %entry +; CHECK: andi. 
+; CHECK: # %test2 +; Make sure then2 falls through from test2 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %then2 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %end2 +; CHECK: # %else1 +; CHECK: bl a +; CHECK: bl a +; Make sure then2 was copied into else1 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK: # %else2 +; CHECK: bl c +define void @avoidable_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely +else1: + call void @a() + call void @a() + br label %then2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely +then2: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely +else2: + call void @c() + br label %end2 +end2: + ret void +end1: + call void @d() + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! 
%entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/SystemZ/asm-18.ll =================================================================== --- test/CodeGen/SystemZ/asm-18.ll +++ test/CodeGen/SystemZ/asm-18.ll @@ -297,7 +297,7 @@ ; CHECK: iihf [[REG]], 2102030405 ; CHECK: blah [[REG]] ; CHECK: br %r14 - %cmp = icmp eq i32 %x, 0 + %cmp = icmp ne i32 %x, 0 %val = select i1 %cmp, i32 0, i32 2102030405 call void asm sideeffect "blah $0", "h"(i32 %val) ret void @@ -311,7 +311,7 @@ ; CHECK: iilf [[REG]], 2102030405 ; CHECK: blah [[REG]] ; CHECK: br %r14 - %cmp = icmp eq i32 %x, 0 + %cmp = icmp ne i32 %x, 0 %val = select i1 %cmp, i32 0, i32 2102030405 call void asm sideeffect "blah $0", "r"(i32 %val) ret void Index: test/CodeGen/SystemZ/cond-store-01.ll =================================================================== --- test/CodeGen/SystemZ/cond-store-01.ll +++ test/CodeGen/SystemZ/cond-store-01.ll @@ -297,8 +297,11 @@ define void @f18(i8 *%ptr, i8 %alt, i32 %limit) { ; CHECK-LABEL: f18: ; CHECK: lb {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: stc {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: stc {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 @@ -331,8 +334,11 @@ ; FIXME: should use a normal load instead of CS. 
; CHECK-LABEL: f20: ; CHECK: lb {{%r[0-9]+}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: stc {{%r[0-9]+}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: stc {{%r[0-9]+}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/cond-store-02.ll =================================================================== --- test/CodeGen/SystemZ/cond-store-02.ll +++ test/CodeGen/SystemZ/cond-store-02.ll @@ -297,8 +297,11 @@ define void @f18(i16 *%ptr, i16 %alt, i32 %limit) { ; CHECK-LABEL: f18: ; CHECK: lh {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: sth {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: sth {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 @@ -331,8 +334,11 @@ ; FIXME: should use a normal load instead of CS. ; CHECK-LABEL: f20: ; CHECK: lh {{%r[0-9]+}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: sth {{%r[0-9]+}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-9]+}}, {{%r[0-9]+}} ; CHECK: sth {{%r[0-9]+}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/cond-store-03.ll =================================================================== --- test/CodeGen/SystemZ/cond-store-03.ll +++ test/CodeGen/SystemZ/cond-store-03.ll @@ -226,8 +226,11 @@ define void @f14(i32 *%ptr, i32 %alt, i32 %limit) { ; CHECK-LABEL: f14: ; CHECK: l {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: st {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: st {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 @@ -260,8 +263,11 @@ ; FIXME: should use a normal load instead of CS. 
; CHECK-LABEL: f16: ; CHECK: l {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: st {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: st {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/cond-store-04.ll =================================================================== --- test/CodeGen/SystemZ/cond-store-04.ll +++ test/CodeGen/SystemZ/cond-store-04.ll @@ -124,8 +124,11 @@ define void @f8(i64 *%ptr, i64 %alt, i32 %limit) { ; CHECK-LABEL: f8: ; CHECK: lg {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: stg {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lgr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: stg {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 @@ -158,8 +161,11 @@ ; FIXME: should use a normal load instead of CSG. ; CHECK-LABEL: f10: ; CHECK: lg {{%r[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: stg {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: lgr {{%r[0-5]}}, {{%r[0-5]}} ; CHECK: stg {{%r[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/cond-store-05.ll =================================================================== --- test/CodeGen/SystemZ/cond-store-05.ll +++ test/CodeGen/SystemZ/cond-store-05.ll @@ -156,8 +156,11 @@ define void @f10(float *%ptr, float %alt, i32 %limit) { ; CHECK-LABEL: f10: ; CHECK: le {{%f[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: ste {{%f[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: ler {{%f[0-5]}}, {{%f[0-5]}} ; CHECK: ste {{%f[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/cond-store-06.ll 
=================================================================== --- test/CodeGen/SystemZ/cond-store-06.ll +++ test/CodeGen/SystemZ/cond-store-06.ll @@ -156,8 +156,11 @@ define void @f10(double *%ptr, double %alt, i32 %limit) { ; CHECK-LABEL: f10: ; CHECK: ld {{%f[0-5]}}, 0(%r2) -; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]] +; CHECK: std {{%f[0-5]}}, 0(%r2) +; CHECK: br %r14 ; CHECK: [[LABEL]]: +; CHECK: ldr {{%f[0-5]}}, {{%f[0-5]}} ; CHECK: std {{%f[0-5]}}, 0(%r2) ; CHECK: br %r14 %cond = icmp ult i32 %limit, 420 Index: test/CodeGen/SystemZ/int-cmp-37.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-37.ll +++ test/CodeGen/SystemZ/int-cmp-37.ll @@ -15,8 +15,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -34,8 +34,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp slt i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp sge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -54,8 +54,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp eq i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp ne i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -74,8 +74,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp ne i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp eq i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -95,8 +95,8 @@ entry: %val = load i16 , i16 *@h, align 1 %src2 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 
%cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -115,8 +115,8 @@ entry: %val = load i16 , i16 *@g %src1 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src2, %src2 br label %exit Index: test/CodeGen/SystemZ/int-cmp-40.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-40.ll +++ test/CodeGen/SystemZ/int-cmp-40.ll @@ -15,8 +15,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -54,8 +54,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp eq i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp ne i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -74,8 +74,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp ne i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp eq i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -95,8 +95,8 @@ entry: %val = load i16 , i16 *@h, align 1 %src2 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -115,8 +115,8 @@ entry: %val = load i16 , i16 *@g %src1 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: 
%mul = mul i64 %src2, %src2 br label %exit Index: test/CodeGen/SystemZ/int-cmp-44.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-44.ll +++ test/CodeGen/SystemZ/int-cmp-44.ll @@ -473,8 +473,8 @@ %xor = xor i32 %val, 1 %add = add i32 %xor, 1000000 call void @foo() - %cmp = icmp ne i32 %add, 0 - br i1 %cmp, label %exit, label %store + %cmp = icmp eq i32 %add, 0 + br i1 %cmp, label %store, label %exit, !prof !1 store: store i32 %add, i32 *%ptr @@ -888,3 +888,5 @@ exit: ret i64 %res } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/SystemZ/int-cmp-48.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-48.ll +++ test/CodeGen/SystemZ/int-cmp-48.ll @@ -52,7 +52,7 @@ define double @f3(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f3: ; CHECK: tm 0(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 1 @@ -80,7 +80,7 @@ define double @f5(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f5: ; CHECK: tm 0(%r2), 1 -; CHECK: jne {{\.L.*}} +; CHECK: je {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 1 @@ -93,7 +93,7 @@ define double @f6(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f6: ; CHECK: tm 0(%r2), 254 -; CHECK: jo {{\.L.*}} +; CHECK: jno {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 254 @@ -106,7 +106,7 @@ define double @f7(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f7: ; CHECK: tm 0(%r2), 254 -; CHECK: jno {{\.L.*}} +; CHECK: jo {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 254 @@ -121,7 +121,7 @@ ; CHECK-LABEL: f8: ; CHECK: llc [[REG:%r[0-5]]], 0(%r2) ; CHECK: tmll [[REG]], 3 -; CHECK: jh {{\.L.*}} +; CHECK: jnh {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 3 @@ -135,7 +135,7 @@ ; CHECK-LABEL: f9: ; CHECK: llc [[REG:%r[0-5]]], 0(%r2) ; CHECK: tmll [[REG]], 3 
-; CHECK: jl {{\.L.*}} +; CHECK: jnl {{\.L.*}} ; CHECK: br %r14 %byte = load i8 , i8 *%src %and = and i8 %byte, 3 @@ -148,7 +148,7 @@ define double @f10(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f10: ; CHECK: tm 4095(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 4095 %byte = load i8 , i8 *%ptr @@ -162,7 +162,7 @@ define double @f11(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f11: ; CHECK: tmy 4096(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 4096 %byte = load i8 , i8 *%ptr @@ -176,7 +176,7 @@ define double @f12(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f12: ; CHECK: tmy 524287(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 524287 %byte = load i8 , i8 *%ptr @@ -191,7 +191,7 @@ ; CHECK-LABEL: f13: ; CHECK: agfi %r2, 524288 ; CHECK: tm 0(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 524288 %byte = load i8 , i8 *%ptr @@ -205,7 +205,7 @@ define double @f14(i8 *%src, double %a, double %b) { ; CHECK-LABEL: f14: ; CHECK: tmy -524288(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 -524288 %byte = load i8 , i8 *%ptr @@ -220,7 +220,7 @@ ; CHECK-LABEL: f15: ; CHECK: agfi %r2, -524289 ; CHECK: tm 0(%r2), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 -524289 %byte = load i8 , i8 *%ptr @@ -234,7 +234,7 @@ define double @f16(i8 *%src, i64 %index, double %a, double %b) { ; CHECK-LABEL: f16: ; CHECK: tm 0({{%r[1-5]}}), 1 -; CHECK: je {{\.L.*}} +; CHECK: jne {{\.L.*}} ; CHECK: br %r14 %ptr = getelementptr i8, i8 *%src, i64 %index %byte = load i8 , i8 *%ptr Index: test/CodeGen/SystemZ/tdc-06.ll =================================================================== --- 
test/CodeGen/SystemZ/tdc-06.ll +++ test/CodeGen/SystemZ/tdc-06.ll @@ -26,25 +26,27 @@ nonzeroord: ; CHECK: lhi %r2, 2 ; CHECK: tcdb %f0, 48 -; CHECK: jl [[RET]] +; CHECK: je [[FINITE:.]] %abs = tail call double @llvm.fabs.f64(double %x) %testinf = fcmp oeq double %abs, 0x7FF0000000000000 br i1 %testinf, label %ret, label %finite, !prof !1 +ret: +; CHECK: [[RET]]: +; CHECK: br %r14 + %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] + ret i32 %res + finite: ; CHECK: lhi %r2, 3 ; CHECK: tcdb %f0, 831 ; CHECK: blr %r14 ; CHECK: lhi %r2, 4 +; CHECK: br %r14 %testnormal = fcmp uge double %abs, 0x10000000000000 %finres = select i1 %testnormal, i32 3, i32 4 br label %ret -ret: -; CHECK: [[RET]]: -; CHECK: br %r14 - %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] - ret i32 %res } !1 = !{!"branch_weights", i32 1, i32 1} Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 
-mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. +; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. ; CHECK-LABEL: foo: Index: test/CodeGen/Thumb2/cbnz.ll =================================================================== --- test/CodeGen/Thumb2/cbnz.ll +++ test/CodeGen/Thumb2/cbnz.ll @@ -26,7 +26,7 @@ call void @x() call void @x() call void @x() - ; CHECK: cbnz + ; CHECK: cbz %q = icmp eq i32 %y, 0 br i1 %q, label %t2, label %f Index: test/CodeGen/Thumb2/ifcvt-compare.ll =================================================================== --- test/CodeGen/Thumb2/ifcvt-compare.ll +++ test/CodeGen/Thumb2/ifcvt-compare.ll @@ -4,7 +4,7 @@ define void @f0(i32 %x) optsize { ; CHECK-LABEL: f0: - ; CHECK: cbnz + ; CHECK: cbz %p = icmp eq i32 %x, 0 br i1 %p, label %t, label %f Index: test/CodeGen/Thumb2/v8_IT_4.ll =================================================================== --- test/CodeGen/Thumb2/v8_IT_4.ll +++ test/CodeGen/Thumb2/v8_IT_4.ll @@ -12,10 +12,11 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string,std::allocator >"* %this, %"struct.std::basic_string,std::allocator >"* %__str) { ; CHECK-LABEL: _ZNKSs7compareERKSs: -; CHECK: cbnz r0, +; CHECK: 
cbz r0, +; CHECK-NEXT: %bb1 +; CHECK-NEXT: pop.w ; CHECK-NEXT: %bb ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: %bb1 ; CHECK-NEXT: pop.w entry: %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %this) ; [#uses=3] Index: test/CodeGen/WebAssembly/phi.ll =================================================================== --- test/CodeGen/WebAssembly/phi.ll +++ test/CodeGen/WebAssembly/phi.ll @@ -8,8 +8,9 @@ ; Basic phi triangle. ; CHECK-LABEL: test0: -; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}} -; CHECK: return $[[NUM0]]{{$}} +; CHECK: return $0 +; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}} +; CHECK: return $pop[[NUM0]]{{$}} define i32 @test0(i32 %p) { entry: %t = icmp slt i32 %p, 0 Index: test/CodeGen/X86/2008-11-29-ULT-Sign.ll =================================================================== --- test/CodeGen/X86/2008-11-29-ULT-Sign.ll +++ test/CodeGen/X86/2008-11-29-ULT-Sign.ll @@ -4,8 +4,8 @@ define i32 @a(i32 %x) nounwind { entry: - %cmp = icmp ult i32 %x, -2147483648 ; [#uses=1] - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp uge i32 %x, -2147483648 ; [#uses=1] + br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %call = call i32 (...) 
@b() ; [#uses=0] Index: test/CodeGen/X86/add.ll =================================================================== --- test/CodeGen/X86/add.ll +++ test/CodeGen/X86/add.ll @@ -30,7 +30,8 @@ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %sum = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal + %notobit = xor i1 1, %obit + br i1 %notobit, label %normal, label %overflow normal: store i32 0, i32* %X @@ -53,7 +54,8 @@ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %sum = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %carry, label %normal + %notobit = xor i1 1, %obit + br i1 %notobit, label %normal, label %carry normal: store i32 0, i32* %X Index: test/CodeGen/X86/avx-splat.ll =================================================================== --- test/CodeGen/X86/avx-splat.ll +++ test/CodeGen/X86/avx-splat.ll @@ -62,8 +62,10 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ## implicit-def: %YMM0 ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne LBB4_2 -; CHECK-NEXT: ## BB#1: ## %load.i1247 +; CHECK-NEXT: je LBB4_1 +; CHECK-NEXT: ## BB#2: ## %__load_and_broadcast_32.exit1249 +; CHECK-NEXT: retq +; CHECK-NEXT: LBB4_1: ## %load.i1247 ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: andq $-32, %rsp @@ -71,7 +73,6 @@ ; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp -; CHECK-NEXT: LBB4_2: ## %__load_and_broadcast_32.exit1249 ; CHECK-NEXT: retq allocas: %udx495 = alloca [18 x [18 x float]], align 32 Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -69,13 +69,14 @@ ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vucomiss %xmm1, %xmm0 ; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jnp LBB3_2 +; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: ## BB#2: ## %return 
+; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## %if.end ; ALL-NEXT: seta %al ; ALL-NEXT: movzbl %al, %eax ; ALL-NEXT: leaq {{.*}}(%rip), %rcx ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: LBB3_2: ## %return ; ALL-NEXT: retq entry: %cmp = fcmp oeq float %p, 0.000000e+00 Index: test/CodeGen/X86/bt.ll =================================================================== --- test/CodeGen/X86/bt.ll +++ test/CodeGen/X86/bt.ll @@ -49,7 +49,7 @@ %tmp29 = lshr i32 %x, %n %tmp3 = and i32 1, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -89,7 +89,7 @@ %tmp29 = ashr i32 %x, %n %tmp3 = and i32 1, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -109,7 +109,7 @@ %tmp29 = shl i32 1, %n %tmp3 = and i32 %tmp29, %x %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -129,7 +129,7 @@ %tmp29 = shl i32 1, %n %tmp3 = and i32 %x, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -608,3 +608,5 @@ %tobool = icmp ne i64 %and1, 0 ret i1 %tobool } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/X86/critical-edge-split-2.ll =================================================================== --- test/CodeGen/X86/critical-edge-split-2.ll +++ test/CodeGen/X86/critical-edge-split-2.ll @@ -24,6 +24,7 @@ ; CHECK-LABEL: test1: ; CHECK: testb %dil, %dil -; CHECK: jne LBB0_2 +; CHECK: je LBB0_1 +; CHECK: retq +; CHECK: LBB0_1: ; CHECK: divl -; CHECK: LBB0_2: Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- 
test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -36,8 +36,8 @@ entry: %mul = fmul double %x, %y - %cmp = fcmp une double %mul, 0.000000e+00 - br i1 %cmp, label %bb2, label %bb1 + %cmp = fcmp oeq double %mul, 0.000000e+00 + br i1 %cmp, label %bb1, label %bb2 bb1: %add = fadd double %mul, -1.000000e+00 Index: test/CodeGen/X86/jump_sign.ll =================================================================== --- test/CodeGen/X86/jump_sign.ll +++ test/CodeGen/X86/jump_sign.ll @@ -6,7 +6,7 @@ ; CHECK: jns %tmp1 = add i32 %X, 1 ; [#uses=1] %tmp = icmp slt i32 %tmp1, 0 ; [#uses=1] - br i1 %tmp, label %cond_true, label %cond_next + br i1 %tmp, label %cond_true, label %cond_next, !prof !1 cond_true: ; preds = %entry %tmp2 = tail call i32 (...) @bar( ) ; [#uses=0] @@ -303,3 +303,5 @@ if.end: ret i32 undef } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/X86/machine-cse.ll =================================================================== --- test/CodeGen/X86/machine-cse.ll +++ test/CodeGen/X86/machine-cse.ll @@ -86,8 +86,8 @@ ; CHECK-LABEL: cross_mbb_phys_cse: ; CHECK: cmpl ; CHECK: ja - %cmp = icmp ugt i32 %a, %b - br i1 %cmp, label %return, label %if.end + %cmp = icmp ule i32 %a, %b + br i1 %cmp, label %if.end, label %return if.end: ; preds = %entry ; CHECK-NOT: cmpl Index: test/CodeGen/X86/shift-double.ll =================================================================== --- test/CodeGen/X86/shift-double.ll +++ test/CodeGen/X86/shift-double.ll @@ -14,11 +14,13 @@ ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: shldl %cl, %esi, %edx ; CHECK-NEXT: testb $32, %cl -; CHECK-NEXT: je .LBB0_2 -; CHECK-NEXT: # BB#1: +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: ; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl %shift.upgrd.1 = zext i8 %C to i64 ; [#uses=1] @@ -37,12 +39,14 @@ ; CHECK-NEXT: sarl %cl, 
%edx ; CHECK-NEXT: shrdl %cl, %esi, %eax ; CHECK-NEXT: testb $32, %cl -; CHECK-NEXT: je .LBB1_2 -; CHECK-NEXT: # BB#1: +; CHECK-NEXT: jne .LBB1_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB1_1: ; CHECK-NEXT: sarl $31, %esi ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl %shift.upgrd.2 = zext i8 %C to i64 ; [#uses=1] @@ -61,11 +65,13 @@ ; CHECK-NEXT: shrl %cl, %edx ; CHECK-NEXT: shrdl %cl, %esi, %eax ; CHECK-NEXT: testb $32, %cl -; CHECK-NEXT: je .LBB2_2 -; CHECK-NEXT: # BB#1: +; CHECK-NEXT: jne .LBB2_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB2_1: ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl %shift.upgrd.3 = zext i8 %C to i64 ; [#uses=1] Index: test/CodeGen/X86/sink-hoist.ll =================================================================== --- test/CodeGen/X86/sink-hoist.ll +++ test/CodeGen/X86/sink-hoist.ll @@ -26,7 +26,8 @@ ; CHECK-LABEL: split: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je +; CHECK-NEXT: jne +; CHECK: ret ; CHECK: divsd ; CHECK: movapd ; CHECK: ret Index: test/CodeGen/X86/sse-scalar-fp-arith.ll =================================================================== --- test/CodeGen/X86/sse-scalar-fp-arith.ll +++ test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1110,10 +1110,12 @@ ; AVX1-LABEL: add_ss_mask: ; AVX1: # BB#0: ; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB62_2 -; AVX1-NEXT: # BB#1: +; AVX1-NEXT: jne .LBB62_1 +; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB62_1: ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB62_2: ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; AVX1-NEXT: retq ; @@ -1165,10 +1167,12 @@ ; AVX1-LABEL: add_sd_mask: ; AVX1: # BB#0: ; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB63_2 -; 
AVX1-NEXT: # BB#1: +; AVX1-NEXT: jne .LBB63_1 +; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB63_1: ; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB63_2: ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/testb-je-fusion.ll =================================================================== --- test/CodeGen/X86/testb-je-fusion.ll +++ test/CodeGen/X86/testb-je-fusion.ll @@ -9,7 +9,7 @@ entry: %and = and i32 %flags, 512 %tobool = icmp eq i32 %and, 0 - br i1 %tobool, label %if.end, label %if.then + br i1 %tobool, label %if.end, label %if.then, !prof !1 if.then: br label %if.end @@ -18,3 +18,4 @@ %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ] ret i32 %hasflag } +!1 = !{!"branch_weights", i32 1, i32 2}