Index: lib/CodeGen/BranchFolding.h
===================================================================
--- lib/CodeGen/BranchFolding.h
+++ lib/CodeGen/BranchFolding.h
@@ -122,6 +122,7 @@
                        const MachineBasicBlock *MBB) const;
     raw_ostream &printBlockFreq(raw_ostream &OS,
                                 const BlockFrequency Freq) const;
+    uint64_t getEntryFreq() const;
   private:
     const MachineBlockFrequencyInfo &MBFI;
Index: lib/CodeGen/BranchFolding.cpp
===================================================================
--- lib/CodeGen/BranchFolding.cpp
+++ lib/CodeGen/BranchFolding.cpp
@@ -498,6 +498,11 @@
   return MBFI.printBlockFreq(OS, Freq);
 }
 
+uint64_t
+BranchFolder::MBFIWrapper::getEntryFreq() const {
+  return MBFI.getEntryFreq();
+}
+
 /// CountTerminators - Count the number of terminators in the given
 /// block and set I to the position of the first non-terminator, if there
 /// is one, or MBB->end() otherwise.
Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
@@ -49,6 +50,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "block-placement"
@@ -136,13 +139,23 @@
                               cl::init(true), cl::Hidden);
 
 // Heuristic for tail duplication.
-static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+static cl::opt<unsigned> TailDupPlacementThreshold(
     "tail-dup-placement-threshold",
     cl::desc("Instruction cutoff for tail duplication during layout. "
              "Tail merging during layout is forced to have a threshold "
             "that won't conflict."), cl::init(2),
    cl::Hidden);
 
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+    "tail-dup-placement-penalty",
+    cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+             "Copying can increase fallthrough, but it also increases icache "
+             "pressure. This parameter controls the penalty to account for that. "
+             "Percent as integer."),
+    cl::init(2),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
@@ -266,6 +279,12 @@
   /// \brief A typedef for a block filter set.
   typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
 
+  /// Pair struct containing basic block and taildup profitability
+  struct BlockAndTailDupResult {
+    MachineBasicBlock *BB;
+    bool ShouldTailDup;
+  };
+
   /// \brief work lists of blocks that are ready to be laid out
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
@@ -293,9 +312,12 @@
   /// \brief A handle to the target's lowering info.
   const TargetLoweringBase *TLI;
 
-  /// \brief A handle to the post dominator tree.
+  /// \brief A handle to the dominator tree.
   MachineDominatorTree *MDT;
 
+  /// \brief A handle to the post dominator tree.
+  MachinePostDominatorTree *MPDT;
+
   /// \brief Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -368,9 +390,9 @@
                                   BlockChain &SuccChain, BranchProbability SuccProb,
                                   BranchProbability RealSuccProb, BlockChain &Chain,
                                   const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
-                                         BlockChain &Chain,
-                                         const BlockFilterSet *BlockFilter);
+  BlockAndTailDupResult selectBestSuccessor(MachineBasicBlock *BB,
+                                            BlockChain &Chain,
+                                            const BlockFilterSet *BlockFilter);
   MachineBasicBlock *
   selectBestCandidateBlock(BlockChain &Chain,
                            SmallVectorImpl<MachineBasicBlock *> &WorkList);
@@ -403,6 +425,18 @@
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  bool shouldTailDuplicate(MachineBasicBlock *BB);
+  /// Check the edge frequencies to see if tail duplication will increase
+  /// fallthroughs.
+  bool isProfitableToTailDup(
+      MachineBasicBlock *BB, MachineBasicBlock *Succ,
+      BranchProbability AdjustedSumProb,
+      BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Returns true if a block can tail duplicate into all unplaced
+  /// predecessors. Filters based on loop.
+  bool canTailDuplicateUnplacedPreds(
+      MachineBasicBlock *BB, MachineBasicBlock *Succ,
+      BlockChain &Chain, const BlockFilterSet *BlockFilter);
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -416,6 +450,8 @@
     AU.addRequired<MachineBranchProbabilityInfo>();
     AU.addRequired<MachineBlockFrequencyInfo>();
     AU.addRequired<MachineDominatorTree>();
+    if (TailDupPlacement)
+      AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -430,6 +466,7 @@
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
                     "Branch Probability Basic Block Placement", false, false)
@@ -561,6 +598,210 @@
   return SuccProb;
 }
 
+/// Check if a block should be tail duplicated.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+  // Blocks with single successors don't create additional fallthrough
+  // opportunities. Don't duplicate them. TODO: When conditional exits are
+  // analyzable, allow them to be duplicated.
+  bool IsSimple = TailDup.isSimpleBB(BB);
+
+  if (BB->succ_size() == 1)
+    return false;
+  return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare 2 BlockFrequency's with a small penalty for \p A.
+/// In order to be conservative, we apply a X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%.
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+                            uint64_t EntryFreq) {
+  if (B > A)
+    return false;
+  BlockFrequency Gain = A - B;
+  if (Gain.getFrequency() < (uint64_t) 1 << 32) {
+    if (EntryFreq < (uint64_t) 1 << 40) {
+      return (Gain.getFrequency() * 100 > EntryFreq * TailDupPlacementPenalty);
+    }
+    return false;
+  } else {
+    BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+    return (Gain / ThresholdProb).getFrequency() > EntryFreq;
+  }
+}
+
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+    MachineBasicBlock *BB, MachineBasicBlock *Succ,
+    BranchProbability QProb,
+    BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // We need to do a probability calculation to make sure this is profitable.
+  // First: does succ have a successor that post-dominates? This affects the
+  // calculation. The 2 relevant cases are:
+  //    BB         BB
+  //    | \Qout    | \Qout
+  //   P|  C      P|  C
+  //    =   C'     =   C'
+  //    |  /Qin    |  /Qin
+  //    | /        | /
+  //    Succ       Succ
+  //    / \        | \  V
+  //  U/   =V      |U \
+  //  /     \      =   D
+  //  D      E     |  /
+  //               | /
+  //               |/
+  //               PDom
+  //  '=' : Branch taken for that CFG edge
+  // In the second case, placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or PDom, because they now have C as an
+  // unplaced predecessor.
+
+  // Start by figuring out which case we fall into.
+  MachineBasicBlock *PDom = nullptr;
+  SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+  // Only scan the relevant successors.
+  auto AdjustedSuccSumProb =
+      collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+  auto BBFreq = MBFI->getBlockFreq(BB);
+  auto SuccFreq = MBFI->getBlockFreq(Succ);
+  BlockFrequency P = BBFreq * PProb;
+  BlockFrequency Qout = BBFreq * QProb;
+  uint64_t EntryFreq = MBFI->getEntryFreq();
+  // If there are no more successors, it is profitable to copy, as it strictly
+  // increases fallthrough.
+  if (SuccSuccs.size() == 0)
+    return greaterWithBias(P, Qout, EntryFreq);
+
+  auto BestSuccSucc = BranchProbability::getZero();
+  // Find the PDom or the best Succ if no PDom exists.
+  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+    auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+    if (Prob > BestSuccSucc)
+      BestSuccSucc = Prob;
+    if (PDom == nullptr)
+      if (MPDT->dominates(SuccSucc, Succ)) {
+        PDom = SuccSucc;
+        break;
+      }
+  }
+  // For the comparisons, we need to know Succ's best incoming edge that isn't
+  // from BB.
+  auto SuccBestPred = BlockFrequency(0);
+  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+    if (SuccPred == Succ || SuccPred == BB
+        || BlockToChain[SuccPred] == &Chain
+        || (BlockFilter && !BlockFilter->count(SuccPred)))
+      continue;
+    auto Freq = MBFI->getBlockFreq(SuccPred)
+        * MBPI->getEdgeProbability(SuccPred, Succ);
+    if (Freq > SuccBestPred)
+      SuccBestPred = Freq;
+  }
+  // Qin is Succ's best unplaced incoming edge that isn't BB.
+  BlockFrequency Qin = SuccBestPred;
+  // If it doesn't have a post-dominating successor, here is the calculation:
+  //    BB        BB
+  //    | \Qout   |  \
+  //   P|  C      |   =
+  //    =   C'    |   C
+  //    |  /Qin   |   |
+  //    | /       |   C' (+Succ)
+  //    Succ      Succ /|
+  //    / \       |  \/ |
+  //  U/   =V     |  /= =
+  //  /     \     | /  \|
+  //  D      E    D     E
+  //  '=' : Branch taken for that CFG edge
+  // Cost in the first case is: P + V
+  // For this calculation, we always assume P > Qout. If Qout > P,
+  // the result of this function will be ignored at the caller.
+  // Cost in the second case is: Qout + Qin * V + P * U + P * V
+  // TODO(iteratee): If we lay out D after Succ, the P * U term
+  // goes away. This logic is coming in D28522.
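+  // Illustrative worked example (hypothetical numbers, added for exposition;
+  // not part of the original cost model): let BBFreq = 100 with PProb = 0.9,
+  // so P = 90 and Qout = 10; let Qin = 40, SuccFreq = 130, and
+  // UProb = VProb = 0.5. Then BaseCost = P + V = 90 + 65 = 155, while
+  // DupCost = Qout + Qin * VProb + P * UProb + P * VProb
+  //         = 10 + 20 + 45 + 45 = 120, so duplicating Succ into C wins.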
+
+  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+    BranchProbability UProb = BestSuccSucc;
+    BranchProbability VProb = AdjustedSuccSumProb - UProb;
+    BlockFrequency V = SuccFreq * VProb;
+    BlockFrequency QinV = Qin * VProb;
+    BlockFrequency BaseCost = P + V;
+    BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
+    return greaterWithBias(BaseCost, DupCost, EntryFreq);
+  }
+  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+  BranchProbability VProb = AdjustedSuccSumProb - UProb;
+  BlockFrequency U = SuccFreq * UProb;
+  BlockFrequency V = SuccFreq * VProb;
+  // If there is a post-dominating successor, here is the calculation:
+  //    BB         BB             BB         BB
+  //    | \Qout    |  \           | \Qout    |  \
+  //   |P  C       |   =         |P  C       |   =
+  //    =   C'    |P   C          =   C'    |P   C
+  //    |  /Qin    |   |          |  /Qin    |   |
+  //    | /        |   C' (+Succ) | /        |   C' (+Succ)
+  //    Succ       Succ /|        Succ       Succ /|
+  //    | \  V     |  \/ |        | \  V     |  \/ |
+  //    |U \       |U /\ |        |U =       |U /\ |
+  //    =   D      = =  =|        |   D      | =  =|
+  //    |  /       |/    D        |  /       |/    D
+  //    | /        |    /         | =        |    /
+  //    |/         |   /          |/         |   =
+  //    Dom        Dom            Dom        Dom
+  //  '=' : Branch taken for that CFG edge
+  // The cost for taken branches in the first case is P + U
+  // The cost in the second case (assuming independence), given the layout:
+  // BB, Succ, (C+Succ), D, Dom
+  // is Qout + P * V + Qin * U
+  // compare P + U vs Qout + P * V + Qin * U.
+  //
+  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+  //
+  // For the 3rd case, the cost is P + 2 * V
+  // For the 4th case, the cost is Qout + Qin * U + P * V + V
+  // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
+  if (UProb > AdjustedSuccSumProb / 2
+      && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
+                                     UProb, UProb, Chain, BlockFilter)) {
+    // Cases 3 & 4
+    return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
+                           EntryFreq);
+  }
+  // Cases 1 & 2
+  return greaterWithBias(
+      (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
+}
+
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors, that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+  if (!shouldTailDuplicate(Succ))
+    return false;
+
+  for (MachineBasicBlock *Pred : Succ->predecessors()) {
+    // Make sure all unplaced and unfiltered predecessors can be
+    // tail-duplicated into.
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+        || BlockToChain[Pred] == &Chain)
+      continue;
+    if (!TailDup.canTailDuplicate(Succ, Pred))
+      return false;
+  }
+  return true;
+}
+
 /// When the option OutlineOptionalBranches is on, this method
 /// checks if the fallthrough candidate block \p Succ (of block
 /// \p BB) also has other unscheduled predecessor blocks which
@@ -609,11 +850,11 @@
     if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
       /* See case 1 below for the cost analysis. For BB->Succ to
        * be taken with smaller cost, the following needs to hold:
-      *   Prob(BB->Succ) > 2* Prob(BB->Pred)
-      *   So the threshold T
-      *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
-      * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified
-      *   branch bias, we have
+      *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
+      *   So the threshold T in the calculation below
+      *   (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+      *   So T / (1 - T) = 2, yielding T = 2/3
+      * Also adding user specified branch bias, we have
+      *   T = (2/3)*(ProfileLikelyProb/50)
+      *     = (2 * ProfileLikelyProb) / 150
       */
@@ -625,6 +866,12 @@
 /// Checks to see if the layout candidate block \p Succ has a better layout
 /// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only remaining blocks.
+///   Only used for logging.
+/// \p RealSuccProb: The un-adjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
+///   considered.
 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
     MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
     BranchProbability SuccProb, BranchProbability RealSuccProb,
@@ -756,13 +1003,15 @@
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
         (BlockFilter && !BlockFilter->count(Pred)) ||
-        BlockToChain[Pred] == &Chain)
+        BlockToChain[Pred] == &Chain ||
+        // This check is redundant except for lookahead. This function is
+        // called for lookahead by isProfitableToTailDup when BB hasn't been
+        // placed yet.
+        (Pred == BB))
       continue;
     // Do backward checking.
     // For all cases above, we need a backward checking to filter out edges that
-    // are not 'strongly' biased. With profile data available, the check is
-    // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
-    // has more than two successors.
+    // are not 'strongly' biased.
     // BB  Pred
     //  \ /
     //  Succ
@@ -798,14 +1047,15 @@
 /// breaking CFG structure, but cave and break such structures in the case of
 /// very hot successor edges.
 ///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
 MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                                            BlockChain &Chain,
                                            const BlockFilterSet *BlockFilter) {
   const BranchProbability HotProb(StaticLikelyProb, 100);
 
-  MachineBasicBlock *BestSucc = nullptr;
+  BlockAndTailDupResult BestSucc = { nullptr, false };
   auto BestProb = BranchProbability::getZero();
 
   SmallVector<MachineBasicBlock *, 4> Successors;
@@ -813,6 +1063,12 @@
       collectViableSuccessors(BB, Chain, BlockFilter, Successors);
 
   DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+
+  // For blocks with CFG violations, we may be able to lay them out anyway with
+  // tail-duplication. We keep this vector so we can perform the probability
+  // calculations the minimum number of times.
+  SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+      DupCandidates;
   for (MachineBasicBlock *Succ : Successors) {
     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
     BranchProbability SuccProb =
@@ -820,15 +1076,21 @@
     // This heuristic is off by default.
     if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
-                                  HotProb))
-      return Succ;
+                                  HotProb)) {
+      BestSucc.BB = Succ;
+      return BestSucc;
+    }
 
     BlockChain &SuccChain = *BlockToChain[Succ];
     // Skip the edge \c BB->Succ if block \c Succ has a better layout
    // predecessor that yields lower global cost.
     if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
-                                   Chain, BlockFilter))
+                                   Chain, BlockFilter)) {
+      // If tail duplication would make Succ profitable, place it.
+      if (TailDupPlacement && shouldTailDuplicate(Succ))
+        DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
       continue;
+    }
 
     DEBUG(
         dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
@@ -836,17 +1098,52 @@
                << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
                << "\n");
 
-    if (BestSucc && BestProb >= SuccProb) {
+    if (BestSucc.BB && BestProb >= SuccProb) {
       DEBUG(dbgs() << "    Not the best candidate, continuing\n");
       continue;
     }
 
     DEBUG(dbgs() << "    Setting it as best candidate\n");
-    BestSucc = Succ;
+    BestSucc.BB = Succ;
     BestProb = SuccProb;
   }
-  if (BestSucc)
-    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc) << "\n");
+
+  // Handle the tail duplication candidates in order of decreasing probability.
+  // Stop at the first one that is profitable. Also stop if they are less
+  // profitable than BestSucc. Position is important because we preserve it and
+  // prefer first best match. Here we aren't comparing in order, so we capture
+  // the position instead.
+  if (DupCandidates.size() != 0) {
+    auto cmp =
+        [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
+           const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
+          return std::get<0>(a) > std::get<0>(b);
+        };
+    std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
+  }
+  for (auto &Tup : DupCandidates) {
+    BranchProbability DupProb;
+    MachineBasicBlock *Succ;
+    std::tie(DupProb, Succ) = Tup;
+    if (DupProb < BestProb)
+      break;
+    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+        // If tail duplication gives us fallthrough when we otherwise wouldn't
+        // have it, that is a strict gain.
+        && (BestSucc.BB == nullptr
+            || isProfitableToTailDup(BB, Succ, BestProb, Chain,
+                                     BlockFilter))) {
+      DEBUG(
+          dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
+                 << DupProb
+                 << " (Tail Duplicate)\n");
+      BestSucc.BB = Succ;
+      BestSucc.ShouldTailDup = true;
+      break;
+    }
+  }
+
+  if (BestSucc.BB)
+    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc.BB) << "\n");
 
   return BestSucc;
 }
@@ -995,7 +1292,11 @@
 
     // Look for the best viable successor if there is one to place immediately
     // after this block.
-    MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+    auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+    MachineBasicBlock *BestSucc = Result.BB;
+    bool ShouldTailDup = Result.ShouldTailDup;
+    if (TailDupPlacement)
+      ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
 
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
@@ -1016,7 +1317,7 @@
 
     // Placement may have changed tail duplication opportunities.
     // Check for that now.
-    if (TailDupPlacement && BestSucc) {
+    if (TailDupPlacement && BestSucc && ShouldTailDup) {
       // If the chosen successor was duplicated into all its predecessors,
       // don't bother laying it out, just go round the loop again with BB as
       // the chain end.
@@ -1908,13 +2209,8 @@
   DuplicatedToLPred = false;
   DEBUG(dbgs() << "Redoing tail duplication for Succ#"
                << BB->getNumber() << "\n");
-  bool IsSimple = TailDup.isSimpleBB(BB);
-  // Blocks with single successors don't create additional fallthrough
-  // opportunities. Don't duplicate them. TODO: When conditional exits are
-  // analyzable, allow them to be duplicated.
-  if (!IsSimple && BB->succ_size() == 1)
-    return false;
-  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
+
+  if (!shouldTailDuplicate(BB))
     return false;
   // This has to be a callback because none of it can be done after
   // BB is deleted.
@@ -1967,6 +2263,7 @@
       llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  bool IsSimple = TailDup.isSimpleBB(BB);
   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
                                  &DuplicatedPreds, &RemovalCallbackRef);
 
@@ -2007,13 +2304,15 @@
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MDT = &getAnalysis<MachineDominatorTree>();
+  MPDT = nullptr;
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;
 
   if (TailDupPlacement) {
-    unsigned TailDupSize = TailDuplicatePlacementThreshold;
+    MPDT = &getAnalysis<MachinePostDominatorTree>();
+    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2032,7 +2331,7 @@
                             BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
@@ -2043,6 +2342,8 @@
       BlockToChain.clear();
       // Must redo the dominator tree if blocks were changed.
      MDT->runOnMachineFunction(MF);
+      if (MPDT)
+        MPDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }
Index: test/CodeGen/AArch64/arm64-atomic.ll
===================================================================
--- test/CodeGen/AArch64/arm64-atomic.ll
+++ test/CodeGen/AArch64/arm64-atomic.ll
@@ -9,10 +9,10 @@
 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -27,10 +27,12 @@
 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: mov x0, x[[ADDR]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: mov x0, x[[ADDR]]
+; CHECK-NEXT: ret
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -41,15 +43,15 @@
 ; CHECK-LABEL: val_compare_and_swap_rel:
 ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
-; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]
+; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp [[RESULT]], w1
 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
-; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
+; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -64,10 +66,10 @@
 ; CHECK-NEXT: b.ne 
[[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -346,19 +346,15 @@ ; CHECK-NEXT: sub w1, w1, #1 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]] ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]] -; DISABLE-NEXT: b [[IFEND_LABEL]] -; -; DISABLE: [[ELSE_LABEL]]: ; %if.else -; DISABLE: lsl w0, w1, #1 -; -; CHECK: [[IFEND_LABEL]]: +; CHECK-NEXT: [[IFEND_LABEL]]: ; Epilogue code. ; CHECK: add sp, sp, #16 ; CHECK-NEXT: ret ; -; ENABLE: [[ELSE_LABEL]]: ; %if.else -; ENABLE-NEXT: lsl w0, w1, #1 -; ENABLE_NEXT: ret +; CHECK: [[ELSE_LABEL]]: ; %if.else +; CHECK-NEXT: lsl w0, w1, #1 +; DISABLE-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 { entry: %ap = alloca i8*, align 8 Index: test/CodeGen/AArch64/tail-dup-repeat-worklist.ll =================================================================== --- test/CodeGen/AArch64/tail-dup-repeat-worklist.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - -%struct.s1 = type { %struct.s3*, %struct.s1* } -%struct.s2 = type opaque -%struct.s3 = type { i32 } - -; Function Attrs: nounwind -define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 { -entry: - br label %while.cond.outer - -; The loop gets laid out: -; %while.cond.outer -; %(null) -; %(null) -; %dup2 -; and then %dup1 gets chosen as the next block. -; when dup2 is duplicated into dup1, %worklist could erroneously be placed on -; the worklist, because all of its current predecessors are now scheduled. 
-; However, after dup2 is tail-duplicated, %worklist can't be on the worklist -; because it now has unscheduled predecessors.q -; CHECK-LABEL: repeated_dup_worklist -; CHECK: // %entry -; CHECK: // %while.cond.outer -; first %(null) block -; CHECK: // in Loop: -; CHECK: ldr -; CHECK-NEXT: tbnz -; second %(null) block -; CHECK: // in Loop: -; CHECK: // %dup2 -; CHECK: // %worklist -; CHECK: // %if.then96.i -while.cond.outer: ; preds = %dup1, %entry - %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ] - %inc77 = add nsw i32 %progress.0.ph, 1 - %cmp = icmp slt i32 %progress.0.ph, %i32_1 - br i1 %cmp, label %dup2, label %dup1 - -dup2: ; preds = %if.then96.i, %worklist, %while.cond.outer - %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ] - %.pr = load %struct.s1*, %struct.s1** %pp1, align 8 - br label %dup1 - -dup1: ; preds = %dup2, %while.cond.outer - %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ] - %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ] - br i1 %i1_1, label %while.cond.outer, label %worklist - -worklist: ; preds = %dup1 - %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0 - %1 = load %struct.s3*, %struct.s3** %snode94, align 8 - %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0 - %3 = load i32, i32* %2, align 4 - %tobool95.i = icmp eq i32 %3, 0 - br i1 %tobool95.i, label %if.then96.i, label %dup2 - -if.then96.i: ; preds = %worklist - call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1 - br label %dup2 -} - -; Function Attrs: nounwind -declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } Index: test/CodeGen/AArch64/tbz-tbnz.ll =================================================================== --- test/CodeGen/AArch64/tbz-tbnz.ll +++ test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -178,7 +178,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -247,7 
+247,7 @@ ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -335,6 +335,12 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm define void @expand_requires_expand(i32 %cond0) #0 { bb0: @@ -356,6 +362,12 @@ br label %bb3 bb3: +; These NOPs prevent tail-duplication-based outlining +; from firing, which defeats the need to expand the branches and this test. + call void asm sideeffect + "v_nop_e64", ""() #0 + call void asm sideeffect + "v_nop_e64", ""() #0 ret void } @@ -385,6 +397,7 @@ ; GCN-NEXT: [[ENDIF]]: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: @@ -402,6 +415,9 @@ br label %endif endif: + ; layout can remove the split branch if it can copy the return block. + ; This call makes the return block long enough that it doesn't get copied. + call void @llvm.amdgcn.s.sleep(i32 5); ret void } Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -252,10 +252,12 @@ ; GCN: s_cmp_lt_i32 [[COND]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}} -; GCN: s_cbranch_vccnz [[EXIT]] -; GCN: buffer_store +; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm +; GCN: {{^}}[[BODY]]: +; GCN: buffer_store +; GCN: s_endpgm define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -302,9 +304,10 @@ ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] -; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]] ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] +; GCN: s_endpgm +; GCN: {{^}}[[IF_UNIFORM_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { @@ -328,14 +331,13 @@ ; GCN-LABEL: {{^}}divergent_inside_uniform: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: [[IF_LABEL]]: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -; GCN: [[ENDIF_LABEL]]: -; GCN: s_endpgm define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: %u_cmp = icmp eq i32 %cond, 0 @@ -363,11 +365,11 @@ ; GCN: buffer_store_dword [[ONE]] ; GCN: s_or_b64 exec, exec, [[MASK]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: [[IF_UNIFORM]]: ; GCN: v_mov_b32_e32 
[[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] -; GCN: [[EXIT]]: -; GCN: s_endpgm define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 Index: test/CodeGen/ARM/arm-and-tst-peephole.ll =================================================================== --- test/CodeGen/ARM/arm-and-tst-peephole.ll +++ test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -49,9 +49,9 @@ ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: bne -; V8-NEXT: b -; The trailing space in the last line checks that the branch is unconditional +; V8-NEXT: beq +; V8-NEXT: %sw.epilog +; V8-NEXT: bx lr switch i32 %and, label %sw.epilog [ i32 1, label %sw.bb i32 3, label %sw.bb6 Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -320,10 +320,10 @@ ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr Index: test/CodeGen/ARM/atomic-ops-v8.ll =================================================================== --- test/CodeGen/ARM/atomic-ops-v8.ll +++ test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr Index: test/CodeGen/ARM/cmpxchg-weak.ll =================================================================== --- test/CodeGen/ARM/cmpxchg-weak.ll +++ test/CodeGen/ARM/cmpxchg-weak.ll @@ -13,14 +13,16 @@ ; CHECK-NEXT: dmb ish ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-NEXT: cmp [[SUCCESS]], #0 -; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: BB#2: -; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: [[LDFAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: bx lr +; CHECK-NEXT: [[SUCCESSBB]]: +; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr Index: test/CodeGen/Mips/brconnez.ll =================================================================== --- test/CodeGen/Mips/brconnez.ll +++ test/CodeGen/Mips/brconnez.ll @@ -7,7 +7,7 @@ entry: %0 = load i32, i32* @j, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 ; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] ; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) @@ -21,4 +21,4 @@ ret void } - +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/Mips/micromips-compact-branches.ll =================================================================== --- test/CodeGen/Mips/micromips-compact-branches.ll +++ test/CodeGen/Mips/micromips-compact-branches.ll @@ -6,7 +6,7 @@ %x = alloca i32, align 4 %0 = load i32, i32* %x, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 if.then: store i32 10, i32* %x, align 4 @@ -17,3 +17,4 @@ } ; CHECK: bnezc +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/PowerPC/misched-inorder-latency.ll =================================================================== --- test/CodeGen/PowerPC/misched-inorder-latency.ll +++ test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -17,7 +17,7 @@ %sum1 = add i32 %sumin, 1 %val1 = load i32, i32* %ptr %p = icmp eq i32 %sumin, 0 - br i1 %p, label %true, label %end + br i1 %p, label %true, label %end, !prof !1 true: %sum2 = add i32 %sum1, 1 %ptr2 = getelementptr i32, i32* %ptr, i32 1 @@ -53,3 +53,5 @@ ret i32 %valmerge } declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind + +!1 = !{!"branch_weights", 
i32 2, i32 1} Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll @@ -0,0 +1,140 @@ +; RUN: llc -O2 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The code for tail-duplication during layout will produce the layout: +; test1 +; test2 +; body1 (with copy of test2) +; body2 +; exit + +;CHECK-LABEL: tail_dup_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[BODY1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[BODY2LABEL]] +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} + +; The branch weights here hint that we shouldn't tail duplicate in this case. +;CHECK-LABEL: tail_dup_dont_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body1 +;CHECK: [[TEST2LABEL]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body2 +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_dont_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp ne i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +; This function arranges for the successors of %succ to have already been laid +; out. When we consider whether to lay out succ after bb and to tail-duplicate +; it, v and ret have already been placed, so we tail-duplicate as it removes a +; branch and strictly increases fallthrough +; CHECK-LABEL: tail_dup_no_succ +; CHECK: # %entry +; CHECK: # %v +; CHECK: # %ret +; CHECK: # %bb +; CHECK: # %succ +; CHECK: # %c +; CHECK: bl c +; CHECK: rlwinm. 
{{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: beq
+; CHECK: b
+define void @tail_dup_no_succ(i32 %tag) {
+entry:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %v, label %bb, !prof !2 ; %v very much more likely
+bb:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %succ, label %c, !prof !3 ; %succ more likely
+c:
+  call void @c()
+  call void @c()
+  br label %succ
+succ:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %ret, label %v, !prof !1 ; %ret more likely
+v:
+  call void @d()
+  call void @d()
+  br label %ret
+ret:
+  ret void
+}
+
+
+!1 = !{!"branch_weights", i32 5, i32 3}
+!2 = !{!"branch_weights", i32 95, i32 5}
+!3 = !{!"branch_weights", i32 7, i32 3}
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-layout.ll
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -19,35 +19,35 @@
 ; The CHECK statements check for the whole string of tests and exit block,
 ; and then check that the correct test has been duplicated into the end of
 ; the optional blocks and that the optional blocks are in the correct order.
-;CHECK-LABEL: f:
+;CHECK-LABEL: straight_test:
 ; test1 may have been merged with entry
 ;CHECK: mr [[TAGREG:[0-9]+]], 3
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
 ;CHECK: blr
-;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK-NEXT: .[[OPT1LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[TEST3LABEL]]
-;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: beq 0, [[TEST4LABEL]]
-;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]
+;CHECK: b .[[EXITLABEL]]
 
-define void @f(i32 %tag) {
+define void @straight_test(i32 %tag) {
 entry:
   br label %test1
 test1:
@@ -94,7 +94,57 @@
   ret void
 }
 
+; The block then2 is not unavoidable, but since it can be tail-duplicated, it
+; should be placed as a fallthrough from test2 and copied.
+; CHECK-LABEL: avoidable_test:
+; CHECK: # %entry
+; CHECK: andi.
+; CHECK: # %test2
+; Make sure then2 falls through from test2
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %then2
+; CHECK: rlwinm. 
{{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %end2 +; CHECK: # %else1 +; CHECK: bl a +; CHECK: bl a +; Make sure then2 was copied into else1 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK: # %else2 +; CHECK: bl c +define void @avoidable_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely +else1: + call void @a() + call void @a() + br label %then2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely +then2: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely +else2: + call void @c() + br label %end2 +end2: + ret void +end1: + call void @d() + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! %entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/SystemZ/int-cmp-44.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-44.ll +++ test/CodeGen/SystemZ/int-cmp-44.ll @@ -473,8 +473,8 @@ %xor = xor i32 %val, 1 %add = add i32 %xor, 1000000 call void @foo() - %cmp = icmp ne i32 %add, 0 - br i1 %cmp, label %exit, label %store + %cmp = icmp eq i32 %add, 0 + br i1 %cmp, label %store, label %exit, !prof !1 store: store i32 %add, i32 *%ptr @@ -888,3 +888,5 @@ exit: ret i64 %res } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 
-ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. +; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. ; CHECK-LABEL: foo: Index: test/CodeGen/Thumb2/cbnz.ll =================================================================== --- test/CodeGen/Thumb2/cbnz.ll +++ test/CodeGen/Thumb2/cbnz.ll @@ -26,7 +26,7 @@ call void @x() call void @x() call void @x() - ; CHECK: cbnz + ; CHECK: cbz %q = icmp eq i32 %y, 0 br i1 %q, label %t2, label %f Index: test/CodeGen/Thumb2/ifcvt-compare.ll =================================================================== --- test/CodeGen/Thumb2/ifcvt-compare.ll +++ test/CodeGen/Thumb2/ifcvt-compare.ll @@ -4,7 +4,7 @@ define void @f0(i32 %x) optsize { ; CHECK-LABEL: f0: - ; CHECK: cbnz + ; CHECK: cbz %p = icmp eq i32 %x, 0 br i1 %p, label %t, label %f Index: test/CodeGen/Thumb2/v8_IT_4.ll =================================================================== --- test/CodeGen/Thumb2/v8_IT_4.ll +++ test/CodeGen/Thumb2/v8_IT_4.ll @@ -12,10 +12,11 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string,std::allocator >"* %this, %"struct.std::basic_string,std::allocator >"* %__str) { ; CHECK-LABEL: _ZNKSs7compareERKSs: -; CHECK: cbnz r0, +; CHECK: cbz r0, +; CHECK-NEXT: %bb1 +; CHECK-NEXT: pop.w ; CHECK-NEXT: %bb ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: %bb1 ; CHECK-NEXT: pop.w entry: %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %this) ; [#uses=3] Index: test/CodeGen/WebAssembly/phi.ll =================================================================== --- test/CodeGen/WebAssembly/phi.ll +++ test/CodeGen/WebAssembly/phi.ll @@ -8,8 +8,9 @@ ; Basic phi triangle. 
 ; CHECK-LABEL: test0:
-; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
-; CHECK: return $[[NUM0]]{{$}}
+; CHECK: return $0
+; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
+; CHECK: return $pop[[NUM0]]{{$}}
 define i32 @test0(i32 %p) {
 entry:
   %t = icmp slt i32 %p, 0
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
 ; ALL-NEXT: jne LBB3_1
-; ALL-NEXT: jnp LBB3_2
+; ALL-NEXT: jp LBB3_1
+; ALL-NEXT: ## BB#2: ## %return
+; ALL-NEXT: retq
 ; ALL-NEXT: LBB3_1: ## %if.end
 ; ALL-NEXT: seta %al
 ; ALL-NEXT: movzbl %al, %eax
 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: LBB3_2: ## %return
 ; ALL-NEXT: retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
Index: test/CodeGen/X86/bt.ll
===================================================================
--- test/CodeGen/X86/bt.ll
+++ test/CodeGen/X86/bt.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
 ; PR3253
@@ -24,7 +24,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB0_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -44,12 +49,17 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB1_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB1_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1

 bb:
   call void @foo()
@@ -64,7 +74,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB2_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -84,12 +99,17 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB3_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB3_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1

 bb:
   call void @foo()
@@ -104,12 +124,17 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB4_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB4_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1

 bb:
   call void @foo()
@@ -124,12 +149,17 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB5_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB5_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1

 bb:
   call void @foo()
@@ -144,7 +174,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB6_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -164,7 +199,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB7_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -184,7 +224,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB8_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -204,7 +249,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB9_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -224,7 +274,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB10_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
@@ -244,7 +299,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB11_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
@@ -264,7 +324,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB12_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -284,7 +349,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB13_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -304,7 +374,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB14_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -324,7 +399,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB15_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -344,7 +424,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB16_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
@@ -364,7 +449,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB17_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
@@ -384,7 +474,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB18_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
@@ -404,7 +499,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jae .LBB19_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
@@ -424,7 +524,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB20_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -444,7 +549,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB21_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -464,7 +574,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB22_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 %tmp29, 1
@@ -484,7 +599,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB23_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
@@ -504,7 +624,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB24_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
@@ -524,7 +649,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB25_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
@@ -544,7 +674,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB26_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
@@ -564,7 +699,12 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: jb .LBB27_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
 entry:
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
@@ -588,7 +728,6 @@
 ; CHECK-NEXT: btl %esi, %edi
 ; CHECK-NEXT: setb %al
 ; CHECK-NEXT: retq
-;
   %neg = xor i32 %flags, -1
   %shl = shl i32 1, %flag
   %and = and i32 %shl, %neg
@@ -598,8 +737,10 @@

 define zeroext i1 @extend(i32 %bit, i64 %bits) {
 ; CHECK-LABEL: extend:
-; CHECK: # BB#0:
-; CHECK-NEXT: btl %edi, %esi
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %edi, %esi
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
 entry:
   %and = and i32 %bit, 31
   %sh_prom = zext i32 %and to i64
@@ -608,3 +749,5 @@
   %tobool = icmp ne i64 %and1, 0
   ret i1 %tobool
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/X86/fp-une-cmp.ll
===================================================================
--- test/CodeGen/X86/fp-une-cmp.ll
+++ test/CodeGen/X86/fp-une-cmp.ll
@@ -36,8 +36,8 @@
 entry:
   %mul = fmul double %x, %y
-  %cmp = fcmp une double %mul, 0.000000e+00
-  br i1 %cmp, label %bb2, label %bb1
+  %cmp = fcmp oeq double %mul, 0.000000e+00
+  br i1 %cmp, label %bb1, label %bb2

 bb1:
   %add = fadd double %mul, -1.000000e+00
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -6,7 +6,7 @@
 ; CHECK: jns
   %tmp1 = add i32 %X, 1 ; <i32> [#uses=1]
   %tmp = icmp slt i32 %tmp1, 0 ; <i1> [#uses=1]
-  br i1 %tmp, label %cond_true, label %cond_next
+  br i1 %tmp, label %cond_true, label %cond_next, !prof !1

 cond_true: ; preds = %entry
   %tmp2 = tail call i32 (...) @bar( ) ; <i32> [#uses=0]
@@ -303,3 +303,5 @@
 if.end:
   ret i32 undef
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/X86/testb-je-fusion.ll
===================================================================
--- test/CodeGen/X86/testb-je-fusion.ll
+++ test/CodeGen/X86/testb-je-fusion.ll
@@ -9,7 +9,7 @@
 entry:
   %and = and i32 %flags, 512
   %tobool = icmp eq i32 %and, 0
-  br i1 %tobool, label %if.end, label %if.then
+  br i1 %tobool, label %if.end, label %if.then, !prof !1

 if.then:
   br label %if.end
@@ -18,3 +18,4 @@
   %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ]
   ret i32 %hasflag
 }
+!1 = !{!"branch_weights", i32 1, i32 2}