Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -15,6 +15,7 @@ #ifndef LLVM_CODEGEN_TAILDUPLICATOR_H #define LLVM_CODEGEN_TAILDUPLICATOR_H +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -31,9 +32,11 @@ const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const MachineBranchProbabilityInfo *MBPI; + MachineLoopInfo *MLI; const MachineModuleInfo *MMI; MachineRegisterInfo *MRI; bool PreRegAlloc; + bool LayoutMode; unsigned TailDupSize; // A list of virtual registers for which to update SSA form. @@ -47,10 +50,12 @@ public: /// Prepare to run on a specific machine function. + /// @param LayoutMode - When true, don't use the existing layout to make + /// decisions. /// @param TailDupSize - Maxmimum size of blocks to tail-duplicate. void initMF(MachineFunction &MF, const MachineModuleInfo *MMI, const MachineBranchProbabilityInfo *MBPI, - unsigned TailDupSize = 0); + MachineLoopInfo *MLI, bool LayoutMode, unsigned TailDupSize = 0); bool tailDuplicateBlocks(MachineFunction &MF); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, @@ -58,7 +63,9 @@ /// Returns true if TailBB can successfully be duplicated into PredBB bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB); bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB); + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred, + llvm::function_ref *RemovalCallback = nullptr); private: typedef TargetInstrInfo::RegSubRegPair RegSubRegPair; @@ -84,13 +91,15 @@ SmallVectorImpl &Copies); bool tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies); void appendCopies(MachineBasicBlock *MBB, SmallVectorImpl> &CopyInfos, SmallVectorImpl &Copies); - void removeDeadBlock(MachineBasicBlock *MBB); + void removeDeadBlock(MachineBasicBlock *MBB, + llvm::function_ref *RemovalCallback = nullptr); }; } // End llvm namespace Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -701,8 +701,19 @@ CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) { for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) { unsigned CommonTailLen; + unsigned MinCommonTailLengthInside = MinCommonTailLength; + // Prefer to leave indirect branches duplicated after placement. + if (AfterBlockPlacement) { + MachineBasicBlock &Block1 = *CurMPIter->getBlock(); + if (Block1.empty()) + // On some platforms return instructions are marked as indirect + // branches and not as return instructions. Work around this by + // checking for return via the CFG. + if (Block1.back().isIndirectBranch() && Block1.succ_size() != 0) + MinCommonTailLengthInside = 21; + } if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(), - MinCommonTailLength, + MinCommonTailLengthInside, CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, FuncletMembership, Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -121,6 +122,12 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt BranchFoldPlacement("branch-fold-placement", @@ -128,6 +135,12 @@ "Reduces code size."), cl::init(true), cl::Hidden); +// Heuristic for tail duplication. +static cl::opt TailDuplicateMergeThresholdPlacement( + "tail-dup-merge-threshold-placement", + cl::desc("Instruction cutoff to consider tail duplicating vs merging"), cl::init(2), + cl::Hidden); + extern cl::opt StaticLikelyProb; extern cl::opt ProfileLikelyProb; @@ -185,6 +198,16 @@ /// \brief End of blocks within the chain. iterator end() { return Blocks.end(); } + bool remove(MachineBasicBlock* BB) { + for(iterator i = begin(); i != end(); ++i) { + if (*i == BB) { + Blocks.erase(i); + return true; + } + } + return false; + } + /// \brief Merge a block chain into this one. /// /// This routine merges a block chain into this one. It takes care of forming @@ -221,6 +244,16 @@ for (MachineBasicBlock *MBB : *this) MBB->dump(); } + + void dump_brief() { + iterator i = begin(); + dbgs() << "["; + if (i != end()) + dbgs() << "#" << (*i)->getNumber(); + for (++i; i != end(); ++i) + dbgs() << ", #" << (*i)->getNumber(); + dbgs() << "]\n"; + } #endif // NDEBUG /// \brief Count of predecessors of any block within the chain which have not @@ -266,6 +299,13 @@ /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. + /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet UnavoidableBlocks; @@ -287,8 +327,18 @@ /// between basic blocks. DenseMap BlockToChain; + /// Decrease the UnscheduledPredecessors count for all blocks in chain, and + /// if the count goes to 0, add them to the appropriate work list. void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter = nullptr); + + /// Decrease the UnscheduledPredecessors count for a single block, and + /// if the count goes to 0, add them to the appropriate work list. + void markBlockSuccessors( + BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); + + BranchProbability collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, @@ -298,6 +348,11 @@ const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + void maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, + const BlockChain &Chain, + BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &Removed, bool &DuplicatedToPred); bool hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, BranchProbability SuccProb, @@ -323,7 +378,7 @@ SmallPtrSetImpl &UpdatedPreds, const BlockFilterSet *BlockFilter); void buildChain(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter = nullptr); + BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineLoop &L, @@ -388,37 +443,49 @@ /// When a chain is being merged into the "placed" chain, this routine will /// quickly walk the successors of each block in the chain and mark them as /// having one fewer active predecessor. It also adds any successors of this -/// chain which reach the zero-predecessor state to the worklist passed in. +/// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. for (MachineBasicBlock *MBB : Chain) { - // Add any successors for which this is the only un-placed in-loop - // predecessor to the worklist as a viable candidate for CFG-neutral - // placement. No subsequent placement of this block will violate the CFG - // shape, so we get to use heuristics to choose a favorable placement. - for (MachineBasicBlock *Succ : MBB->successors()) { - if (BlockFilter && !BlockFilter->count(Succ)) - continue; - BlockChain &SuccChain = *BlockToChain[Succ]; - // Disregard edges within a fixed chain, or edges to the loop header. - if (&Chain == &SuccChain || Succ == LoopHeaderBB) - continue; + markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter); + } +} - // This is a cross-chain edge that is within the loop, so decrement the - // loop predecessor count of the destination chain. - if (SuccChain.UnscheduledPredecessors == 0 || - --SuccChain.UnscheduledPredecessors > 0) - continue; +/// \brief Mark a single block's successors as having one fewer preds. +/// +/// Under normal circumstances, this is only called by markChainSuccessors, +/// but if a block that was to be placed is completely tail-duplicated away, +/// and was duplicated into the chain end, we need to redo markBlockSuccessors +/// for just that block. +void MachineBlockPlacement::markBlockSuccessors( + BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter) { + // Add any successors for which this is the only un-placed in-loop + // predecessor to the worklist as a viable candidate for CFG-neutral + // placement. No subsequent placement of this block will violate the CFG + // shape, so we get to use heuristics to choose a favorable placement. + for (MachineBasicBlock *Succ : MBB->successors()) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain &SuccChain = *BlockToChain[Succ]; + // Disregard edges within a fixed chain, or edges to the loop header. + if (&Chain == &SuccChain || Succ == LoopHeaderBB) + continue; - auto *MBB = *SuccChain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); - else - BlockWorkList.push_back(MBB); - } + // This is a cross-chain edge that is within the loop, so decrement the + // loop predecessor count of the destination chain. + if (SuccChain.UnscheduledPredecessors == 0 || + --SuccChain.UnscheduledPredecessors > 0) + continue; + + auto *MBB = *SuccChain.begin(); + if (MBB->isEHPad()) + EHPadWorkList.push_back(MBB); + else + BlockWorkList.push_back(MBB); } } @@ -902,7 +969,7 @@ void MachineBlockPlacement::buildChain( MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter) { + BlockFilterSet *BlockFilter) { assert(BB && "BB must not be null.\n"); assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); @@ -937,6 +1004,46 @@ "layout successor until the CFG reduces\n"); } + // Placement may have changed tail duplication opportunities. + // Check for that now. + if (TailDupPlacement && BestSucc) { + + bool Removed, DuplicatedToBB; + maybeTailDuplicateBlock(BestSucc, BB, Chain, BlockFilter, + PrevUnplacedBlockIt, + Removed, DuplicatedToBB); + if (Removed) { + if (DuplicatedToBB) { + // Do 2 things: If we duplicated into BB, we need to update + // unscheduled predecessors. Then try to duplicate again. + while (DuplicatedToBB) { + MachineBasicBlock *DupBB, *DupPred; + BlockChain::iterator ChainEnd = Chain.end(); + DupBB = *(--ChainEnd); + // If we duplicated into DupPred, we need to update unscheduled + // predecessors. The way to do that is to update just DupPred, and + // not the whole chain. + markBlockSuccessors(Chain, DupBB, LoopHeaderBB, BlockFilter); + // Now try to duplicate again. + if (ChainEnd == Chain.begin()) + break; + DupPred = *(--ChainEnd); + maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter, + PrevUnplacedBlockIt, + Removed, DuplicatedToBB); + // If tail maybeTailDuplicateBlock removes DupBB, it will remove it + // from the chain also, leaving DupPred as the end of the chain. + } + BlockChain::iterator ChainEnd = Chain.end(); + BB = *(--ChainEnd); + } + // If the chosen successor was duplicated into all its predecessors, + // don't bother laying it out, just go round the loop again with BB as + // the chain end. + continue; + } + } + // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; // Zero out UnscheduledPredecessors for the successor we're about to merge in case @@ -945,7 +1052,9 @@ DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " << getBlockName(BestSucc) << "\n"); markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter); + Chain.merge(BestSucc, &SuccChain); + DEBUG(Chain.dump_brief()); BB = *std::prev(Chain.end()); } @@ -1718,6 +1827,107 @@ } } +/// maybeTailDuplicateBlock - Tail duplicate Succ into (some) predecessors if +/// profitable. +/// @param BB - Basic block that may be duplicated +/// @param LPred - Chosen layout predecessor of Succ +/// @param Chain - Chain to which LPred belongs, and BB will belong. +/// @param BlockFilter - Set of blocks that belong to the loop being laid out. +/// Used to identify which blocks to update predecessor +/// counts. +/// @param PrevUnplacedBlockIt - Iterator pointing to the last block that was +/// chosen in the given order due to unnatural CFG +/// only needed if BB is removed and +/// PrevUnplacedBlockIt pointed to BB. +void MachineBlockPlacement::maybeTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *LPred, + const BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &Removed, bool &DuplicatedToBB) { + + Removed = false; + DuplicatedToBB = false; + bool CanDupToChain = false; + DEBUG(dbgs() << "Redoing tail duplication for Succ#" + << BB->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BB); + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + if ((IsSimple || BB->succ_size() != 1) && + TailDup.shouldTailDuplicate(*F, IsSimple, *BB)) { + // Update UnscheduledPredecessors to reflect tail-duplication. + for (MachineBasicBlock *Pred : BB->predecessors()) { + // We're only looking for unscheduled predecessors that match the filter. + BlockChain* PredChain = BlockToChain[Pred]; + if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) + || PredChain == &Chain) + continue; + if (TailDup.canTailDuplicate(BB, Pred)) { + for (MachineBasicBlock *NewSucc : BB->successors()) { + if (BlockFilter && !BlockFilter->count(NewSucc)) + continue; + BlockChain *NewChain = BlockToChain[NewSucc]; + if (NewChain != &Chain && NewChain != PredChain) { + DEBUG(dbgs() << "Bumping unscheduled pred count for chain:\n"); + DEBUG(BlockToChain[NewSucc]->dump_brief()); + DEBUG(dbgs() << "Because of edges: " + << getBlockName(Pred) << " -> " + << getBlockName(BB) << " -> " + << getBlockName(NewSucc) << "\n"); + BlockToChain[NewSucc]->UnscheduledPredecessors++; + } + } + } + } + CanDupToChain = TailDup.canTailDuplicate(BB, LPred); + // This has to be a callback because none of it can be done after + // BB is deleted. + auto RemovalCallback = + [&](MachineBasicBlock *RemBB) { + // Signal to outer function + Removed = true; + + // Remove from the Chain and Chain Map + if (BlockToChain.count(RemBB)) { + BlockChain *Chain = BlockToChain[RemBB]; + Chain->remove(RemBB); + BlockToChain.erase(RemBB); + } + + // Handle the unplaced block iterator + if (&(*PrevUnplacedBlockIt) == RemBB) { + PrevUnplacedBlockIt++; + } + + // Handle the Work Lists + SmallVectorImpl &RemoveList = BlockWorkList; + if (RemBB->isEHPad()) + RemoveList = EHPadWorkList; + for (auto it = RemoveList.begin(); it != RemoveList.end(); ++it) { + if (*it == RemBB) { + RemoveList.erase(it); + break; + } + } + + // Handle the filter set + if (BlockFilter) { + BlockFilter->erase(RemBB); + } + + // TailDuplicator handles removing it from loops. + DEBUG(dbgs() << "TailDuplicator deleted block: " + << getBlockName(RemBB) << "\n"); + }; + auto RemovalCallbackRef = + llvm::function_ref(RemovalCallback); + TailDup.tailDuplicateAndUpdate(*F, IsSimple, BB, LPred, + &RemovalCallbackRef); + } + DuplicatedToBB = Removed && CanDupToChain; +} + bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -1734,6 +1944,14 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + auto MMI = getAnalysisIfAvailable(); + if (TailDupPlacement) { + unsigned TailDupSize = TailDuplicateMergeThresholdPlacement; + if (MF.getFunction()->optForSize()) + TailDupSize = 1; + TailDup.initMF(MF, MMI, MBPI, MLI, /* LayoutMode */ true, TailDupSize); + } + assert(BlockToChain.empty()); buildCFGChains(); @@ -1747,16 +1965,19 @@ BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - // Default to the standard tail-merge-size option. - unsigned TailMergeSize = 0; + unsigned TailMergeSize = TailDuplicateMergeThresholdPlacement + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); + DEBUG(MF.dump()); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable(), MLI, /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. + DEBUG(MF.dump()); BlockToChain.clear(); + // Must redo the dominator tree if blocks were changed. + MDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/IR/Function.h" @@ -49,8 +50,9 @@ auto MMI = getAnalysisIfAvailable(); auto MBPI = &getAnalysis(); + auto MLI = &getAnalysis(); - Duplicator.initMF(MF, MMI, MBPI); + Duplicator.initMF(MF, MMI, MBPI, MLI, /* LayoutMode */ false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks(MF)) @@ -61,5 +63,6 @@ void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -58,16 +59,21 @@ void TailDuplicator::initMF(MachineFunction &MF, const MachineModuleInfo *MMIin, const MachineBranchProbabilityInfo *MBPIin, - unsigned TailDupSizeIn) { + MachineLoopInfo *MLIin, + bool LayoutModeIn, unsigned TailDupSizeIn) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); MMI = MMIin; MBPI = MBPIin; + MLI = MLIin; TailDupSize = TailDupSizeIn; assert(MBPI != nullptr && "Machine Branch Probability Info required"); + assert((LayoutModeIn || MLI != nullptr) && + "Machine Loop Info required in non-layout mode."); + LayoutMode = LayoutModeIn; PreRegAlloc = MRI->isSSA(); } @@ -118,15 +124,17 @@ } /// Tail duplicate the block and cleanup. -bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB) { +bool TailDuplicator::tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred, + llvm::function_ref *RemovalCallback) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies)) + if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) return false; ++NumTails; @@ -144,7 +152,7 @@ // If it is dead, remove it. if (isDead) { NumTailDupRemoved += MBB->size(); - removeDeadBlock(MBB); + removeDeadBlock(MBB, RemovalCallback); ++NumDeadBlocks; } @@ -240,7 +248,7 @@ if (!shouldTailDuplicate(MF, IsSimple, *MBB)) continue; - MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB); + MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -501,8 +509,10 @@ bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB) { - // Only duplicate blocks that end with unconditional branches. - if (TailBB.canFallThrough()) + // When doing tail-duplication during layout, the block ordering is in flux, + // so canFallThrough returns a result based on incorrect information and + // should just be ignored. + if (!LayoutMode && TailBB.canFallThrough()) return false; // Don't try to tail-duplicate single-block loops. @@ -532,6 +542,21 @@ && TailBB.canFallThrough()) return false; + // When tail-duplicating relatively early, we shouldnt' de-normalize loops by + // duplicating latches, headers, or loop-predecessors. This interferes with + // later optimizations. During layout this is fine. + if (!LayoutMode) { + MachineLoop *ML = MLI->getLoopFor(&TailBB); + if (ML && (&TailBB == ML->getHeader() || + &TailBB == ML->getLoopLatch())) + return false; + for (auto SB : TailBB.successors()) { + MachineLoop *SML = MLI->getLoopFor(SB); + if (SML && (&TailBB == SML->getLoopPredecessor())) + return false; + } + } + // If the target has hardware branch prediction that can handle indirect // branches, duplicating them can often make them predictable when there // are common paths through the code. The limit needs to be high enough @@ -539,10 +564,15 @@ // that rearrange the predecessors of the indirect branch. bool HasIndirectbr = false; - if (!TailBB.empty()) - HasIndirectbr = TailBB.back().isIndirectBranch(); + if (!TailBB.empty()) { + MachineInstr &back = TailBB.back(); + // On some platforms return instructions are marked as indirect branches and + // not as return instructions. Work around this by checking for return via + // the CFG. + HasIndirectbr = back.isIndirectBranch() && TailBB.succ_size() != 0; + } - if (HasIndirectbr && PreRegAlloc) + if (HasIndirectbr && (PreRegAlloc || LayoutMode)) MaxDuplicateCount = 20; // Check the instructions in the block to determine whether tail-duplication @@ -722,7 +752,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB) { - // EH edges are ignored by AnalyzeBranch. + // EH edges are ignored by analyzeBranch. if (PredBB->succ_size() > 1) return false; @@ -737,8 +767,14 @@ /// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. +/// ForcedLayoutPred When non-null, use this block as the layout predecessor +/// instead of the previous block in MF's order. +/// TDBBs A vector to keep track of all blocks tail-duplicated into. +/// Copies A vector of copy instructions inserted. Used later to walk +/// all the inserted copies and remove redundant ones. bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies) { DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -763,7 +799,12 @@ continue; // Don't duplicate into a fall-through predecessor (at least for now). - if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB @@ -816,16 +857,20 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this // block into the prior block. - MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); + MachineBasicBlock *PrevBB = ForcedLayoutPred; + if (!PrevBB) + PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by - // AnalyzeBranch. + // analyzeBranch. if (PrevBB->succ_size() == 1 && // Layout preds are not always CFG preds. Check. *PrevBB->succ_begin() == TailBB && !TII->analyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && - PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && + PriorCond.empty() && + (!PriorTBB || PriorTBB == TailBB) && + TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { DEBUG(dbgs() << "\nMerging into block: " << *PrevBB << "From MBB: " << *TailBB); @@ -852,6 +897,7 @@ } appendCopies(PrevBB, CopyInfos, Copies); } else { + TII->RemoveBranch(*PrevBB); // No PHIs to worry about, just splice the instructions over. PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end()); } @@ -924,10 +970,17 @@ /// Remove the specified dead machine basic block from the function, updating /// the CFG. -void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) { +void TailDuplicator::removeDeadBlock( + MachineBasicBlock *MBB, + llvm::function_ref *RemovalCallback) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); + if (RemovalCallback) + (*RemovalCallback)(MBB); + if (MLI) + MLI->removeBlock(MBB); + // Remove all successors. while (!MBB->succ_empty()) MBB->removeSuccessor(MBB->succ_end() - 1); Index: test/CodeGen/AArch64/arm64-extload-knownzero.ll =================================================================== --- test/CodeGen/AArch64/arm64-extload-knownzero.ll +++ test/CodeGen/AArch64/arm64-extload-knownzero.ll @@ -12,7 +12,6 @@ %tmp2 = load i16, i16* %ptr, align 2 br label %bb2 bb2: -; CHECK: %bb2 ; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff ; CHECK: cmp [[REG]], #23 %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ] Index: test/CodeGen/AArch64/machine_cse.ll =================================================================== --- test/CodeGen/AArch64/machine_cse.ll +++ test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior ; marked as external to prevent possible optimizations @a = external global i32 Index: test/CodeGen/ARM/2011-03-23-PeepholeBug.ll =================================================================== --- test/CodeGen/ARM/2011-03-23-PeepholeBug.ll +++ test/CodeGen/ARM/2011-03-23-PeepholeBug.ll @@ -25,7 +25,6 @@ br label %bb2 bb2: ; preds = %bb1, %entry -; CHECK: bb2 ; CHECK: cmp [[REG]], #0 ; CHECK: ble %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ] Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,18 @@ -; RUN: llc -verify-machineinstrs < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc -verify-machineinstrs < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +; There are 3 inner loops (%bb, %bb12, %bb25) that all exit to %cond_next48 +; The last (whichever it is) should have a fallthrough exit, and the other two +; need an unconditional branch. No other block should have an unconditional +; branch to cond_next48 +;CHECK: b LBB0_13 +;CHECK: b LBB0_13 +;CHECK-NOT: b LBB0_13 +;CHECK: LBB0_13: ; %cond_next48 + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; [#uses=1] Index: test/CodeGen/PowerPC/sjlj.ll =================================================================== --- test/CodeGen/PowerPC/sjlj.ll +++ test/CodeGen/PowerPC/sjlj.ll @@ -74,24 +74,24 @@ ; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill ; CHECK-DAG: std 1, 16([[REGA]]) ; CHECK-DAG: std 2, 24([[REGA]]) -; CHECK: bcl 20, 31, .LBB1_5 +; CHECK: bcl 20, 31, .LBB1_3 ; CHECK: li 3, 1 -; CHECK: #EH_SjLj_Setup .LBB1_5 +; CHECK: #EH_SjLj_Setup .LBB1_3 ; CHECK: b .LBB1_1 -; CHECK: .LBB1_4: +; CHECK: .LBB1_3: +; CHECK: mflr [[REGL:[0-9]+]] +; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload +; CHECK: std [[REGL]], 8([[REG2]]) +; CHECK: li 3, 0 + +; CHECK: .LBB1_5: ; CHECK: lfd ; CHECK: lvx ; CHECK: ld ; CHECK: blr -; CHECK: .LBB1_5: -; CHECK: mflr [[REGL:[0-9]+]] -; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload -; CHECK: std [[REGL]], 8([[REG2]]) -; CHECK: li 3, 0 - ; CHECK-NOAV: @main ; CHECK-NOAV-NOT: stvx ; CHECK-NOAV: bcl Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,100 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The outlining flag produces the layout +; test1 +; test2 +; test3 +; test4 +; exit +; optional1 +; optional2 +; optional3 +; optional4 +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: f: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %exit +exit: + ret void +} + +declare void @a() +declare void @b() +declare void @c() +declare void @d() Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass. @@ -402,32 +402,39 @@ ; CHECK: .LBB11_1: ; CHECK: loop{{$}} ; CHECK: block{{$}} +; CHECK: block{{$}} ; CHECK: br_if 0, $0{{$}} ; CHECK: br 1{{$}} ; CHECK: .LBB11_3: -; CHECK: end_block{{$}} +; CHECK-NEXT: end_block{{$}} ; CHECK: block{{$}} ; CHECK: br_if 0, $1{{$}} ; CHECK: br 1{{$}} ; CHECK: .LBB11_5: -; CHECK: br 0{{$}} +; CHECK-NEXT: end_block{{$}} ; CHECK: .LBB11_6: +; CHECK-NEXT: end_block{{$}} +; CHECK: br 0{{$}} +; CHECK: .LBB11_7: ; CHECK-NEXT: end_loop{{$}} ; OPT-LABEL: doublediamond_in_a_loop: ; OPT: .LBB11_1: ; OPT: loop{{$}} ; OPT: block{{$}} -; OPT: br_if 0, {{[^,]+}}{{$}} +; OPT: block{{$}} ; OPT: block{{$}} ; OPT: br_if 0, {{[^,]+}}{{$}} +; OPT: br_if 1, {{[^,]+}}{{$}} ; OPT: br 2{{$}} ; OPT-NEXT: .LBB11_4: ; OPT-NEXT: end_block{{$}} ; OPT: br 1{{$}} ; OPT: .LBB11_5: ; OPT-NEXT: end_block{{$}} -; OPT: br 0{{$}} ; OPT: .LBB11_6: +; OPT-NEXT: end_block{{$}} +; OPT: br 0{{$}} +; OPT: .LBB11_7: ; OPT-NEXT: end_loop{{$}} define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) { entry: Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s ; Test memcpy, memmove, and memset intrinsics. Index: test/CodeGen/X86/avx512-i1test.ll =================================================================== --- test/CodeGen/X86/avx512-i1test.ll +++ test/CodeGen/X86/avx512-i1test.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc -O2 < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s ; ModuleID = 'bugpoint-reduced-simplified.bc' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -20,6 +20,7 @@ ; CHECK-NEXT: LBB0_3: ## %bb35 ; CHECK-NEXT: ## in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: LBB0_2: ## %bb33 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb %al, %al Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -177,6 +177,12 @@ ret i32 %sum } +; Tail duplication during layout can entirely remove body0 by duplicating it +; into the entry block and into body1. This is a good thing but it isn't what +; this test is looking for. So to make the blocks longer so they don't get +; duplicated, we add some calls to dummy. +declare void @dummy() + define i32 @test_loop_rotate(i32 %i, i32* %a) { ; Check that we rotate conditional exits from the loop to the bottom of the ; loop, eliminating unconditional branches to the top. @@ -194,6 +200,8 @@ %base = phi i32 [ 0, %entry ], [ %sum, %body1 ] %next = add i32 %iv, 1 %exitcond = icmp eq i32 %next, %i + call void @dummy() + call void @dummy() br i1 %exitcond, label %exit, label %body1 body1: @@ -945,7 +953,7 @@ ; First rotated loop top. ; CHECK: .p2align ; CHECK: %while.end -; CHECK: %for.cond +; %for.cond gets completely tail-duplicated away. ; CHECK: %if.then ; CHECK: %if.else ; CHECK: %if.end10 Index: test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll =================================================================== --- test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll +++ test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: fmaddsubpd_loop_128: ; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -28,7 +28,7 @@ ; CHECK-LABEL: fmsubaddpd_loop_128: ; CHECK: vfmsubadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -54,7 +54,7 @@ ; CHECK-LABEL: fmaddpd_loop_128: ; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -80,7 +80,7 @@ ; CHECK-LABEL: fmsubpd_loop_128: ; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -106,7 +106,7 @@ ; CHECK-LABEL: fnmaddpd_loop_128: ; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -132,7 +132,7 @@ ; CHECK-LABEL: fnmsubpd_loop_128: ; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -329,7 +329,7 @@ ; CHECK-LABEL: fmaddsubpd_loop_256: ; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -355,7 +355,7 @@ ; CHECK-LABEL: fmsubaddpd_loop_256: ; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -381,7 +381,7 @@ ; CHECK-LABEL: fmaddpd_loop_256: ; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -407,7 +407,7 @@ ; CHECK-LABEL: fmsubpd_loop_256: ; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -433,7 +433,7 @@ ; CHECK-LABEL: fnmaddpd_loop_256: ; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -459,7 +459,7 @@ ; CHECK-LABEL: fnmsubpd_loop_256: ; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -56,11 +56,11 @@ ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: jp .LBB1_1 -; CHECK-NEXT: .LBB1_2: # %bb2 +; CHECK-NEXT: # %bb2 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %bb1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: retq entry: %mul = fmul double %x, %y Index: test/CodeGen/X86/pr11202.ll =================================================================== --- test/CodeGen/X86/pr11202.ll +++ test/CodeGen/X86/pr11202.ll @@ -15,5 +15,8 @@ br label %l1 } -; CHECK: .Ltmp0: # Address of block that was removed by CodeGen +; It is correct for either l1 or l2 to be removed. +; If l2 is removed, the message should be "Address of block that was removed by CodeGen" +; If l1 is removed, it should be "Block address taken." +; CHECK: .Ltmp0: # {{Address of block that was removed by CodeGen|Block address taken}} ; CHECK: .quad .Ltmp0 Index: test/CodeGen/X86/ragreedy-bug.ll =================================================================== --- test/CodeGen/X86/ragreedy-bug.ll +++ test/CodeGen/X86/ragreedy-bug.ll @@ -3,16 +3,34 @@ ; This testing case is reduced from 197.parser prune_match function. ; We make sure register copies are not generated on isupper.exit blocks. -; CHECK: isupper.exit +; isupper.exit and isupper.exit223 get tail-duplicated into all their +; predecessors. +; CHECK: cond.true.i.i ; CHECK-NEXT: in Loop +; Mem-move +; CHECK-NEXT: movl +; CHECK-NEXT: andl ; CHECK-NEXT: testl ; CHECK-NEXT: jne -; CHECK: isupper.exit +; CHECK: cond.true.i.i217 ; CHECK-NEXT: in Loop +; Mem-move +; CHECK-NEXT: movl +; CHECK-NEXT: andl ; CHECK-NEXT: testl ; CHECK-NEXT: je +; CHECK: cond.false.i.i ; CHECK: maskrune +; CHECK-NEXT: movzbl +; CHECK-NEXT: movzbl +; CHECK-NEXT: testl +; CHECK-NEXT: je +; CHECK: cond.false.i.i219 ; CHECK: maskrune +; CHECK-NEXT: movzbl +; CHECK-NEXT: movzbl +; CHECK-NEXT: testl +; CHECK-NEXT: jne %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* } Index: test/CodeGen/X86/sse1.ll =================================================================== --- test/CodeGen/X86/sse1.ll +++ test/CodeGen/X86/sse1.ll @@ -58,21 +58,23 @@ ; X32-NEXT: je .LBB1_1 ; X32-NEXT: # BB#2: # %entry ; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: jmp .LBB1_3 +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_5 +; X32-NEXT: jmp .LBB1_4 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_3: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB1_4 -; X32-NEXT: # BB#5: # %entry +; X32-NEXT: .LBB1_5: # %entry ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: jmp .LBB1_6 +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_8 +; X32-NEXT: jmp .LBB1_7 ; X32-NEXT: .LBB1_4: ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_6: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB1_7 -; X32-NEXT: # BB#8: # %entry +; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: jmp .LBB1_9 ; X32-NEXT: .LBB1_7: @@ -95,21 +97,23 @@ ; X64-NEXT: je .LBB1_1 ; X64-NEXT: # BB#2: # %entry ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: jmp .LBB1_3 +; X64-NEXT: testl %edx, %edx +; X64-NEXT: jne .LBB1_5 +; X64-NEXT: jmp .LBB1_4 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: .LBB1_3: # %entry ; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB1_4 -; X64-NEXT: # BB#5: # %entry +; X64-NEXT: .LBB1_5: # %entry ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: jmp .LBB1_6 +; X64-NEXT: testl %r8d, %r8d +; X64-NEXT: jne .LBB1_8 +; X64-NEXT: jmp .LBB1_7 ; X64-NEXT: .LBB1_4: ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB1_6: # %entry ; X64-NEXT: testl %r8d, %r8d ; X64-NEXT: je .LBB1_7 -; X64-NEXT: # BB#8: # %entry +; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 ; X64-NEXT: jmp .LBB1_9 ; X64-NEXT: .LBB1_7: Index: test/CodeGen/X86/update-terminator.mir =================================================================== --- test/CodeGen/X86/update-terminator.mir +++ test/CodeGen/X86/update-terminator.mir @@ -5,17 +5,30 @@ @a = external global i16 @b = external global i32 + declare void @dummy1() + declare void @dummy2() + declare void @dummy3() + ; Function Attrs: nounwind define void @f2() { br i1 undef, label %bb1, label %bb3 bb1: + call void @dummy1() + call void @dummy1() + call void @dummy1() br i1 undef, label %bb2, label %bb2 bb2: + call void @dummy2() + call void @dummy2() + call void @dummy2() br label %bb4 bb3: + call void @dummy3() + call void @dummy3() + call void @dummy3() br label %bb2 bb4: @@ -40,15 +53,24 @@ bb.1: successors: %bb.2(100) + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp JNE_1 %bb.2, implicit %eflags bb.2: successors: %bb.4(100) + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp JMP_1 %bb.4 bb.3: successors: %bb.2(100) + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp JMP_1 %bb.2 bb.4: