Index: llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h =================================================================== --- llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h +++ llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h @@ -25,6 +25,8 @@ namespace llvm { class BasicBlock; +class DomTreeUpdater; +class LoopInfo; class Value; struct DivRemMapKey { @@ -66,8 +68,9 @@ /// /// This optimization may add basic blocks immediately after BB; for obvious /// reasons, you shouldn't pass those blocks to bypassSlowDivision. -bool bypassSlowDivision( - BasicBlock *BB, const DenseMap &BypassWidth); +bool bypassSlowDivision(BasicBlock *BB, + const DenseMap &BypassWidth, + DomTreeUpdater *DTU = nullptr, LoopInfo *LI = nullptr); } // end namespace llvm Index: llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -251,6 +252,15 @@ "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false), cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion.")); +#ifdef EXPENSIVE_CHECKS +static bool VerifyDT = true; +#else +static bool VerifyDT = false; +#endif +static cl::opt VerifyDTUpdates( + "cgp-verify-dt-updates", cl::location(VerifyDT), cl::Hidden, + cl::desc("Verify dominator tree updates in CodeGenPrepare")); + static cl::opt VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false), cl::desc("Enable BFI update verification for " @@ -309,6 +319,8 @@ const TargetTransformInfo *TTI = nullptr; const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; const TargetLibraryInfo *TLInfo = nullptr; + DominatorTree *DT = nullptr; + DomTreeUpdater *DTU = nullptr; LoopInfo *LI = nullptr; std::unique_ptr BFI; std::unique_ptr BPI; @@ -360,10 +372,6 @@ /// DataLayout for the Function being processed. const DataLayout *DL = nullptr; - /// Building the dominator tree can be expensive, so we only build it - /// lazily and update it when required. - std::unique_ptr DT; - public: /// If encounter huge function, we need to limit the build time. bool IsHugeFunc = false; @@ -399,6 +407,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addUsedIfAvailable(); } @@ -422,20 +431,16 @@ } } - // Get the DominatorTree, building if necessary. - DominatorTree &getDT(Function &F) { - if (!DT) - DT = std::make_unique(F); - return *DT; - } + // Get the DominatorTree, updating it if necessary. + DominatorTree &getDT() { return DTU->getDomTree(); } void removeAllAssertingVHReferences(Value *V); bool eliminateAssumptions(Function &F); - bool eliminateFallThrough(Function &F, DominatorTree *DT = nullptr); - bool eliminateMostlyEmptyBlocks(Function &F); + bool eliminateFallThrough(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F, bool &ResetDT); BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; - void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool eliminateMostlyEmptyBlock(BasicBlock *BB); bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, bool isPreheader); bool makeBitReverse(Instruction &I); @@ -476,7 +481,7 @@ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, bool HasPromoted, TypePromotionTransaction &TPT, SmallVectorImpl &SpeculativelyMovedExts); - bool splitBranchCondition(Function &F, ModifyDT &ModifiedDT); + bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(GCStatepointInst &I); bool tryToSinkFreeOperands(Instruction *I); @@ -495,6 +500,7 @@ INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -519,6 +525,7 @@ TRI = SubtargetInfo->getRegisterInfo(); TLInfo = &getAnalysis().getTLI(F); TTI = &getAnalysis().getTTI(F); + DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); @@ -549,6 +556,14 @@ F.setSectionPrefix("unknown"); } + DomTreeUpdater DTUpdater(DT, DomTreeUpdater::UpdateStrategy::Lazy); + DTU = &DTUpdater; + auto resetDTAndLI = [&]() { + DTU->recalculate(F); + LI->releaseMemory(); + LI->analyze(*DT); + }; + /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) { @@ -561,7 +576,7 @@ BasicBlock *Next = BB->getNextNode(); // F.hasOptSize is already checked in the outer if statement. if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) - EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + EverMadeChange |= bypassSlowDivision(BB, BypassWidths, DTU, LI); BB = Next; } } @@ -573,31 +588,40 @@ // Eliminate blocks that contain only PHI nodes and an // unconditional branch. - EverMadeChange |= eliminateMostlyEmptyBlocks(F); + bool ResetDT = false; + EverMadeChange |= eliminateMostlyEmptyBlocks(F, ResetDT); + if (ResetDT) + resetDTAndLI(); - ModifyDT ModifiedDT = ModifyDT::NotModifyDT; if (!DisableBranchOpts) - EverMadeChange |= splitBranchCondition(F, ModifiedDT); + EverMadeChange |= splitBranchCondition(F); // Split some critical edges where one of the sources is an indirect branch, // to help generate sane code for PHIs involving such edges. - EverMadeChange |= - SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true); + if (SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true)) { + EverMadeChange = true; + resetDTAndLI(); + } + +#ifndef NDEBUG + if (VerifyDT) + assert(getDT().verify(DominatorTree::VerificationLevel::Fast) && + "Incorrect DominatorTree updates in CGP"); + + if (VerifyLoopInfo) + LI->verify(getDT()); +#endif // If we are optimzing huge function, we need to consider the build time. // Because the basic algorithm's complex is near O(N!). IsHugeFunc = F.size() > HugeFuncThresholdInCGPP; - // Transformations above may invalidate dominator tree and/or loop info. - DT.reset(); - LI->releaseMemory(); - LI->analyze(getDT(F)); - bool MadeChange = true; bool FuncIterated = false; while (MadeChange) { MadeChange = false; + DTU->flush(); for (BasicBlock &BB : llvm::make_early_inc_range(F)) { if (FuncIterated && !FreshBBs.contains(&BB)) continue; @@ -605,9 +629,6 @@ ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT; bool Changed = optimizeBlock(BB, ModifiedDTOnIteration); - if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT) - DT.reset(); - MadeChange |= Changed; if (IsHugeFunc) { // If the BB is updated, it may still has chance to be optimized. @@ -640,11 +661,15 @@ MadeChange |= optimizePhiTypes(F); if (MadeChange) - eliminateFallThrough(F, DT.get()); + eliminateFallThrough(F); #ifndef NDEBUG - if (MadeChange && VerifyLoopInfo) - LI->verify(getDT(F)); + if (VerifyDT) + assert(getDT().verify(DominatorTree::VerificationLevel::Fast) && + "Incorrect DominatorTree updates in CGP"); + + if (VerifyLoopInfo) + LI->verify(getDT()); #endif // Really free removed instructions during promotion. @@ -662,6 +687,9 @@ NewGEPBases.clear(); SunkAddrs.clear(); + // LoopInfo is not needed anymore and ConstantFoldTerminator can break it. + LI = nullptr; + if (!DisableBranchOpts) { MadeChange = false; // Use a set vector to get deterministic iteration order. The order the @@ -670,7 +698,7 @@ SmallSetVector WorkList; for (BasicBlock &BB : F) { SmallVector Successors(successors(&BB)); - MadeChange |= ConstantFoldTerminator(&BB, true); + MadeChange |= ConstantFoldTerminator(&BB, true, nullptr, DTU); if (!MadeChange) continue; @@ -685,13 +713,16 @@ BasicBlock *BB = WorkList.pop_back_val(); SmallVector Successors(successors(BB)); - DeleteDeadBlock(BB); + DeleteDeadBlock(BB, DTU); for (BasicBlock *Succ : Successors) if (pred_empty(Succ)) WorkList.insert(Succ); } + // Flush pending DT updates in order to finalise deletion of dead blocks. + DTU->flush(); + // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. if (EverMadeChange || MadeChange) @@ -778,7 +809,7 @@ /// Merge basic blocks which are connected by a single edge, where one of the /// basic blocks has a single successor pointing to the other basic block, /// which has a single predecessor. -bool CodeGenPrepare::eliminateFallThrough(Function &F, DominatorTree *DT) { +bool CodeGenPrepare::eliminateFallThrough(Function &F) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. // Use a temporary array to avoid iterator being invalidated when @@ -800,19 +831,13 @@ if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; - // Make an effort to skip unreachable blocks. - if (DT && !DT->isReachableFromEntry(BB)) - continue; - BranchInst *Term = dyn_cast(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { Changed = true; LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n"); // Merge BB into SinglePred and delete it. - MergeBlockIntoPredecessor(BB, /* DTU */ nullptr, LI, /* MSSAU */ nullptr, - /* MemDep */ nullptr, - /* PredecessorWithTwoSuccessors */ false, DT); + MergeBlockIntoPredecessor(BB, DTU, LI); Preds.insert(SinglePred); if (IsHugeFunc) { @@ -868,7 +893,7 @@ /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these /// blocks so we can split them the way we want them. -bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F, bool &ResetDT) { SmallPtrSet Preheaders; SmallVector LoopList(LI->begin(), LI->end()); while (!LoopList.empty()) { @@ -878,6 +903,7 @@ Preheaders.insert(Preheader); } + ResetDT = false; bool MadeChange = false; // Copy blocks into a temporary array to avoid iterator invalidation issues // as we remove them. @@ -899,7 +925,7 @@ !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB))) continue; - eliminateMostlyEmptyBlock(BB); + ResetDT |= eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; @@ -1076,7 +1102,8 @@ /// Eliminate a basic block that has only phi's and an unconditional branch in /// it. -void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { +/// Indicate that the DT was modified only if the DT wasn't updated. +bool CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { BranchInst *BI = cast(BB->getTerminator()); BasicBlock *DestBB = BI->getSuccessor(0); @@ -1090,7 +1117,7 @@ assert(SinglePred == BB && "Single predecessor not the same as predecessor"); // Merge DestBB into SinglePred/BB and delete it. - MergeBlockIntoPredecessor(DestBB); + MergeBlockIntoPredecessor(DestBB, DTU, LI); // Note: BB(=SinglePred) will not be deleted on this path. // DestBB(=its single successor) is the one that was deleted. LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n"); @@ -1100,7 +1127,7 @@ FreshBBs.insert(SinglePred); FreshBBs.erase(DestBB); } - return; + return false; } } @@ -1138,6 +1165,7 @@ ++NumBlocksElim; LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); + return true; } // Computes a map of base pointer relocation instructions to corresponding @@ -1514,7 +1542,7 @@ // Finally, we need to ensure that the insert point will dominate all // existing uses of the increment. - auto &DT = getDT(*BO->getParent()->getParent()); + auto &DT = getDT(); if (DT.dominates(Cmp->getParent(), BO->getParent())) // If we're moving up the dom tree, all uses are trivially dominated. // (This is the common case for code produced by LSR.) @@ -2199,7 +2227,7 @@ /// /// If the transform is performed, return true and set ModifiedDT to true. static bool despeculateCountZeros(IntrinsicInst *CountZeros, - LoopInfo &LI, + DomTreeUpdater &DTU, LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, SmallSet &FreshBBs, @@ -2227,7 +2255,8 @@ // The intrinsic will be sunk behind a compare against zero and branch. BasicBlock *StartBlock = CountZeros->getParent(); - BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + BasicBlock *CallBlock = SplitBlock(StartBlock, CountZeros, &DTU, &LI, + /* MSSAU */ nullptr, "cond.false"); if (IsHugeFunc) FreshBBs.insert(CallBlock); @@ -2235,17 +2264,11 @@ // in this block to select the result of the intrinsic or the bit-width // constant if the input to the intrinsic is zero. BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); - BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + BasicBlock *EndBlock = SplitBlock(CallBlock, &*SplitPt, &DTU, &LI, + /* MSSAU */ nullptr, "cond.end"); if (IsHugeFunc) FreshBBs.insert(EndBlock); - // Update the LoopInfo. The new blocks are in the same loop as the start - // block. - if (Loop *L = LI.getLoopFor(StartBlock)) { - L->addBasicBlockToLoop(CallBlock, LI); - L->addBasicBlockToLoop(EndBlock, LI); - } - // Set up a builder to create a compare, conditional branch, and PHI. IRBuilder<> Builder(CountZeros->getContext()); Builder.SetInsertPoint(StartBlock->getTerminator()); @@ -2260,6 +2283,7 @@ Value *Cmp = Builder.CreateICmpEQ(Op, Zero, "cmpz"); Builder.CreateCondBr(Cmp, EndBlock, CallBlock); StartBlock->getTerminator()->eraseFromParent(); + DTU.applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}}); // Create a PHI in the end block to select either the output of the intrinsic // or the bit width of the operand. @@ -2420,7 +2444,7 @@ case Intrinsic::cttz: case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. - return despeculateCountZeros(II, *LI, TLI, DL, ModifiedDT, FreshBBs, + return despeculateCountZeros(II, *DTU, *LI, TLI, DL, ModifiedDT, FreshBBs, IsHugeFunc); case Intrinsic::fshl: case Intrinsic::fshr: @@ -2588,7 +2612,7 @@ continue; // Duplicate the return into TailCallBB. - (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); + (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB, DTU); assert(!VerifyBFIUpdates || BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB)); BFI->setBlockFreq( @@ -2601,7 +2625,7 @@ // If we eliminated all predecessors of the block, delete the block now. if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) - BB->eraseFromParent(); + DTU->deleteBB(BB); return Changed; } @@ -5356,10 +5380,7 @@ // Defer the query (and possible computation of) the dom tree to point of // actual use. It's expected that most address matches don't actually need // the domtree. - auto getDTFn = [MemoryInst, this]() -> const DominatorTree & { - Function *F = MemoryInst->getParent()->getParent(); - return this->getDT(*F); - }; + auto getDTFn = [this]() -> const DominatorTree & { return getDT(); }; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn, *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, @@ -6024,7 +6045,7 @@ continue; bool inserted = false; for (auto &Pt : CurPts) { - if (getDT(F).dominates(Inst, Pt)) { + if (getDT().dominates(Inst, Pt)) { replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc); RemovedInsts.insert(Pt); Pt->removeFromParent(); @@ -6033,7 +6054,7 @@ Changed = true; break; } - if (!getDT(F).dominates(Pt, Inst)) + if (!getDT().dominates(Pt, Inst)) // Give up if we need to merge in a common dominator as the // experiments show it is not profitable. continue; @@ -6155,8 +6176,8 @@ if (isa(BaseI)) NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); else if (InvokeInst *Invoke = dyn_cast(BaseI)) { - NewBaseInsertBB = - SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI); + NewBaseInsertBB = SplitEdge(NewBaseInsertBB, + Invoke->getNormalDest(), &getDT(), LI); NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); } else NewBaseInsertPt = std::next(BaseI->getIterator()); @@ -6976,12 +6997,6 @@ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))) return false; - // The DominatorTree needs to be rebuilt by any consumers after this - // transformation. We simply reset here rather than setting the ModifiedDT - // flag to avoid restarting the function walk in runOnFunction for each - // select optimized. - DT.reset(); - // Transform a sequence like this: // start: // %cmp = cmp uge i32 %a, %b @@ -7006,6 +7021,9 @@ // block and its branch may be optimized away. In that case, one side of the // first branch will point directly to select.end, and the corresponding PHI // predecessor block will be the start block. + // The CFG is altered here and we update the DominatorTree and the LoopInfo, + // but we don't set a ModifiedDT flag to avoid restarting the function walk in + // runOnFunction for each select optimized. // Collect values that go on the true side and the values that go on the false // side. @@ -7031,20 +7049,20 @@ BranchInst *TrueBranch = nullptr; BranchInst *FalseBranch = nullptr; if (TrueInstrs.size() == 0) { - FalseBranch = cast(SplitBlockAndInsertIfElse( - CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + FalseBranch = cast( + SplitBlockAndInsertIfElse(CondFr, &*SplitPt, false, nullptr, DTU, LI)); FalseBlock = FalseBranch->getParent(); EndBlock = cast(FalseBranch->getOperand(0)); } else if (FalseInstrs.size() == 0) { - TrueBranch = cast(SplitBlockAndInsertIfThen( - CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + TrueBranch = cast( + SplitBlockAndInsertIfThen(CondFr, &*SplitPt, false, nullptr, DTU, LI)); TrueBlock = TrueBranch->getParent(); EndBlock = cast(TrueBranch->getOperand(0)); } else { Instruction *ThenTerm = nullptr; Instruction *ElseTerm = nullptr; SplitBlockAndInsertIfThenElse(CondFr, &*SplitPt, &ThenTerm, &ElseTerm, - nullptr, nullptr, LI); + nullptr, DTU, LI); TrueBranch = cast(ThenTerm); FalseBranch = cast(ElseTerm); TrueBlock = TrueBranch->getParent(); @@ -8320,13 +8338,9 @@ // For huge function we tend to quickly go though the inner optmization // opportunities in the BB. So we go back to the BB head to re-optimize // each instruction instead of go back to the function head. - if (IsHugeFunc) { - DT.reset(); - getDT(*BB.getParent()); + if (IsHugeFunc) break; - } else { - return true; - } + return true; } } } while (ModifiedDT == ModifyDT::ModifyInstDT); @@ -8379,7 +8393,7 @@ // to re-order dbg.value intrinsics. bool CodeGenPrepare::placeDbgValues(Function &F) { bool MadeChange = false; - DominatorTree DT(F); + DominatorTree &DT = getDT(); for (BasicBlock &BB : F) { for (Instruction &Insn : llvm::make_early_inc_range(BB)) { @@ -8489,7 +8503,7 @@ /// /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. /// -bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) { +bool CodeGenPrepare::splitBranchCondition(Function &F) { if (!TM->Options.EnableFastISel || TLI->isJumpExpensive()) return false; @@ -8583,6 +8597,20 @@ PN.addIncoming(Val, TmpBB); } + if (LI) { + if (Loop *L = LI->getLoopFor(&BB)) + L->addBasicBlockToLoop(TmpBB, *LI); + } + + if (DTU) { + // The edge we need to delete starts at BB and ends at whatever TBB ends + // up pointing to. + DTU->applyUpdates({{DominatorTree::Insert, &BB, TmpBB}, + {DominatorTree::Insert, TmpBB, TBB}, + {DominatorTree::Insert, TmpBB, FBB}, + {DominatorTree::Delete, &BB, TBB}}); + } + // Update the branch weights (from SelectionDAGBuilder:: // FindMergedConditions). if (Opc == Instruction::Or) { @@ -8658,7 +8686,6 @@ } } - ModifiedDT = ModifyDT::ModifyBBDT; MadeChange = true; LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); Index: llvm/lib/Transforms/Utils/BypassSlowDivision.cpp =================================================================== --- llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -18,7 +18,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -32,6 +33,8 @@ #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include #include @@ -77,6 +80,8 @@ Instruction *SlowDivOrRem = nullptr; IntegerType *BypassType = nullptr; BasicBlock *MainBB = nullptr; + DomTreeUpdater *DTU = nullptr; + LoopInfo *LI = nullptr; bool isHashLikeValue(Value *V, VisitedSetTy &Visited); ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); @@ -100,7 +105,8 @@ Type *getSlowType() { return SlowDivOrRem->getType(); } public: - FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); + FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths, + DomTreeUpdater *DTU, LoopInfo *LI); Value *getReplacement(DivCacheTy &Cache); }; @@ -108,7 +114,9 @@ } // end anonymous namespace FastDivInsertionTask::FastDivInsertionTask(Instruction *I, - const BypassWidthsTy &BypassWidths) { + const BypassWidthsTy &BypassWidths, + DomTreeUpdater *DTU, LoopInfo *LI) + : DTU(DTU), LI(LI) { switch (I->getOpcode()) { case Instruction::UDiv: case Instruction::SDiv: @@ -413,7 +421,7 @@ // lets us entirely avoid a long div. // Split the basic block before the div/rem. - BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + BasicBlock *SuccessorBB = SplitBlock(MainBB, SlowDivOrRem, DTU, LI); // Remove the unconditional branch from MainBB to SuccessorBB. MainBB->back().eraseFromParent(); QuotRemWithBB Long; @@ -424,13 +432,23 @@ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); + + if (DTU) + DTU->applyUpdates({{DominatorTree::Insert, MainBB, Fast.BB}, + {DominatorTree::Insert, Fast.BB, SuccessorBB}}); + + if (LI) { + if (Loop *L = LI->getLoopFor(MainBB)) + L->addBasicBlockToLoop(Fast.BB, *LI); + } + return Result; } else { // General case. Create both slow and fast div/rem pairs and choose one of // them at runtime. // Split the basic block before the div/rem. - BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + BasicBlock *SuccessorBB = SplitBlock(MainBB, SlowDivOrRem, DTU, LI); // Remove the unconditional branch from MainBB to SuccessorBB. MainBB->back().eraseFromParent(); QuotRemWithBB Fast = createFastBB(SuccessorBB); @@ -439,6 +457,21 @@ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, DivisorShort ? nullptr : Divisor); Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); + + if (DTU) + DTU->applyUpdates({{DominatorTree::Insert, MainBB, Fast.BB}, + {DominatorTree::Insert, MainBB, Slow.BB}, + {DominatorTree::Insert, Fast.BB, SuccessorBB}, + {DominatorTree::Insert, Slow.BB, SuccessorBB}, + {DominatorTree::Delete, MainBB, SuccessorBB}}); + + if (LI) { + if (Loop *L = LI->getLoopFor(MainBB)) { + L->addBasicBlockToLoop(Fast.BB, *LI); + L->addBasicBlockToLoop(Slow.BB, *LI); + } + } + return Result; } } @@ -446,7 +479,8 @@ /// This optimization identifies DIV/REM instructions in a BB that can be /// profitably bypassed and carried out with a shorter, faster divide. bool llvm::bypassSlowDivision(BasicBlock *BB, - const BypassWidthsTy &BypassWidths) { + const BypassWidthsTy &BypassWidths, + DomTreeUpdater *DTU, LoopInfo *LI) { DivCacheTy PerBBDivCache; bool MadeChange = false; @@ -461,7 +495,7 @@ if (I->hasNUses(0)) continue; - FastDivInsertionTask Task(I, BypassWidths); + FastDivInsertionTask Task(I, BypassWidths, DTU, LI); if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { I->replaceAllUsesWith(Replacement); I->eraseFromParent(); Index: llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info-inseltpoison.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info-inseltpoison.ll +++ llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info-inseltpoison.ll @@ -4,22 +4,22 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; CHECK-LABEL: @sdiv64( -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], [[DBG6:!dbg !.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, [[DBG6]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, [[DBG6]] -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]], [[DBG6]] +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg [[DBG6:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG6]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]], !dbg [[DBG6]] ; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, [[DBG6]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, [[DBG6]] -; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], [[DBG6]] -; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64, [[DBG6]] -; CHECK-NEXT: br label [[TMP11:%.*]], [[DBG6]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT:%.*]], !dbg [[DBG6]] ; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[A]], [[B]], [[DBG6]] -; CHECK-NEXT: br label [[TMP11]], [[DBG6]] -; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ], [[DBG6]] -; CHECK-NEXT: ret i64 [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT]], !dbg [[DBG6]] +; CHECK: .split: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ], !dbg [[DBG6]] +; CHECK-NEXT: ret i64 [[TMP11]] ; %d = sdiv i64 %a, %b, !dbg !6 ret i64 %d @@ -29,27 +29,27 @@ ; division. define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; CHECK-LABEL: @sdivrem64( -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], [[DBG6]] -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, [[DBG6]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, [[DBG6]] -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP11:%.*]], [[DBG6]] +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG6]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP11:%.*]], !dbg [[DBG6]] ; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, [[DBG6]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, [[DBG6]] -; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], [[DBG6]] -; CHECK-NEXT: [[TMP8:%.*]] = urem i32 [[TMP6]], [[TMP5]], [[DBG6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64, [[DBG6]] -; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64, [[DBG6]] -; CHECK-NEXT: br label [[TMP14:%.*]], [[DBG6]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP8:%.*]] = urem i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT:%.*]], !dbg [[DBG6]] ; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = sdiv i64 [[A]], [[B]], [[DBG6]] -; CHECK-NEXT: [[TMP13:%.*]] = srem i64 [[A]], [[B]], [[DBG6]] -; CHECK-NEXT: br label [[TMP14]], [[DBG6]] -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP9]], [[TMP4]] ], [ [[TMP12]], [[TMP11]] ], [[DBG6]] -; CHECK-NEXT: [[TMP16:%.*]] = phi i64 [ [[TMP10]], [[TMP4]] ], [ [[TMP13]], [[TMP11]] ], [[DBG6]] -; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0 -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i64> [[INS0]], i64 [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sdiv i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP13:%.*]] = srem i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT]], !dbg [[DBG6]] +; CHECK: .split: +; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ [[TMP9]], [[TMP4]] ], [ [[TMP12]], [[TMP11]] ], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP10]], [[TMP4]] ], [ [[TMP13]], [[TMP11]] ], !dbg [[DBG6]] +; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i32 0 +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i64> [[INS0]], i64 [[TMP15]], i32 1 ; CHECK-NEXT: ret <2 x i64> [[INS1]] ; %d = sdiv i64 %a, %b, !dbg !6 Index: llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info.ll +++ llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info.ll @@ -4,22 +4,22 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; CHECK-LABEL: @sdiv64( -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg !6 -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg !6 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg !6 -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]], !dbg !6 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg [[DBG6:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG6]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]], !dbg [[DBG6]] ; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg !6 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg !6 -; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg !6 -; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64, !dbg !6 -; CHECK-NEXT: br label [[TMP11:%.*]], !dbg !6 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT:%.*]], !dbg [[DBG6]] ; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[A]], [[B]], !dbg !6 -; CHECK-NEXT: br label [[TMP11]], !dbg !6 -; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ], !dbg !6 -; CHECK-NEXT: ret i64 [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT]], !dbg [[DBG6]] +; CHECK: .split: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ], !dbg [[DBG6]] +; CHECK-NEXT: ret i64 [[TMP11]] ; %d = sdiv i64 %a, %b, !dbg !6 ret i64 %d @@ -29,27 +29,27 @@ ; division. define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; CHECK-LABEL: @sdivrem64( -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg !6 -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg !6 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg !6 -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP11:%.*]], !dbg !6 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG6]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP11:%.*]], !dbg [[DBG6]] ; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg !6 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg !6 -; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg !6 -; CHECK-NEXT: [[TMP8:%.*]] = urem i32 [[TMP6]], [[TMP5]], !dbg !6 -; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64, !dbg !6 -; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64, !dbg !6 -; CHECK-NEXT: br label [[TMP14:%.*]], !dbg !6 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[B]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[A]] to i32, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP8:%.*]] = urem i32 [[TMP6]], [[TMP5]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64, !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT:%.*]], !dbg [[DBG6]] ; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = sdiv i64 [[A]], [[B]], !dbg !6 -; CHECK-NEXT: [[TMP13:%.*]] = srem i64 [[A]], [[B]], !dbg !6 -; CHECK-NEXT: br label [[TMP14]], !dbg !6 -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP9]], [[TMP4]] ], [ [[TMP12]], [[TMP11]] ], !dbg !6 -; CHECK-NEXT: [[TMP16:%.*]] = phi i64 [ [[TMP10]], [[TMP4]] ], [ [[TMP13]], [[TMP11]] ], !dbg !6 -; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP15]], i32 0 -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i64> [[INS0]], i64 [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sdiv i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP13:%.*]] = srem i64 [[A]], [[B]], !dbg [[DBG6]] +; CHECK-NEXT: br label [[DOTSPLIT]], !dbg [[DBG6]] +; CHECK: .split: +; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ [[TMP9]], [[TMP4]] ], [ [[TMP12]], [[TMP11]] ], !dbg [[DBG6]] +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP10]], [[TMP4]] ], [ [[TMP13]], [[TMP11]] ], !dbg [[DBG6]] +; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP14]], i32 0 +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i64> [[INS0]], i64 [[TMP15]], i32 1 ; CHECK-NEXT: ret <2 x i64> [[INS1]] ; %d = sdiv i64 %a, %b, !dbg !6