Index: llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -251,6 +252,15 @@ "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false), cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion.")); +#ifdef EXPENSIVE_CHECKS +static bool VerifyDT = true; +#else +static bool VerifyDT = false; +#endif +static cl::opt VerifyDTUpdates( + "cgp-verify-dt-updates", cl::location(VerifyDT), cl::Hidden, + cl::desc("Verify dominator tree updates in CodeGenPrepare")); + static cl::opt VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false), cl::desc("Enable BFI update verification for " @@ -304,6 +314,7 @@ const TargetTransformInfo *TTI = nullptr; const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; const TargetLibraryInfo *TLInfo = nullptr; + std::unique_ptr DT; LoopInfo *LI = nullptr; std::unique_ptr BFI; std::unique_ptr BPI; @@ -355,9 +366,6 @@ /// DataLayout for the Function being processed. const DataLayout *DL = nullptr; - /// Building the dominator tree can be expensive, so we only build it - /// lazily and update it when required. - std::unique_ptr DT; public: /// If encounter huge function, we need to limit the build time. @@ -417,12 +425,6 @@ } } - // Get the DominatorTree, building if necessary. - DominatorTree &getDT(Function &F) { - if (!DT) - DT = std::make_unique(F); - return *DT; - } void removeAllAssertingVHReferences(Value *V); bool eliminateAssumptions(Function &F); @@ -584,9 +586,9 @@ IsHugeFunc = F.size() > HugeFuncThresholdInCGPP; // Transformations above may invalidate dominator tree and/or loop info. - DT.reset(); + DT = std::make_unique(F); LI->releaseMemory(); - LI->analyze(getDT(F)); + LI->analyze(*DT); bool MadeChange = true; bool FuncIterated = false; @@ -600,9 +602,6 @@ ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT; bool Changed = optimizeBlock(BB, ModifiedDTOnIteration); - if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT) - DT.reset(); - MadeChange |= Changed; if (IsHugeFunc) { // If the BB is updated, it may still has chance to be optimized. @@ -638,8 +637,14 @@ eliminateFallThrough(F, DT.get()); #ifndef NDEBUG - if (MadeChange && VerifyLoopInfo) - LI->verify(getDT(F)); + if (MadeChange) { + if (VerifyDT) + assert(DT->verify(DominatorTree::VerificationLevel::Fast) && + "Incorrect DominatorTree updates in CGP"); + + if (VerifyLoopInfo) + LI->verify(*DT); + } #endif // Really free removed instructions during promotion. @@ -1504,14 +1509,13 @@ // Finally, we need to ensure that the insert point will dominate all // existing uses of the increment. - auto &DT = getDT(*BO->getParent()->getParent()); - if (DT.dominates(Cmp->getParent(), BO->getParent())) + if (DT->dominates(Cmp->getParent(), BO->getParent())) // If we're moving up the dom tree, all uses are trivially dominated. // (This is the common case for code produced by LSR.) return true; // Otherwise, special case the single use in the phi recurrence. - return BO->hasOneUse() && DT.dominates(Cmp->getParent(), L->getLoopLatch()); + return BO->hasOneUse() && DT->dominates(Cmp->getParent(), L->getLoopLatch()); }; if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) { // We used to use a dominator tree here to allow multi-block optimization. @@ -2189,6 +2193,7 @@ /// /// If the transform is performed, return true and set ModifiedDT to true. static bool despeculateCountZeros(IntrinsicInst *CountZeros, + DominatorTree &DT, LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, @@ -2216,8 +2221,10 @@ return false; // The intrinsic will be sunk behind a compare against zero and branch. + DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy); BasicBlock *StartBlock = CountZeros->getParent(); - BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + BasicBlock *CallBlock = SplitBlock(StartBlock, CountZeros, &DTU, &LI, + /* MSSAU */ nullptr, "cond.false"); if (IsHugeFunc) FreshBBs.insert(CallBlock); @@ -2225,17 +2232,11 @@ // in this block to select the result of the intrinsic or the bit-width // constant if the input to the intrinsic is zero. BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); - BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + BasicBlock *EndBlock = SplitBlock(CallBlock, &*SplitPt, &DTU, &LI, + /* MSSAU */ nullptr, "cond.end"); if (IsHugeFunc) FreshBBs.insert(EndBlock); - // Update the LoopInfo. The new blocks are in the same loop as the start - // block. - if (Loop *L = LI.getLoopFor(StartBlock)) { - L->addBasicBlockToLoop(CallBlock, LI); - L->addBasicBlockToLoop(EndBlock, LI); - } - // Set up a builder to create a compare, conditional branch, and PHI. IRBuilder<> Builder(CountZeros->getContext()); Builder.SetInsertPoint(StartBlock->getTerminator()); @@ -2250,6 +2251,7 @@ Value *Cmp = Builder.CreateICmpEQ(Op, Zero, "cmpz"); Builder.CreateCondBr(Cmp, EndBlock, CallBlock); StartBlock->getTerminator()->eraseFromParent(); + DTU.applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}}); // Create a PHI in the end block to select either the output of the intrinsic // or the bit width of the operand. @@ -2410,8 +2412,8 @@ case Intrinsic::cttz: case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. - return despeculateCountZeros(II, *LI, TLI, DL, ModifiedDT, FreshBBs, - IsHugeFunc); + return despeculateCountZeros(II, *DT, *LI,TLI, DL, ModifiedDT, + FreshBBs, IsHugeFunc); case Intrinsic::fshl: case Intrinsic::fshr: return optimizeFunnelShift(II); @@ -2570,6 +2572,7 @@ } bool Changed = false; + DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); for (auto const &TailCallBB : TailCallBBs) { // Make sure the call instruction is followed by an unconditional branch to // the return block. @@ -2578,7 +2581,7 @@ continue; // Duplicate the return into TailCallBB. - (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); + (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB, &DTU); assert(!VerifyBFIUpdates || BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB)); BFI->setBlockFreq( @@ -2591,7 +2594,7 @@ // If we eliminated all predecessors of the block, delete the block now. if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) - BB->eraseFromParent(); + DTU.deleteBB(BB); return Changed; } @@ -3285,7 +3288,7 @@ const TargetRegisterInfo &TRI; const DataLayout &DL; const LoopInfo &LI; - const std::function getDTFn; + const DominatorTree &DT; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and /// the memory instruction that we're computing this address for. @@ -3322,14 +3325,14 @@ AddressingModeMatcher( SmallVectorImpl &AMI, const TargetLowering &TLI, const TargetRegisterInfo &TRI, const LoopInfo &LI, - const std::function getDTFn, Type *AT, + const DominatorTree &DT, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, std::pair, int64_t> &LargeOffsetGEP, bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), - DL(MI->getModule()->getDataLayout()), LI(LI), getDTFn(getDTFn), + DL(MI->getModule()->getDataLayout()), LI(LI), DT(DT), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) { @@ -3348,14 +3351,14 @@ Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl &AddrModeInsts, const TargetLowering &TLI, const LoopInfo &LI, - const std::function getDTFn, + const DominatorTree &DT, const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, std::pair, int64_t> &LargeOffsetGEP, bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, LI, getDTFn, + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, LI, DT, AccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI) @@ -4136,7 +4139,7 @@ // (Note that we defer the (expensive) domtree base legality check // to the very last possible point.) if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace) && - getDTFn().dominates(IVInc, MemoryInst)) { + DT.dominates(IVInc, MemoryInst)) { AddrModeInsts.push_back(cast(IVInc)); AddrMode = TestAddrMode; return true; @@ -5237,7 +5240,7 @@ 0); TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, DT, AddressAccessTy, AS, UserI, Result, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); @@ -5343,15 +5346,8 @@ AddrModeInsts.clear(); std::pair, int64_t> LargeOffsetGEP(nullptr, 0); - // Defer the query (and possible computation of) the dom tree to point of - // actual use. It's expected that most address matches don't actually need - // the domtree. - auto getDTFn = [MemoryInst, this]() -> const DominatorTree & { - Function *F = MemoryInst->getParent()->getParent(); - return this->getDT(*F); - }; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn, + V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, *DT, *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI.get()); @@ -6014,7 +6010,7 @@ continue; bool inserted = false; for (auto &Pt : CurPts) { - if (getDT(F).dominates(Inst, Pt)) { + if (DT->dominates(Inst, Pt)) { replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc); RemovedInsts.insert(Pt); Pt->removeFromParent(); @@ -6023,7 +6019,7 @@ Changed = true; break; } - if (!getDT(F).dominates(Pt, Inst)) + if (!DT->dominates(Pt, Inst)) // Give up if we need to merge in a common dominator as the // experiments show it is not profitable. continue; @@ -6966,12 +6962,6 @@ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))) return false; - // The DominatorTree needs to be rebuilt by any consumers after this - // transformation. We simply reset here rather than setting the ModifiedDT - // flag to avoid restarting the function walk in runOnFunction for each - // select optimized. - DT.reset(); - // Transform a sequence like this: // start: // %cmp = cmp uge i32 %a, %b @@ -6996,6 +6986,9 @@ // block and its branch may be optimized away. In that case, one side of the // first branch will point directly to select.end, and the corresponding PHI // predecessor block will be the start block. + // The CFG is altered here and we update the DominatorTree and the LoopInfo, + // but we don't set a ModifiedDT flag to avoid restarting the function walk in + // runOnFunction for each select optimized. // Collect values that go on the true side and the values that go on the false // side. @@ -7009,6 +7002,7 @@ // Split the select block, according to how many (if any) values go on each // side. + DomTreeUpdater DTU(DT.get(), DomTreeUpdater::UpdateStrategy::Lazy); BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); @@ -7021,20 +7015,20 @@ BranchInst *TrueBranch = nullptr; BranchInst *FalseBranch = nullptr; if (TrueInstrs.size() == 0) { - FalseBranch = cast(SplitBlockAndInsertIfElse( - CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + FalseBranch = cast( + SplitBlockAndInsertIfElse(CondFr, &*SplitPt, false, nullptr, &DTU, LI)); FalseBlock = FalseBranch->getParent(); EndBlock = cast(FalseBranch->getOperand(0)); } else if (FalseInstrs.size() == 0) { - TrueBranch = - cast(SplitBlockAndInsertIfThen(CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + TrueBranch = cast( + SplitBlockAndInsertIfThen(CondFr, &*SplitPt, false, nullptr, &DTU, LI)); TrueBlock = TrueBranch->getParent(); EndBlock = cast(TrueBranch->getOperand(0)); } else { Instruction *ThenTerm = nullptr; Instruction *ElseTerm = nullptr; SplitBlockAndInsertIfThenElse(CondFr, &*SplitPt, &ThenTerm, &ElseTerm, - nullptr, nullptr, LI); + nullptr, &DTU, LI); TrueBranch = cast(ThenTerm); FalseBranch = cast(ElseTerm); TrueBlock = TrueBranch->getParent(); @@ -8311,13 +8305,9 @@ // For huge function we tend to quickly go though the inner optmization // opportunities in the BB. So we go back to the BB head to re-optimize // each instruction instead of go back to the function head. - if (IsHugeFunc) { - DT.reset(); - getDT(*BB.getParent()); + if (IsHugeFunc) break; - } else { - return true; - } + return true; } } } while (ModifiedDT == ModifyDT::ModifyInstDT);