Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -33,6 +33,7 @@ class PredIteratorCache; class ScalarEvolution; class TargetLibraryInfo; +class TargetTransformInfo; /// \brief Captures loop safety information. /// It keep information for loop & its header may throw exception. @@ -357,11 +358,12 @@ /// first order w.r.t the DominatorTree. This allows us to visit definitions /// before uses, allowing us to hoist a loop body in one pass without iteration. /// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, DataLayout, -/// TargetLibraryInfo, Loop, AliasSet information for all instructions of the -/// loop and loop safety information as arguments. It returns changed status. +/// TargetLibraryInfo, TargetTransformInfo, Loop, AliasSet information for all +/// instructions of the loop and loop safety information as arguments. It +/// returns changed status. bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, Loop *, AliasSetTracker *, - LICMSafetyInfo *); + TargetLibraryInfo *, TargetTransformInfo *TTI, Loop *, + AliasSetTracker *, LICMSafetyInfo *); /// \brief Try to promote memory values to scalars by sinking stores out of /// the loop and moving loads to before the loop. We do this by looping over Index: lib/Transforms/Scalar/LICM.cpp =================================================================== --- lib/Transforms/Scalar/LICM.cpp +++ lib/Transforms/Scalar/LICM.cpp @@ -42,6 +42,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -105,6 +106,18 @@ LICMSafetyInfo *SafetyInfo); namespace { + struct HoistAux { + Instruction *I; // The candidate instruction to be promoted. + unsigned Cost; // How much cost can be saved if I is promoted. + SmallVector Deps; // The hoist candidates which generate + // values used in current instruction. + unsigned HoistOrder; // The order number in which the instruction is + // hoisted. Used for sanity check. + HoistAux(Instruction *Inst) : I(Inst), Cost(0), HoistOrder(0) {} + }; +} + +namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid LICM() : LoopPass(ID) { @@ -119,6 +132,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } @@ -135,6 +149,7 @@ DominatorTree *DT; // Dominator Tree for the current Loop. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. + TargetTransformInfo *TTI; // TargetTransformInfo to get register num. // State that is updated as we process loops. bool Changed; // Set to true when we change anything. @@ -162,6 +177,7 @@ INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -182,6 +198,8 @@ DT = &getAnalysis().getDomTree(); TLI = &getAnalysis().getTLI(); + Function *F = L->getHeader()->getParent(); + TTI = &getAnalysis().getTTI(*F); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -210,7 +228,7 @@ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, CurAST, &SafetyInfo); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any @@ -314,13 +332,176 @@ return Changed; } +/// Get Instruction cost. If the loop invariant instruction is expensive, +/// promote it to outside of the loop is beneficial even if the increased +/// live range will cause more spill. +/// +static unsigned getInstructionCost(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::BitCast: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + return 0; + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Load: + case Instruction::ZExt: + case Instruction::SExt: + return 1; + default: + return 2; + } +} + +/// Hoist loop invariant candidates saved in HoistVec. +/// When a value is hoisted outside of a loop, its live range is stretched to +/// cover the whole loop body. If there are many such long stretched values +/// and the loop size is huge, it can increase the compile time of Lazy Value +/// Information analysis (LVI) and region based register splitting a lot. +/// To alleviate the problem, LICM is throttled for large size loop and for +/// non-expensive instructions. A simple rough heuristic is used here: for +/// large loop, cheap invariant instruction will not be hoisted if instructions +/// already being hoisted reach the register number of the target arch. +/// +static SmallVector HoistVec; +static bool selectHoist(DominatorTree *DT, Loop *CurLoop, + LICMSafetyInfo *SafetyInfo, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI) { + bool Changed = false; + unsigned UsedReg = 0; + unsigned LoopSize = CurLoop->getNumBlocks(); + unsigned RegNum; + unsigned Order = 0; + if (LoopSize < 30) { + RegNum = UINT_MAX; + } else if (LoopSize < 100) { + RegNum = + std::max(TTI->getNumberOfRegisters(0), TTI->getNumberOfRegisters(1)); + } else { + RegNum = 0; + } + SmallVector ExpHoistVec; + SmallVector ChpHoistVec; + for (auto HAI = HoistVec.rbegin(); HAI != HoistVec.rend(); ++HAI) { + HoistAux *HA = (*HAI); + HA->Cost += getInstructionCost(HA->I); + // If current candidate is expensive, mark its deps also as expensive. + for (auto HADep : HA->Deps) + HADep->Cost += HA->Cost; + if (HA->Cost > 1) + ExpHoistVec.push_back(HA); + else + ChpHoistVec.push_back(HA); + } + // For expensive candidates, hoist them anyway. + for (auto HAI = ExpHoistVec.rbegin(); HAI != ExpHoistVec.rend(); ++HAI) { + HoistAux *HA = (*HAI); + // It is needed to check isSafeToExecuteUnconditionally before hoist again + // because we have following case: + // define void @test(..., dereferenceable(8) %cptr, ...) + // for.body: + // ... + // if.then: ; preds = %for.body + // %c = load i32*, i32** %cptr, !dereferenceable !0 + // %1 = load i32, i32* %c, align 4 + // ... + // br label %for.inc + // for.inc: + // ... + // br i1 %exitcond, label %for.end, label %for.body + // %c = load i32*, i32** %cptr, ... can be speculatively hoisted because + // %cptr is a param marked as a dereferenceable. Once %c = load ... is + // hoisted, the dereferenceable attribute associated with %c should be + // dropped because %c = load ... is not in its original context anymore. + // So the value %c used in the second load will not have dereferenceable + // attribute after the first load is hoisted and it should not be hoisted + // even if it is already in HoistVec. + if (isSafeToExecuteUnconditionally( + *(HA->I), DT, TLI, CurLoop, SafetyInfo, + CurLoop->getLoopPreheader()->getTerminator())) { + Changed |= hoist(*(HA->I), DT, CurLoop, SafetyInfo); + Order++; + HA->HoistOrder = Order; + UsedReg++; + } + } + // For cheap candidates, hoist them only when there is available register + // left. + for (auto HAI = ChpHoistVec.rbegin(); HAI != ChpHoistVec.rend(); ++HAI) { + HoistAux *HA = (*HAI); + if (UsedReg < RegNum && isSafeToExecuteUnconditionally( + *(HA->I), DT, TLI, CurLoop, SafetyInfo, + CurLoop->getLoopPreheader()->getTerminator())) { + Changed |= hoist(*(HA->I), DT, CurLoop, SafetyInfo); + Order++; + HA->HoistOrder = Order; + UsedReg++; + } else { + break; + } + } +#ifndef NDEBUG + // Add a sanity check to ensure if an Instruction is hoisted, the + // instructions it depends on are hoisted before it. + for (auto HA : HoistVec) { + if (HA->HoistOrder) { + for (auto HADep : HA->Deps) { + assert(HADep->HoistOrder && "Dep Instruction is not hoisted"); + assert(HADep->HoistOrder < HA->HoistOrder && "Hoist order is wrong"); + } + } + } +#endif + return Changed; +} + +/// Check if an instruction is loop invariant. +/// All the invariant candidates will be collected into HoistVec first and +/// then selected in selectHoist once for all. So when current func is called +/// for an instruction, its deps havn't been hoisted yet. So an operand is +/// loop invariant either when it is defined outside of current loop, or it +/// is defined by instruction in HoistVec. +/// +static bool isLICMInvariant(Loop *L, Instruction *I, + SmallVector &Deps) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!L->isLoopInvariant(I->getOperand(i))) { + Instruction *ChildI = dyn_cast(I->getOperand(i)); + if (!ChildI) + continue; + bool found = false; + for (auto HA : HoistVec) { + if (HA->I == ChildI) { + found = true; + Deps.push_back(HA); + break; + } + } + if (!found) + return false; + } + } + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, Loop *CurLoop, AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && @@ -334,8 +515,8 @@ // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - bool Changed = false; - if (!inSubLoop(BB, CurLoop, LI)) + if (!inSubLoop(BB, CurLoop, LI)) { + SmallVector Deps; for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { Instruction &I = *II++; // Try constant folding this instruction. If all the operands are @@ -355,16 +536,30 @@ // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. // - if (CurLoop->hasLoopInvariantOperands(&I) && + Deps.clear(); + if (isLICMInvariant(CurLoop, &I, Deps) && canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) && - isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, - CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, DT, CurLoop, SafetyInfo); + isSafeToExecuteUnconditionally( + I, DT, TLI, CurLoop, SafetyInfo, + CurLoop->getLoopPreheader()->getTerminator())) { + HoistAux *HA = new HoistAux(&I); + HA->Deps.insert(HA->Deps.end(), Deps.begin(), Deps.end()); + HoistVec.push_back(HA); + } } + } const std::vector &Children = N->getChildren(); for (DomTreeNode *Child : Children) - Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + hoistRegion(Child, AA, LI, DT, TLI, TTI, CurLoop, CurAST, SafetyInfo); + + bool Changed = false; + if (N == DT->getNode(CurLoop->getHeader())) { + Changed = selectHoist(DT, CurLoop, SafetyInfo, TLI, TTI); + for (auto HA : HoistVec) + delete HA; + HoistVec.clear(); + } return Changed; }