diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h --- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -15,8 +15,10 @@ #ifndef LLVM_ANALYSIS_LOOPUNROLLANALYZER_H #define LLVM_ANALYSIS_LOOPUNROLLANALYZER_H +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/InstVisitor.h" // This class is used to get an estimate of the optimization effects that we @@ -36,6 +38,55 @@ // And finally: // v = b[1] namespace llvm { + +namespace { +/// A struct to densely store the state of an instruction after unrolling at +/// each iteration. +/// +/// This is designed to work like a tuple of for the +/// purposes of hashing and lookup, but to be able to associate two boolean +/// states with each key. +struct UnrolledInstState { + Instruction *I; + int Iteration : 30; + unsigned IsFree : 1; + unsigned IsCounted : 1; +}; + +/// Hashing and equality testing for a set of the instruction states. +struct UnrolledInstStateKeyInfo { + using PtrInfo = DenseMapInfo; + using PairInfo = DenseMapInfo>; + + static inline UnrolledInstState getEmptyKey() { + return {PtrInfo::getEmptyKey(), 0, 0, 0}; + } + + static inline UnrolledInstState getTombstoneKey() { + return {PtrInfo::getTombstoneKey(), 0, 0, 0}; + } + + static inline unsigned getHashValue(const UnrolledInstState &S) { + return PairInfo::getHashValue({S.I, S.Iteration}); + } + + static inline bool isEqual(const UnrolledInstState &LHS, + const UnrolledInstState &RHS) { + return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); + } +}; + +struct EstimatedUnrollCost { + /// The estimated cost after unrolling. + unsigned UnrolledCost; + + /// The estimated dynamic cost of executing the instructions in the + /// rolled form. + unsigned RolledDynamicCost; +}; + +} // namespace + class UnrolledInstAnalyzer : private InstVisitor { typedef InstVisitor Base; friend class InstVisitor; @@ -45,16 +96,27 @@ }; public: - UnrolledInstAnalyzer(unsigned Iteration, - DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { - IterationNumber = SE.getConstant(APInt(64, Iteration)); + UnrolledInstAnalyzer( + unsigned Iteration, DenseMap &SimplifiedValues, + const SmallPtrSetImpl &EphValues, + DenseSet &InstCostMap, + unsigned &RolledDynamicCost, unsigned &UnrolledCost, + unsigned MaxUnrolledLoopSize, ScalarEvolution &SE, + const TargetTransformInfo &TTI, const Loop *L) + : SimplifiedValues(SimplifiedValues), EphValues(EphValues), + InstCostMap(InstCostMap), RolledDynamicCost(RolledDynamicCost), + UnrolledCost(UnrolledCost), MaxUnrolledLoopSize(MaxUnrolledLoopSize), + SE(SE), TTI(TTI), L(L) { + IterationNumber = SE.getConstant(APInt(64, Iteration)); } // Allow access to the initial visit method. using Base::visit; + bool visitBB(BasicBlock &B); + + void AddCostRecursively(Instruction &RootI, int Iteration); + private: /// A cache of pointer bases and constant-folded offsets corresponding /// to GEP (or derived from GEP) instructions. @@ -77,8 +139,15 @@ /// constants after complete unrolling, and account for likely simplifications /// post-unrolling. DenseMap &SimplifiedValues; + const SmallPtrSetImpl &EphValues; + DenseSet &InstCostMap; + + unsigned &RolledDynamicCost; + unsigned &UnrolledCost; + unsigned MaxUnrolledLoopSize; ScalarEvolution &SE; + const TargetTransformInfo &TTI; const Loop *L; bool simplifyInstWithSCEV(Instruction *I); diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp --- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -16,6 +16,94 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + +// Helper function to accumulate cost for instructions in the loop. +void UnrolledInstAnalyzer::AddCostRecursively(Instruction &RootI, + int Iteration) { + // A small worklist used to accumulate cost of instructions from each + // observable and reached root in the loop. + SmallVector CostWorklist; + + // PHI-used worklist used between iterations while accumulating cost. + SmallVector PHIUsedList; + + assert(Iteration >= 0 && "Cannot have a negative iteration!"); + assert(CostWorklist.empty() && "Must start with an empty cost list"); + assert(PHIUsedList.empty() && "Must start with an empty phi used list"); + CostWorklist.push_back(&RootI); + for (;; --Iteration) { + do { + Instruction *I = CostWorklist.pop_back_val(); + + // InstCostMap only uses I and Iteration as a key, the other two values + // don't matter here. + auto CostIter = InstCostMap.find({I, Iteration, 0, 0}); + if (CostIter == InstCostMap.end()) + // If an input to a PHI node comes from a dead path through the loop + // we may have no cost data for it here. What that actually means is + // that it is free. + continue; + auto &Cost = *CostIter; + if (Cost.IsCounted) + // Already counted this instruction. + continue; + + // Mark that we are counting the cost of this instruction now. + Cost.IsCounted = true; + + // If this is a PHI node in the loop header, just add it to the PHI set. + if (auto *PhiI = dyn_cast(I)) + if (PhiI->getParent() == L->getHeader()) { + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); + if (Iteration == 0) + continue; + + // Push the incoming value from the backedge into the PHI used list + // if it is an in-loop instruction. We'll use this to populate the + // cost worklist for the next iteration (as we count backwards). + if (auto *OpI = dyn_cast( + PhiI->getIncomingValueForBlock(L->getLoopLatch()))) + if (L->contains(OpI)) + PHIUsedList.push_back(OpI); + continue; + } + + // First accumulate the cost of this instruction. + if (!Cost.IsFree) { + UnrolledCost += TTI.getUserCost(I); + LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration " + << Iteration << "): "); + LLVM_DEBUG(I->dump()); + } + + // We must count the cost of every operand which is not free, + // recursively. If we reach a loop PHI node, simply add it to the set + // to be considered on the next iteration (backwards!). + for (Value *Op : I->operands()) { + // Check whether this operand is free due to being a constant or + // outside the loop. + auto *OpI = dyn_cast(Op); + if (!OpI || !L->contains(OpI)) + continue; + + // Otherwise accumulate its cost. + CostWorklist.push_back(OpI); + } + } while (!CostWorklist.empty()); + + if (PHIUsedList.empty()) + // We've exhausted the search. + break; + + assert(Iteration > 0 && + "Cannot track PHI-used values past the first iteration!"); + CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end()); + PHIUsedList.clear(); + } +}; + /// Try to simplify instruction \param I using its SCEV expression. /// /// The idea is that some AddRec expressions become constants, which then @@ -212,3 +300,57 @@ // The loop induction PHI nodes are definitionally free. return PN.getParent() == L->getHeader(); } + +bool UnrolledInstAnalyzer::visitBB(BasicBlock &BB) { + int Iteration = + cast(IterationNumber)->getValue()->getZExtValue(); + for (Instruction &I : BB) { + // These won't get into the final code - don't even try calculating the + // cost for them. + if (isa(I) || EphValues.count(&I)) + continue; + + // Track this instruction's expected baseline cost when executing the + // rolled loop form. + RolledDynamicCost += TTI.getUserCost(&I); + + // Visit the instruction to analyze its loop cost after unrolling, + // and if the visitor returns true, mark the instruction as free after + // unrolling and continue. + bool IsFree = visit(I); + bool Inserted = InstCostMap + .insert({&I, (int)Iteration, (unsigned)IsFree, + /*IsCounted*/ false}) + .second; + (void)Inserted; + assert(Inserted && "Cannot have a state for an unvisited instruction!"); + + if (IsFree) + continue; + + // Can't properly model a cost of a call. + // FIXME: With a proper cost model we should be able to do it. + if (auto *CI = dyn_cast(&I)) { + const Function *Callee = CI->getCalledFunction(); + if (!Callee || TTI.isLoweredToCall(Callee)) { + LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n"); + return false; + } + } + + // If the instruction might have a side-effect recursively account for + // the cost of it and all the instructions leading up to it. + if (I.mayHaveSideEffects()) + AddCostRecursively(I, Iteration); + + // If unrolled body turns out to be too big, bail out. + if (UnrolledCost > MaxUnrolledLoopSize) { + LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); + return false; + } + } + return true; +} diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -257,51 +257,6 @@ namespace { -/// A struct to densely store the state of an instruction after unrolling at -/// each iteration. -/// -/// This is designed to work like a tuple of for the -/// purposes of hashing and lookup, but to be able to associate two boolean -/// states with each key. -struct UnrolledInstState { - Instruction *I; - int Iteration : 30; - unsigned IsFree : 1; - unsigned IsCounted : 1; -}; - -/// Hashing and equality testing for a set of the instruction states. -struct UnrolledInstStateKeyInfo { - using PtrInfo = DenseMapInfo; - using PairInfo = DenseMapInfo>; - - static inline UnrolledInstState getEmptyKey() { - return {PtrInfo::getEmptyKey(), 0, 0, 0}; - } - - static inline UnrolledInstState getTombstoneKey() { - return {PtrInfo::getTombstoneKey(), 0, 0, 0}; - } - - static inline unsigned getHashValue(const UnrolledInstState &S) { - return PairInfo::getHashValue({S.I, S.Iteration}); - } - - static inline bool isEqual(const UnrolledInstState &LHS, - const UnrolledInstState &RHS) { - return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); - } -}; - -struct EstimatedUnrollCost { - /// The estimated cost after unrolling. - unsigned UnrolledCost; - - /// The estimated dynamic cost of executing the instructions in the - /// rolled form. - unsigned RolledDynamicCost; -}; - } // end anonymous namespace /// Figure out if the loop is worth full unrolling. @@ -487,7 +442,9 @@ while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, EphValues, + InstCostMap, RolledDynamicCost, UnrolledCost, + MaxUnrolledLoopSize, SE, TTI, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -498,53 +455,8 @@ // Visit all instructions in the given basic block and try to simplify // it. We don't change the actual IR, just count optimization // opportunities. - for (Instruction &I : *BB) { - // These won't get into the final code - don't even try calculating the - // cost for them. - if (isa(I) || EphValues.count(&I)) - continue; - - // Track this instruction's expected baseline cost when executing the - // rolled loop form. - RolledDynamicCost += TTI.getUserCost(&I); - - // Visit the instruction to analyze its loop cost after unrolling, - // and if the visitor returns true, mark the instruction as free after - // unrolling and continue. - bool IsFree = Analyzer.visit(I); - bool Inserted = InstCostMap.insert({&I, (int)Iteration, - (unsigned)IsFree, - /*IsCounted*/ false}).second; - (void)Inserted; - assert(Inserted && "Cannot have a state for an unvisited instruction!"); - - if (IsFree) - continue; - - // Can't properly model a cost of a call. - // FIXME: With a proper cost model we should be able to do it. - if (auto *CI = dyn_cast(&I)) { - const Function *Callee = CI->getCalledFunction(); - if (!Callee || TTI.isLoweredToCall(Callee)) { - LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n"); - return None; - } - } - - // If the instruction might have a side-effect recursively account for - // the cost of it and all the instructions leading up to it. - if (I.mayHaveSideEffects()) - AddCostRecursively(I, Iteration); - - // If unrolled body turns out to be too big, bail out. - if (UnrolledCost > MaxUnrolledLoopSize) { - LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" - << " UnrolledCost: " << UnrolledCost - << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize - << "\n"); - return None; - } - } + if (!Analyzer.visitBB(*BB)) + return None; Instruction *TI = BB->getTerminator(); @@ -588,7 +500,7 @@ BBWorklist.insert(Succ); else ExitWorklist.insert({BB, Succ}); - AddCostRecursively(*TI, Iteration); + Analyzer.AddCostRecursively(*TI, Iteration); } // If we found no optimization opportunities on the first iteration, we diff --git a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp --- a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp +++ b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp @@ -26,6 +26,8 @@ bool runOnFunction(Function &F) override { LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); Function::iterator FI = F.begin(); FI++; // First basic block is entry - skip it. @@ -33,11 +35,16 @@ Loop *L = LI->getLoopFor(Header); BasicBlock *Exiting = L->getExitingBlock(); + DenseSet InstCostMap; + unsigned UnrolledCost = 0; + unsigned RolledDynamicCost = 0; + + SmallPtrSet EphValues; SimplifiedValuesVector.clear(); TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, EphValues, InstCostMap, RolledDynamicCost, UnrolledCost, 0,*SE, TTI, L); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I); @@ -46,6 +53,7 @@ return false; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -322,6 +330,7 @@ INITIALIZE_PASS_BEGIN(UnrollAnalyzerTest, "unrollanalyzertestpass", "unrollanalyzertestpass", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)