Index: include/llvm/Analysis/LoopUnrollAnalyzer.h =================================================================== --- include/llvm/Analysis/LoopUnrollAnalyzer.h +++ include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -48,8 +48,9 @@ public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { + ScalarEvolution &SE, const Loop *L, bool CompleteUnroll) + : SimplifiedValues(SimplifiedValues), SE(SE), + L(L), CompleteUnroll(CompleteUnroll) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -81,6 +82,7 @@ ScalarEvolution &SE; const Loop *L; + bool CompleteUnroll; bool simplifyInstWithSCEV(Instruction *I); Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -326,6 +326,11 @@ UP.Partial = UP.Runtime = UP.UpperBound = true; UP.PartialThreshold = MaxOps; + const TargetMachine &TM = getTLI()->getTargetMachine(); + // Unroll uncountable inner loops at O3 and higher. + if (L->getSubLoops().size() == 0 && TM.getOptLevel() > CodeGenOpt::Default) + UP.Force = true; + // Avoid unrolling when optimizing for size. UP.OptSizeThreshold = 0; UP.PartialOptSizeThreshold = 0; Index: lib/Analysis/LoopUnrollAnalyzer.cpp =================================================================== --- lib/Analysis/LoopUnrollAnalyzer.cpp +++ lib/Analysis/LoopUnrollAnalyzer.cpp @@ -35,6 +35,8 @@ SimplifiedValues[I] = SC->getValue(); return true; } + if (!CompleteUnroll) + return false; auto *AR = dyn_cast(S); if (!AR || AR->getLoop() != L) @@ -211,6 +213,10 @@ if (Base::visitPHINode(PN)) return true; + // Consider PHI is foldable only after complete unroll. + if (!CompleteUnroll) + return false; + // The loop induction PHI nodes are definitionally free. return PN.getParent() == L->getHeader(); } Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -89,6 +89,11 @@ cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop.")); +static cl::opt UnrollUncountable( + "unroll-uncountable", cl::Hidden, cl::init(false), + cl::desc("Allow unroll of uncountable (trip count is not countable) " + "loops when profitable.")); + static cl::opt UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); @@ -185,6 +190,8 @@ UP.Runtime = UnrollRuntime; if (UnrollMaxUpperBound == 0) UP.UpperBound = false; + if (UnrollUncountable.getNumOccurrences() > 0) + UP.Force = true; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; @@ -239,6 +246,27 @@ }; } +/// Some instructions get the same value during loop execution. +/// The function find a cycle length for such. +static unsigned getPhiCycleLength (PHINode *PHI, const Loop *L) { + Value *V = PHI->getIncomingValueForBlock(L->getLoopLatch()); + Instruction *I = dyn_cast(V); + if (!I) + return 0; + switch (I->getOpcode()) { + case Instruction::Sub: + if (ConstantInt *CS = dyn_cast(I->getOperand(0))) + if (CS->isZero() && I->getOperand(1) == PHI) + return 2; + case Instruction::Xor: + if (I->getOperand(0) == PHI && L->isLoopInvariant(I->getOperand(1))) + return 2; + default: + return 0; + } + return 0; +} + namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. @@ -264,9 +292,9 @@ /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. static Optional -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, +analyzeLoopUnrollCost(const Loop *L, unsigned UnrollCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { + unsigned MaxUnrolledLoopSize, bool CompleteUnroll = true) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -278,9 +306,9 @@ if (!L->empty()) return None; - // Don't simulate loops with a big or unknown tripcount - if (!UnrollMaxIterationsCountToAnalyze || !TripCount || - TripCount > UnrollMaxIterationsCountToAnalyze) + // Don't simulate loops with a big or unknown UnrollCount + if (!UnrollMaxIterationsCountToAnalyze || !UnrollCount || + UnrollCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector BBWorklist; @@ -341,8 +369,10 @@ // If this is a PHI node in the loop header, just add it to the PHI set. if (auto *PhiI = dyn_cast(I)) if (PhiI->getParent() == L->getHeader()) { - assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " - "inherently simplify during unrolling."); + if (CompleteUnroll) + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during complete " + "unrolling."); if (Iteration == 0) continue; @@ -373,7 +403,15 @@ auto *OpI = dyn_cast(Op); if (!OpI || !L->contains(OpI)) continue; - + // For regular unroll instruction that depends on PHI also costs. + if (!CompleteUnroll && isa(OpI)) { + auto *OpIP = dyn_cast(OpI); + if(OpIP->getParent() == L->getHeader()) + if (auto *OpIPI = + dyn_cast( + OpIP->getIncomingValueForBlock(L->getLoopLatch()))) + CostWorklist.push_back(OpIPI); + } // Otherwise accumulate its cost. CostWorklist.push_back(OpI); } @@ -402,7 +440,7 @@ // which would be simplified. // Since the same load will take different values on different iterations, // we literally have to go through all loop's iterations. - for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + for (unsigned Iteration = 0; Iteration < UnrollCount; ++Iteration) { DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); // Prepare for the iteration by collecting any simplified entry or backedge @@ -418,10 +456,26 @@ PHI->getNumIncomingValues() == 2 && "Must have an incoming value only for the preheader and the latch."); + // If incoming value for PHI is another loop PHI, that means we need to + // store previous value (without unroll). + if (PHINode *PHIL = dyn_cast( + PHI->getIncomingValueForBlock(L->getLoopLatch()))) + if (Iteration && L->contains(PHIL)) + RolledDynamicCost++; + + unsigned PhiCycle = getPhiCycleLength (PHI, L); + // For complete unroll if we were unable to get PHI cycle length + // consider cycle length as full unroll count (TripCount). + if (CompleteUnroll && PhiCycle == 0) + PhiCycle = UnrollCount; + // If PHI has no cycle we are unable to simplify it. + if (PhiCycle == 0) + continue; + Value *V = PHI->getIncomingValueForBlock( - Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Iteration % PhiCycle ? L->getLoopLatch() : L->getLoopPreheader()); Constant *C = dyn_cast(V); - if (Iteration != 0 && !C) + if ((Iteration % PhiCycle) && !C) C = SimplifiedValues.lookup(V); if (C) SimplifiedInputValues.push_back({PHI, C}); @@ -431,8 +485,9 @@ SimplifiedValues.clear(); while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, + SE, L, CompleteUnroll); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -531,7 +586,7 @@ // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) { + if (CompleteUnroll && UnrolledCost == RolledDynamicCost) { DEBUG(dbgs() << " No opportunities found.. exiting.\n" << " UnrolledCost: " << UnrolledCost << "\n"); return None; @@ -548,9 +603,12 @@ break; Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpPN = dyn_cast(Op)) + if (OpPN->getParent() == L->getHeader()) + Op = OpPN->getIncomingValueForBlock(L->getLoopLatch()); if (auto *OpI = dyn_cast(Op)) if (L->contains(OpI)) - AddCostRecursively(*OpI, TripCount - 1); + AddCostRecursively(*OpI, UnrollCount - 1); } } @@ -917,6 +975,25 @@ << NV("UnrollCount", UP.Count) << " time(s)."); } + // Estimate if Force unroll could be profitable. + if (!isa(SE->getBackedgeTakenCount(L)) && UP.Count >= 2 + && !ExplicitUnroll) { + UP.Force = false; + } else { + Optional Cost = + analyzeLoopUnrollCost(L, 2, DT, *SE, TTI, + UP.PartialThreshold + UP.BEInsns, false); + if (!Cost || Cost->UnrolledCost >= Cost->RolledDynamicCost || + (unsigned)Cost->UnrolledCost > UP.PartialThreshold) + UP.Force = false; + } + if (UP.Force) { + // Currently force unroll only by 2. + UP.Count = 2; + DEBUG(dbgs() << " unrolling with count: " << UP.Count << "\n"); + // Unroll uncountable loops only once. + return true; + } if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); Index: unittests/Analysis/UnrollAnalyzer.cpp =================================================================== --- unittests/Analysis/UnrollAnalyzer.cpp +++ unittests/Analysis/UnrollAnalyzer.cpp @@ -38,7 +38,7 @@ TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, true); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I);