Index: include/llvm/Analysis/LoopUnrollAnalyzer.h =================================================================== --- include/llvm/Analysis/LoopUnrollAnalyzer.h +++ include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -48,8 +48,9 @@ public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { + ScalarEvolution &SE, const Loop *L, bool CompleteUnroll) + : SimplifiedValues(SimplifiedValues), SE(SE), + L(L), CompleteUnroll(CompleteUnroll) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -81,6 +82,7 @@ ScalarEvolution &SE; const Loop *L; + bool CompleteUnroll; bool simplifyInstWithSCEV(Instruction *I); Index: lib/Analysis/LoopUnrollAnalyzer.cpp =================================================================== --- lib/Analysis/LoopUnrollAnalyzer.cpp +++ lib/Analysis/LoopUnrollAnalyzer.cpp @@ -35,6 +35,8 @@ SimplifiedValues[I] = SC->getValue(); return true; } + if (!CompleteUnroll) + return false; auto *AR = dyn_cast(S); if (!AR || AR->getLoop() != L) @@ -211,6 +213,10 @@ if (Base::visitPHINode(PN)) return true; + // Consider PHI is foldable only after complete unroll. + if (!CompleteUnroll) + return false; + // The loop induction PHI nodes are definitionally free. return PN.getParent() == L->getHeader(); } Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -244,7 +244,7 @@ static Optional analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, - int MaxUnrolledLoopSize) { + int MaxUnrolledLoopSize, bool CompleteUnroll = true) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -319,8 +319,9 @@ // If this is a PHI node in the loop header, just add it to the PHI set. if (auto *PhiI = dyn_cast(I)) if (PhiI->getParent() == L->getHeader()) { - assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " - "inherently simplify during unrolling."); + if (CompleteUnroll) + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); if (Iteration == 0) continue; @@ -396,21 +397,46 @@ PHI->getNumIncomingValues() == 2 && "Must have an incoming value only for the preheader and the latch."); - Value *V = PHI->getIncomingValueForBlock( - Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Value *VP = PHI->getIncomingValueForBlock(L->getLoopPreheader()); + Value *VL = PHI->getIncomingValueForBlock(L->getLoopLatch()); + Value *V = Iteration == 0 ? VP : VL; Constant *C = dyn_cast(V); if (Iteration != 0 && !C) C = SimplifiedValues.lookup(V); - if (C) + if (C && CompleteUnroll) SimplifiedInputValues.push_back({PHI, C}); + // If incoming value for PHI is another PHI, that means we need to + // store previous value. + if (Iteration != 0 && isa(VL)) + RolledDynamicCost++; + // Simplify recurent XOR: + // I = PHI [VP, VL] + // VL = I ^ X + // + // To: + // I1 = VP + // VL1 = VP ^ X + // I2 = VP ^ X + // VL2 = VP + if ((TripCount & 1) == 0) { + Instruction *IL = dyn_cast(VL); + if (IL && IL->getOpcode() == Instruction::Xor && + IL->getOperand(0) == PHI && L->isLoopInvariant(IL->getOperand(1))) { + Constant *CP = dyn_cast(VP); + if (Iteration & 1) + CP = SimplifiedValues.lookup(VL); + if (CP) + SimplifiedInputValues.push_back({PHI, CP}); + } + } } // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L, CompleteUnroll); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -509,7 +535,7 @@ // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) { + if (CompleteUnroll && UnrolledCost == RolledDynamicCost) { DEBUG(dbgs() << " No opportunities found.. exiting.\n" << " UnrolledCost: " << UnrolledCost << "\n"); return None; @@ -913,6 +939,21 @@ << NV("UnrollCount", UP.Count) << " time(s)."); } + // Estimate if Force unroll could be profitable. + // Currently try only 2 times unroll. + // Do it only for innermost loops with unknown trip count. + // For known trip count Runtime unroll is more profitable. + if (!ExplicitUnroll && L->getSubLoops().size() == 0 && + isa(SE->getBackedgeTakenCount(L))) { + Optional Cost = + analyzeLoopUnrollCost(L, 2, DT, *SE, TTI, + UP.PartialThreshold, false); + if (Cost && Cost->UnrolledCost < Cost->RolledDynamicCost) { + UP.Force = true; + UP.Count = 2; + } + } + if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); Index: unittests/Analysis/UnrollAnalyzer.cpp =================================================================== --- unittests/Analysis/UnrollAnalyzer.cpp +++ unittests/Analysis/UnrollAnalyzer.cpp @@ -38,7 +38,7 @@ TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, true); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I);