Index: include/llvm/Analysis/LoopUnrollAnalyzer.h =================================================================== --- include/llvm/Analysis/LoopUnrollAnalyzer.h +++ include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -48,8 +48,9 @@ public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { + ScalarEvolution &SE, const Loop *L, bool CompleteUnroll) + : SimplifiedValues(SimplifiedValues), SE(SE), + L(L), CompleteUnroll(CompleteUnroll) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -81,6 +82,7 @@ ScalarEvolution &SE; const Loop *L; + bool CompleteUnroll; bool simplifyInstWithSCEV(Instruction *I); Index: lib/Analysis/LoopUnrollAnalyzer.cpp =================================================================== --- lib/Analysis/LoopUnrollAnalyzer.cpp +++ lib/Analysis/LoopUnrollAnalyzer.cpp @@ -35,6 +35,8 @@ SimplifiedValues[I] = SC->getValue(); return true; } + if (!CompleteUnroll) + return false; auto *AR = dyn_cast(S); if (!AR || AR->getLoop() != L) @@ -211,6 +213,10 @@ if (Base::visitPHINode(PN)) return true; + // Consider PHI is foldable only after complete unroll. + if (!CompleteUnroll) + return false; + // The loop induction PHI nodes are definitionally free. return PN.getParent() == L->getHeader(); } Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -89,6 +89,11 @@ cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop.")); +static cl::opt UnrollAllowForce( + "unroll-allow-force", cl::Hidden, cl::init(false), + cl::desc("Allow generation of a loop remainder (extra iterations) " + "when unrolling a loop.")); + static cl::opt UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); @@ -229,6 +234,27 @@ }; } +/// Some instructions get the same value during loop execution. +/// The function find a cycle length for such. +static unsigned getPhiCycleLength (PHINode *PHI, const Loop *L) { + Value *V = PHI->getIncomingValueForBlock(L->getLoopLatch()); + Instruction *I = dyn_cast(V); + if (!I) + return 0; + switch (I->getOpcode()) { + case Instruction::Sub: + if (ConstantInt *CS = dyn_cast(I->getOperand(0))) + if (CS->isZero() && I->getOperand(1) == PHI) + return 2; + case Instruction::Xor: + if (I->getOperand(0) == PHI && L->isLoopInvariant(I->getOperand(1))) + return 2; + default: + return 0; + } + return 0; +} + namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. @@ -254,9 +280,9 @@ /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. static Optional -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, +analyzeLoopUnrollCost(const Loop *L, unsigned UnrollCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { + unsigned MaxUnrolledLoopSize, bool CompleteUnroll = true) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -268,9 +294,9 @@ if (!L->empty()) return None; - // Don't simulate loops with a big or unknown tripcount - if (!UnrollMaxIterationsCountToAnalyze || !TripCount || - TripCount > UnrollMaxIterationsCountToAnalyze) + // Don't simulate loops with a big or unknown UnrollCount + if (!UnrollMaxIterationsCountToAnalyze || !UnrollCount || + UnrollCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector BBWorklist; @@ -331,8 +357,10 @@ // If this is a PHI node in the loop header, just add it to the PHI set. if (auto *PhiI = dyn_cast(I)) if (PhiI->getParent() == L->getHeader()) { - assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " - "inherently simplify during unrolling."); + if (CompleteUnroll) + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during complete " + "unrolling."); if (Iteration == 0) continue; @@ -363,7 +391,15 @@ auto *OpI = dyn_cast(Op); if (!OpI || !L->contains(OpI)) continue; - + // For regular unroll instruction that depends on PHI also costs. + if (!CompleteUnroll && isa(OpI)) { + auto *OpIP = dyn_cast(OpI); + if(OpIP->getParent() == L->getHeader()) + if (auto *OpIPI = + dyn_cast( + OpIP->getIncomingValueForBlock(L->getLoopLatch()))) + CostWorklist.push_back(OpIPI); + } // Otherwise accumulate its cost. CostWorklist.push_back(OpI); } @@ -392,7 +428,7 @@ // which would be simplified. // Since the same load will take different values on different iterations, // we literally have to go through all loop's iterations. - for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + for (unsigned Iteration = 0; Iteration < UnrollCount; ++Iteration) { DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); // Prepare for the iteration by collecting any simplified entry or backedge @@ -408,10 +444,26 @@ PHI->getNumIncomingValues() == 2 && "Must have an incoming value only for the preheader and the latch."); + // If incoming value for PHI is another loop PHI, that means we need to + // store previous value (without unroll). + if (PHINode *PHIL = dyn_cast( + PHI->getIncomingValueForBlock(L->getLoopLatch()))) + if (Iteration && L->contains(PHIL)) + RolledDynamicCost++; + + unsigned PhiCycle = getPhiCycleLength (PHI, L); + // For complete unroll if we were unable to get PHI cycle length + // consider cycle length as full unroll count (TripCount). + if (CompleteUnroll && PhiCycle == 0) + PhiCycle = UnrollCount; + // If PHI has no cycle we are unable to simplify it. + if (PhiCycle == 0) + continue; + Value *V = PHI->getIncomingValueForBlock( - Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Iteration % PhiCycle ? L->getLoopLatch() : L->getLoopPreheader()); Constant *C = dyn_cast(V); - if (Iteration != 0 && !C) + if ((Iteration % PhiCycle) && !C) C = SimplifiedValues.lookup(V); if (C) SimplifiedInputValues.push_back({PHI, C}); @@ -421,8 +473,9 @@ SimplifiedValues.clear(); while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, + SE, L, CompleteUnroll); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -521,7 +574,7 @@ // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) { + if (CompleteUnroll && UnrolledCost == RolledDynamicCost) { DEBUG(dbgs() << " No opportunities found.. exiting.\n" << " UnrolledCost: " << UnrolledCost << "\n"); return None; @@ -538,9 +591,12 @@ break; Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpPN = dyn_cast(Op)) + if (OpPN->getParent() == L->getHeader()) + Op = OpPN->getIncomingValueForBlock(L->getLoopLatch()); if (auto *OpI = dyn_cast(Op)) if (L->contains(OpI)) - AddCostRecursively(*OpI, TripCount - 1); + AddCostRecursively(*OpI, UnrollCount - 1); } } @@ -907,6 +963,25 @@ << NV("UnrollCount", UP.Count) << " time(s)."); } + // Estimate if Force unroll could be profitable. + // Currently try only 2 times unroll. + // Do it only for innermost loops. + if (UnrollAllowForce && !ExplicitUnroll && + (UP.Count < 2 || + isa(SE->getBackedgeTakenCount(L)))) { + Optional Cost = + analyzeLoopUnrollCost(L, 2, DT, *SE, TTI, + UP.PartialThreshold + UP.BEInsns, false); + if (Cost && Cost->UnrolledCost < Cost->RolledDynamicCost) { + if ((unsigned)Cost->UnrolledCost <= UP.PartialThreshold) + UP.Force = true; + UP.Count = 2; + DEBUG(dbgs() << " unrolling with count: " << UP.Count << "\n"); + // Unroll uncountable loops only once. + return true; + } + } + if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); Index: test/Transforms/LoopUnroll/unroll-force.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-force.ll +++ test/Transforms/LoopUnroll/unroll-force.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -S -unroll-runtime -loop-unroll -unroll-allow-force | FileCheck %s +; The test checks unroll for uncountable loop, when previous value is stored. + +; CHECK: loop.body.if: +; CHECK: loop.body.if.1: + +%struct.list = type { %struct.list*, i32 } + +@head = global %struct.list { %struct.list* null, i32 1 }, align 8 + +; Function Attrs: norecurse nounwind uwtable +define i32 @main(i32 %argc, i8** nocapture readnone %argv) { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.list, %struct.list* @head, i64 0, i32 1), align 8 + br label %loop.body + +loop.body: ; preds = %entry, %loop.body.if + %prev = phi %struct.list* [ @head, %entry ], [ %cur, %loop.body.if ] + %cur = phi %struct.list* [ @head, %entry ], [ %2, %loop.body.if ] + %1 = phi i32 [ %0, %entry ], [ %3, %loop.body.if ] + %next.cur = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 0 + %2 = load %struct.list*, %struct.list** %next.cur, align 8 + %tobool = icmp eq %struct.list* %2, null + br i1 %tobool, label %if.else, label %loop.body.if + +loop.body.if: ; preds = %loop.body + %val = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 1 + %3 = load i32, i32* %val, align 8 + %cmp = icmp sgt i32 %3, %0 + br i1 %cmp, label %loopexit, label %loop.body + +if.else: ; preds = %if.then + %next.prev = getelementptr inbounds %struct.list, %struct.list* %prev, i64 0, i32 0 + store %struct.list* null, %struct.list** %next.prev, align 8 + br label %end + +loopexit: ; preds = %loop.body.if + br label %end + +end: ; preds = %loopexit, %if.else + %4 = phi i32 [ %1, %if.else ], [ %3, %loopexit ] + ret i32 %4 +} + +; The test checks unroll for uncountable loop, when XOR is simplified. + +; CHECK: while.body: +; CHECK: while.body.1: + +; Function Attrs: norecurse nounwind uwtable +define void @xor(i32* nocapture %a) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* %a, align 4 + %entry.cmp = icmp eq i32 %0, -1 + br i1 %entry.cmp, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %1 = phi i32 [ %2, %while.body ], [ %0, %entry ] + %arrayidx12 = phi i32* [ %arrayidx, %while.body ], [ %a, %entry ] + %s = phi i32 [ %xor, %while.body ], [ 0, %entry ] + %i = phi i32 [ %inc, %while.body ], [ undef, %entry ] + %add = add nsw i32 %1, %s + store i32 %add, i32* %arrayidx12, align 4 + %xor = xor i32 %s, 1 + %inc = add nsw i32 %i, 1 + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %inc + %2 = load i32, i32* %arrayidx, align 4 + %cmp = icmp eq i32 %2, -1 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + Index: unittests/Analysis/UnrollAnalyzer.cpp =================================================================== --- unittests/Analysis/UnrollAnalyzer.cpp +++ unittests/Analysis/UnrollAnalyzer.cpp @@ -38,7 +38,7 @@ TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, true); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I);