Index: include/llvm/Analysis/LoopUnrollAnalyzer.h =================================================================== --- include/llvm/Analysis/LoopUnrollAnalyzer.h +++ include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -48,8 +48,9 @@ public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { + ScalarEvolution &SE, const Loop *L, bool CompleteUnroll) + : SimplifiedValues(SimplifiedValues), SE(SE), + L(L), CompleteUnroll(CompleteUnroll) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -81,6 +82,7 @@ ScalarEvolution &SE; const Loop *L; + bool CompleteUnroll; bool simplifyInstWithSCEV(Instruction *I); Index: lib/Analysis/LoopUnrollAnalyzer.cpp =================================================================== --- lib/Analysis/LoopUnrollAnalyzer.cpp +++ lib/Analysis/LoopUnrollAnalyzer.cpp @@ -35,6 +35,8 @@ SimplifiedValues[I] = SC->getValue(); return true; } + if (!CompleteUnroll) + return false; auto *AR = dyn_cast(S); if (!AR || AR->getLoop() != L) @@ -211,6 +213,10 @@ if (Base::visitPHINode(PN)) return true; + // Consider PHI is foldable only after complete unroll. + if (!CompleteUnroll) + return false; + // The loop induction PHI nodes are definitionally free. return PN.getParent() == L->getHeader(); } Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -250,7 +250,7 @@ static Optional analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, - int MaxUnrolledLoopSize) { + int MaxUnrolledLoopSize, bool CompleteUnroll = true) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -325,8 +325,9 @@ // If this is a PHI node in the loop header, just add it to the PHI set. if (auto *PhiI = dyn_cast(I)) if (PhiI->getParent() == L->getHeader()) { - assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " - "inherently simplify during unrolling."); + if (CompleteUnroll) + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); if (Iteration == 0) continue; @@ -402,21 +403,49 @@ PHI->getNumIncomingValues() == 2 && "Must have an incoming value only for the preheader and the latch."); - Value *V = PHI->getIncomingValueForBlock( - Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); - Constant *C = dyn_cast(V); - if (Iteration != 0 && !C) - C = SimplifiedValues.lookup(V); - if (C) - SimplifiedInputValues.push_back({PHI, C}); + Value *VP = PHI->getIncomingValueForBlock(L->getLoopPreheader()); + Value *VL = PHI->getIncomingValueForBlock(L->getLoopLatch()); + if (CompleteUnroll) { + Value *V = Iteration == 0 ? VP : VL; + Constant *C = dyn_cast(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); + } + // If incoming value for PHI is another PHI, that means we need to + // store previous value. + Instruction *IL = dyn_cast(VL); + if (Iteration != 0 && isa(VL) && IL && L->contains(IL)) + RolledDynamicCost++; + // Simplify recurent XOR: + // I = PHI [VP, VL] + // VL = I ^ X + // + // To: + // I1 = VP + // VL1 = VP ^ X + // I2 = VP ^ X + // VL2 = VP + if ((TripCount & 1) == 0) { + if (IL && IL->getOpcode() == Instruction::Xor && + IL->getOperand(0) == PHI && L->isLoopInvariant(IL->getOperand(1))) { + Constant *CP = dyn_cast(VP); + if (Iteration & 1) + CP = SimplifiedValues.lookup(VL); + if (CP) + SimplifiedInputValues.push_back({PHI, CP}); + } + } } // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, + SE, L, CompleteUnroll); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -515,7 +544,7 @@ // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) { + if (CompleteUnroll && UnrolledCost == RolledDynamicCost) { DEBUG(dbgs() << " No opportunities found.. exiting.\n" << " UnrolledCost: " << UnrolledCost << "\n"); return None; @@ -532,6 +561,9 @@ break; Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpPN = dyn_cast(Op)) + if (OpPN->getParent() == L->getHeader()) + Op = OpPN->getIncomingValueForBlock(L->getLoopLatch()); if (auto *OpI = dyn_cast(Op)) if (L->contains(OpI)) AddCostRecursively(*OpI, TripCount - 1); @@ -929,6 +961,24 @@ << NV("UnrollCount", UP.Count) << " time(s)."); } + // Estimate if Force unroll could be profitable. + // Currently try only 2 times unroll. + // Do it only for innermost loops with unknown trip count. + // For know trip count Runtime unroll is more profitable. + if (!ExplicitUnroll && + isa(SE->getBackedgeTakenCount(L))) { + Optional Cost = + analyzeLoopUnrollCost(L, 2, DT, *SE, TTI, + UP.PartialThreshold, false); + if (Cost && Cost->UnrolledCost < Cost->RolledDynamicCost) { + UP.Force = true; + UP.Count = 2; + // Unroll uncountable loops only once. + DEBUG(dbgs() << " unrolling with count: " << UP.Count << "\n"); + return true; + } + } + if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); Index: test/Transforms/LoopUnroll/unroll-force.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-force.ll +++ test/Transforms/LoopUnroll/unroll-force.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -S -unroll-runtime -loop-unroll | FileCheck %s +; The test checks unroll for uncountable loop, when previous value is stored. + +; CHECK: loop.body.if: +; CHECK: loop.body.if.1: + +%struct.list = type { %struct.list*, i32 } + +@head = global %struct.list { %struct.list* null, i32 1 }, align 8 + +; Function Attrs: norecurse nounwind uwtable +define i32 @main(i32 %argc, i8** nocapture readnone %argv) { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.list, %struct.list* @head, i64 0, i32 1), align 8 + br label %loop.body + +loop.body: ; preds = %entry, %loop.body.if + %prev = phi %struct.list* [ @head, %entry ], [ %cur, %loop.body.if ] + %cur = phi %struct.list* [ @head, %entry ], [ %2, %loop.body.if ] + %1 = phi i32 [ %0, %entry ], [ %3, %loop.body.if ] + %next.cur = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 0 + %2 = load %struct.list*, %struct.list** %next.cur, align 8 + %tobool = icmp eq %struct.list* %2, null + br i1 %tobool, label %if.else, label %loop.body.if + +loop.body.if: ; preds = %loop.body + %val = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 1 + %3 = load i32, i32* %val, align 8 + %cmp = icmp sgt i32 %3, %0 + br i1 %cmp, label %loopexit, label %loop.body + +if.else: ; preds = %if.then + %next.prev = getelementptr inbounds %struct.list, %struct.list* %prev, i64 0, i32 0 + store %struct.list* null, %struct.list** %next.prev, align 8 + br label %end + +loopexit: ; preds = %loop.body.if + br label %end + +end: ; preds = %loopexit, %if.else + %4 = phi i32 [ %1, %if.else ], [ %3, %loopexit ] + ret i32 %4 +} + +; The test checks unroll for uncountable loop, when XOR is simplified. + +; CHECK: while.body: +; CHECK: while.body.1: + +; Function Attrs: norecurse nounwind uwtable +define void @xor(i32* nocapture %a) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* %a, align 4 + %entry.cmp = icmp eq i32 %0, -1 + br i1 %entry.cmp, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %1 = phi i32 [ %2, %while.body ], [ %0, %entry ] + %arrayidx12 = phi i32* [ %arrayidx, %while.body ], [ %a, %entry ] + %s = phi i32 [ %xor, %while.body ], [ 0, %entry ] + %i = phi i32 [ %inc, %while.body ], [ undef, %entry ] + %add = add nsw i32 %1, %s + store i32 %add, i32* %arrayidx12, align 4 + %xor = xor i32 %s, 1 + %inc = add nsw i32 %i, 1 + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %inc + %2 = load i32, i32* %arrayidx, align 4 + %cmp = icmp eq i32 %2, -1 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + Index: unittests/Analysis/UnrollAnalyzer.cpp =================================================================== --- unittests/Analysis/UnrollAnalyzer.cpp +++ unittests/Analysis/UnrollAnalyzer.cpp @@ -38,7 +38,7 @@ TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, true); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I);