Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -382,6 +382,8 @@ bool UpperBound; /// Allow peeling off loop iterations for loops with low dynamic tripcount. bool AllowPeeling; + /// Allow peeling off all the iterations of the runtime loop remainder. + bool PeelRemainder; }; /// \brief Get target-customized preferences for the generic loop unrolling Index: include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- include/llvm/Transforms/Utils/UnrollLoop.h +++ include/llvm/Transforms/Utils/UnrollLoop.h @@ -42,14 +42,17 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst, - unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI, + unsigned TripMultiple, unsigned PeelCount, bool PeelRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, - bool UseEpilogRemainder, LoopInfo *LI, + bool UseEpilogRemainder, bool PeelRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, bool PreserveLCSSA); void computePeelCount(Loop *L, unsigned LoopSize, Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -114,6 +114,10 @@ cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt UnrollPeelRemainder( + "peel-remainder", cl::Hidden, + cl::desc("Allow the loop remainder to be peeled.")); + // This option isn't ever intended to be enabled, it serves to allow // experiments to check the assumptions about when this kind of revisit is // necessary. @@ -152,6 +156,7 @@ UP.Partial = false; UP.Runtime = false; UP.AllowRemainder = true; + UP.PeelRemainder = false; UP.AllowExpensiveTripCount = false; UP.Force = false; UP.UpperBound = false; @@ -187,6 +192,8 @@ UP.UpperBound = false; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; + if (UnrollPeelRemainder.getNumOccurrences() > 0) + UP.PeelRemainder = UnrollPeelRemainder; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -938,7 +945,7 @@ Optional ProvidedUpperBound) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - if (HasUnrollDisablePragma(L)) + if (HasUnrollDisablePragma(L)) return false; if (!L->isLoopSimplifyForm()) { DEBUG( @@ -1032,7 +1039,8 @@ // Unroll the loop. if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, + TripMultiple, UP.PeelCount, UP.PeelRemainder, + LI, &SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; Index: lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- lib/Transforms/Utils/LoopUnroll.cpp +++ lib/Transforms/Utils/LoopUnroll.cpp @@ -295,7 +295,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst, - unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI, + unsigned TripMultiple, unsigned PeelCount, + bool PeelRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) { @@ -418,7 +419,8 @@ if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, - EpilogProfitability, LI, SE, DT, + EpilogProfitability, PeelRemainder, + LI, SE, DT, AC, PreserveLCSSA)) { if (Force) RuntimeTripCount = false; Index: lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -294,7 +294,8 @@ /// Return the new cloned loop that is created when CreateRemainderLoop is true. static Loop * CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, - const bool UseEpilogRemainder, BasicBlock *InsertTop, + const bool UseEpilogRemainder, const bool PeelRemainder, + BasicBlock *InsertTop, BasicBlock *InsertBot, BasicBlock *Preheader, std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { @@ -413,10 +414,13 @@ } LLVMContext &Context = NewLoop->getHeader()->getContext(); - SmallVector DisableOperands; - DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); + if (!PeelRemainder) { + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, + "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + } MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. @@ -484,6 +488,41 @@ : false; } +static void RemovePeeledLoop(Loop *L, LoopInfo *LI, DominatorTree *DT) { + // Loops that can be peeled are in their simple form, with a single block + // and exit. + BasicBlock *Preheader = L->getLoopPreheader(); + auto *ExitBB = L->getExitingBlock(); + auto *Term = ExitBB->getTerminator(); + BasicBlock *Dest = nullptr; + + // Find the successor block. + for (unsigned i = 0; i < 2; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != ExitBB) { + Dest = Succ; + break; + } + } + + // Remove the loop body from the CFG and dominator tree. + Dest->removePredecessor(ExitBB); + ExitBB->getTerminator()->eraseFromParent(); + DT->deleteEdge(ExitBB, Dest); + + ExitBB->removePredecessor(Preheader); + Preheader->getTerminator()->eraseFromParent(); + DT->deleteEdge(Preheader, ExitBB); + + // Have the preheader branch directly to the loop's successor. + BranchInst::Create(Dest, Preheader); + DT->insertEdge(Preheader, Dest); + + ExitBB->dropAllReferences(); + ExitBB->removeFromParent(); + LI->markAsRemoved(L); +} + /// Insert code in the prolog/epilog code when unrolling a loop with a /// run-time trip-count. /// @@ -525,8 +564,10 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, + bool PeelRemainder, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, bool PreserveLCSSA) { + DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); DEBUG(L->dump()); @@ -739,7 +780,8 @@ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, + L, ModVal, CreateRemainderLoop, UseEpilogRemainder, PeelRemainder, + InsertTop, InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); // Insert the cloned blocks into the function. @@ -883,6 +925,11 @@ formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA); } + if (remainderLoop && PeelRemainder) { + if (peelLoop(remainderLoop, Count - 1, LI, SE, DT, AC, PreserveLCSSA)) + RemovePeeledLoop(remainderLoop, LI, DT); + } + NumRuntimeUnrolled++; return true; } Index: test/Transforms/LoopUnroll/runtime-peel-remainder.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/runtime-peel-remainder.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=4 -peel-remainder -simplifycfg -instcombine | FileCheck %s + +; CHECK-LABEL: unroll +define i32 @unroll(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: + %c.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %c.0.lcssa + +; CHECK-LABEL: for.body.lr.ph +; CHECK: [[COUNT:%[a-z.0-9]+]] = add nsw i64 %wide.trip.count, -1 +; CHECK: %xtraiter = and i64 %wide.trip.count, 3 +; CHECK: [[CMP:%[a-z.0-9]+]] = icmp ult i64 [[COUNT]], 3 +; CHECK: br i1 [[CMP]], label %[[CLEANUP:.*]], label %for.body.lr.ph.new + +; CHECK-LABEL: for.body.lr.ph.new: +; CHECK: sub nsw i64 %wide.trip.count, %xtraiter +; CHECK: br label %for.body + +; CHECK: [[CLEANUP]]: +; CHECK: [[MOD:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 0 +; CHECK: br i1 [[MOD]], label %[[EXIT:.*]], label %[[EPIL_PEEL0:.*]] + +; CHECK: [[EPIL_PEEL0]]: +; CHECK: [[PEEL_CMP0:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 1 +; CHECK: br i1 [[PEEL_CMP0]], label %[[EXIT]], label %[[EPIL_PEEL1:.*]], + +; CHECK: [[EPIL_PEEL1]]: +; CHECK: [[PEEL_CMP1:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 2 +; CHECK: br i1 [[PEEL_CMP1]], label %[[EXIT]], label %[[EPIL_PEEL2:.*]], + +; CHECK: [[EPIL_PEEL2]]: +; CHECK: ret i32 + +; CHECK: [[EXIT]]: +; CHECK: ret i32 + +; CHECK-LABEL: for.body: +; CHECK: [[INDVAR0:%[a-z.0-9]+]] = phi i64 [ 0, %for.body.lr.ph +; CHECK: [[ITER:%[a-z.0-9]+]] = phi i64 [ %unroll_iter +; CHECK: or i64 [[INDVAR0]], 1 +; CHECK: or i64 [[INDVAR0]], 2 +; CHECK: or i64 [[INDVAR0]], 3 +; CHECK: add nsw i64 [[INDVAR0]], 4 +; CHECK: [[SUB:%[a-z.0-9]+]] = add i64 [[ITER]], -4 +; CHECK: [[ITER_CMP:%[a-z.0-9]+]] = icmp eq i64 [[SUB]] +; CHECK: br i1 [[ITER_CMP]], label %[[CLEANUP]], label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %c.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %c.010 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +}