Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -382,6 +382,8 @@ bool UpperBound; /// Allow peeling off loop iterations for loops with low dynamic tripcount. bool AllowPeeling; + /// Allow unrolling of all the iterations of the runtime loop remainder. + bool UnrollRemainder; }; /// \brief Get target-customized preferences for the generic loop unrolling Index: llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h +++ llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h @@ -42,14 +42,18 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst, - unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); + unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA); bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, - bool UseEpilogRemainder, LoopInfo *LI, + bool UseEpilogRemainder, bool UnrollRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); void computePeelCount(Loop *L, unsigned LoopSize, Index: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -115,6 +115,10 @@ cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt UnrollUnrollRemainder( + "unroll-remainder", cl::Hidden, + cl::desc("Allow the loop remainder to be unrolled.")); + // This option isn't ever intended to be enabled, it serves to allow // experiments to check the assumptions about when this kind of revisit is // necessary. @@ -153,6 +157,7 @@ UP.Partial = false; UP.Runtime = false; UP.AllowRemainder = true; + UP.UnrollRemainder = false; UP.AllowExpensiveTripCount = false; UP.Force = false; UP.UpperBound = false; @@ -188,6 +193,8 @@ UP.UpperBound = false; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; + if (UnrollUnrollRemainder.getNumOccurrences() > 0) + UP.UnrollRemainder = UnrollUnrollRemainder; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -1034,7 +1041,8 @@ // Unroll the loop. if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, + TripMultiple, UP.PeelCount, UP.UnrollRemainder, + LI, &SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; Index: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp @@ -295,7 +295,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst, - unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI, + unsigned TripMultiple, unsigned PeelCount, + bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) { @@ -418,7 +419,8 @@ if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, - EpilogProfitability, LI, SE, DT, + EpilogProfitability, UnrollRemainder, + LI, SE, DT, AC, ORE, PreserveLCSSA)) { if (Force) RuntimeTripCount = false; Index: llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -294,7 +294,8 @@ /// Return the new cloned loop that is created when CreateRemainderLoop is true. static Loop * CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, - const bool UseEpilogRemainder, BasicBlock *InsertTop, + const bool UseEpilogRemainder, const bool UnrollRemainder, + BasicBlock *InsertTop, BasicBlock *InsertBot, BasicBlock *Preheader, std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { @@ -413,10 +414,13 @@ } LLVMContext &Context = NewLoop->getHeader()->getContext(); - SmallVector DisableOperands; - DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); + if (!UnrollRemainder) { + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, + "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + } MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. @@ -525,8 +529,11 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, + bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, bool PreserveLCSSA) { + DominatorTree *DT, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA) { DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); DEBUG(L->dump()); @@ -739,7 +746,8 @@ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, + L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder, + InsertTop, InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); // Insert the cloned blocks into the function. @@ -883,6 +891,15 @@ formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA); } + if (remainderLoop && UnrollRemainder) { + UnrollLoop(remainderLoop, /*Count*/Count - 1, /*TripCount*/Count - 1, + /*Force*/false, /*AllowRuntime*/false, + /*AllowExpensiveTripCount*/false, /*PreserveCondBr*/true, + /*PreserveOnlyFirst*/false, /*TripMultiple*/1, + /*PeelCount*/0, /*UnrollRemainder*/false, LI, SE, DT, AC, ORE, + PreserveLCSSA); + } + NumRuntimeUnrolled++; return true; } Index: llvm/trunk/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=4 -unroll-remainder -instcombine | FileCheck %s + +; CHECK-LABEL: unroll +define i32 @unroll(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: + %c.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %c.0.lcssa + +; CHECK-LABEL: for.body.lr.ph +; CHECK: [[COUNT:%[a-z.0-9]+]] = add nsw i64 %wide.trip.count, -1 +; CHECK: %xtraiter = and i64 %wide.trip.count, 3 +; CHECK: [[CMP:%[a-z.0-9]+]] = icmp ult i64 [[COUNT]], 3 +; CHECK: br i1 [[CMP]], label %[[CLEANUP:.*]], label %for.body.lr.ph.new + +; CHECK-LABEL: for.body.lr.ph.new: +; CHECK: %unroll_iter = sub nsw i64 %wide.trip.count, %xtraiter +; CHECK: br label %for.body + +; CHECK: [[CLEANUP]]: +; CHECK: [[MOD:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 0 +; CHECK: br i1 [[MOD]], label %[[EXIT:.*]], label %[[EPIL_PEEL0_PRE:.*]] + +; CHECK: [[EPIL_PEEL0_PRE]]: +; CHECK: br label %[[EPIL_PEEL0:.*]] + +; CHECK: [[EPIL_PEEL0]]: +; CHECK: [[PEEL_CMP0:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 1 +; CHECK: br i1 [[PEEL_CMP0]], label %[[EPIL_EXIT:.*]], label %[[EPIL_PEEL1:.*]], + +; CHECK: [[EPIL_EXIT]]: +; CHECK: br label %[[EXIT]] + +; CHECK: [[EXIT]]: +; CHECK: ret i32 + +; CHECK-LABEL: for.body: +; CHECK: [[INDVAR0:%[a-z.0-9]+]] = phi i64 [ 0, %for.body.lr.ph +; CHECK: [[ITER:%[a-z.0-9]+]] = phi i64 [ %unroll_iter +; CHECK: or i64 [[INDVAR0]], 1 +; CHECK: or i64 [[INDVAR0]], 2 +; CHECK: or i64 [[INDVAR0]], 3 +; CHECK: add nsw i64 [[INDVAR0]], 4 +; CHECK: [[SUB:%[a-z.0-9]+]] = add i64 [[ITER]], -4 +; CHECK: [[ITER_CMP:%[a-z.0-9]+]] = icmp eq i64 [[SUB]], 0 +; CHECK: br i1 [[ITER_CMP]], label %[[LOOP_EXIT:.*]], label %for.body + +; CHECK: [[EPIL_PEEL1]]: +; CHECK: [[PEEL_CMP1:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 2 +; CHECK: br i1 [[PEEL_CMP1]], label %[[EPIL_EXIT]], label %[[EPIL_PEEL2:.*]], + +; CHECK: [[EPIL_PEEL2]]: +; CHECK: br label %[[EXIT]] + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %c.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %c.010 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +}