Index: lib/Transforms/Utils/LoopUnroll.cpp
===================================================================
--- lib/Transforms/Utils/LoopUnroll.cpp
+++ lib/Transforms/Utils/LoopUnroll.cpp
@@ -172,6 +172,62 @@
   return false;
 }
 
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant. In this case epilog keeps the PHI starting from the constant,
+/// but prolog will convert it to non-constant.
+///
+/// loop:
+///   PN = PHI [I, Latch], [CI, PreHeader]
+///   I = foo(PN)
+///   ...
+///
+/// Epilog unroll case.
+/// loop:
+///   PN = PHI [I2, Latch], [CI, PreHeader]
+///   I1 = foo(PN)
+///   I2 = foo(I1)
+///   ...
+/// Prolog unroll case.
+///   NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+///   PN = PHI [I2, Latch], [NewPN, PreHeader]
+///   I1 = foo(PN)
+///   I2 = foo(I1)
+///   ...
+///
+/// In addition we get potential alignment benefit if
+/// the constant is a multiple of the unroll count.
+static bool isEpilogProfitable(Loop *L, unsigned Count) {
+  assert(Count > 0 && "Unroll count must be positive");
+  BasicBlock *PreHeader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  assert(PreHeader && Header && "Loop must have a preheader and a header");
+  for (Instruction &BBI : *Header) {
+    // Only the leading PHIs are inspected; stop at the first non-PHI.
+    PHINode *PN = dyn_cast<PHINode>(&BBI);
+    if (!PN)
+      break;
+    if (PN->getBasicBlockIndex(PreHeader) < 0)
+      continue;
+    Value *V = PN->getIncomingValueForBlock(PreHeader);
+    if (!V)
+      continue;
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      const APInt &Val = CI->getValue();
+      // If 2^BitWidth <= Count (e.g. i1), treat epilog as profitable; this
+      // also guarantees that Count fits in Divider without truncating to 0.
+      if (Val.getBitWidth() <= 31 && (1UL << Val.getBitWidth()) <= Count)
+        return true;
+      APInt Divider(Val.getBitWidth(), (uint64_t)Count);
+      if (Val.urem(Divider) == 0)
+        return true;
+    }
+  }
+  return false;
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form.
Returns true /// if unrolling was successful, or false if the loop was unmodified. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional @@ -301,12 +357,15 @@ "Unroll count must divide trip multiple if loop contains a " "convergent operation."); }); + bool EpiplogProfitability = + UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog + : isEpilogProfitable (L, Count); // Don't output the runtime loop remainder if Count is a multiple of // TripMultiple. Such a remainder is never needed, and is unsafe if the loop // contains a convergent instruction. if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, - UnrollRuntimeEpilog, LI, SE, DT, + EpiplogProfitability, LI, SE, DT, PreserveLCSSA)) { if (Force) RuntimeTripCount = false; Index: test/Transforms/LoopUnroll/runtime-loop5.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop5.ll +++ test/Transforms/LoopUnroll/runtime-loop5.ll @@ -11,9 +11,6 @@ %cmp1 = icmp eq i3 %n, 0 br i1 %cmp1, label %for.end, label %for.body -; UNROLL-16-NOT: for.body.prol: -; UNROLL-4: for.body.prol: - for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.body: ; UNROLL-4-LABEL: for.body: @@ -39,6 +36,10 @@ ; UNROLL-16-LABEL: for.end ; UNROLL-4-LABEL: for.end + +; UNROLL-16-NOT: for.body.epil: +; UNROLL-4: for.body.epil: + for.end: ; preds = %for.body, %entry %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ] ret i3 %sum.0.lcssa Index: test/Transforms/LoopUnroll/unroll-pragmas.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-pragmas.ll +++ test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -171,10 +171,6 @@ ; should be duplicated (original and 4x unrolled). 
; ; CHECK-LABEL: @runtime_loop_with_count4( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store @@ -182,6 +178,10 @@ ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -287,10 +287,6 @@ ; (original and 8x). ; ; CHECK-LABEL: @runtime_loop_with_enable( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body: ; CHECK: store i32 ; CHECK: store i32 @@ -302,6 +298,10 @@ ; CHECK: store i32 ; CHECK-NOT: store i32 ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -328,16 +328,16 @@ ; should be duplicated (original and 3x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count3( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0