Index: llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -436,9 +436,9 @@ /// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits /// is populated with all the loop exit blocks other than the LatchExit block. -static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit, - bool PreserveLCSSA, - bool UseEpilogRemainder) { +static bool canSafelyUnrollMultiExitLoop(Loop *L, bool PreserveLCSSA, + bool UseEpilogRemainder, + unsigned Count) { // We currently have some correctness constrains in unrolling a multi-exit // loop. Check for these below. @@ -448,12 +448,10 @@ if (!PreserveLCSSA) return false; - // FIXME: We bail out of multi-exit unrolling when epilog loop is generated - // and L is an inner loop. This is because in presence of multiple exits, the - // outer loop is incorrect: we do not add the EpilogPreheader and exit to the - // outer loop. This is automatically handled in the prolog case, so we do not - // have that bug in prolog generation. - if (UseEpilogRemainder && L->getParentLoop()) + // FIXME: CloneLoopBlocks doesn't correctly update loopinfo for epilogue + // blocks which are outside the outer loop because we broke the inner + // backedge. + if (UseEpilogRemainder && Count == 2 && L->getParentLoop()) return false; // All constraints have been satisfied. @@ -464,11 +462,11 @@ /// we return true only if UnrollRuntimeMultiExit is set to true. static bool canProfitablyUnrollMultiExitLoop( Loop *L, SmallVectorImpl &OtherExits, BasicBlock *LatchExit, - bool PreserveLCSSA, bool UseEpilogRemainder) { + bool PreserveLCSSA, bool UseEpilogRemainder, unsigned Count) { #if !defined(NDEBUG) - assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA, - UseEpilogRemainder) && + assert(canSafelyUnrollMultiExitLoop(L, PreserveLCSSA, + UseEpilogRemainder, Count) && "Should be safe to unroll before checking profitability!"); #endif @@ -481,6 +479,11 @@ if (!LatchExit->getSinglePredecessor()) return false; + // TODO: We used to bail out for correctness (now fixed). Under what + // circumstances is this case profitable to allow? + if (UseEpilogRemainder && L->getParentLoop()) + return false; + // The main pain point with multi-exit loop unrolling is that once unrolled, // we will not be able to merge all blocks into a straight line code. // There are branches within the unrolled loop that go to the OtherExits. @@ -662,10 +665,10 @@ SmallVector OtherExits; L->getUniqueNonLatchExitBlocks(OtherExits); bool isMultiExitUnrollingEnabled = - canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA, - UseEpilogRemainder) && + canSafelyUnrollMultiExitLoop(L, PreserveLCSSA, + UseEpilogRemainder, Count) && canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA, - UseEpilogRemainder); + UseEpilogRemainder, Count); // Support only single exit and exiting block unless multi-exit loop unrolling is enabled. if (!isMultiExitUnrollingEnabled && (!L->getExitingBlock() || OtherExits.size())) { @@ -754,6 +757,21 @@ // Split NewExit to insert epilog remainder loop. EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI); EpilogPreHeader->setName(Header->getName() + ".epil.preheader"); + + // If the latch exits from multiple level of nested loops, then + // by assumption there must be another loop exit which branches to the + // outer loop and we must adjust the loop for the newly inserted blocks + // to account for the fact that our epilogue is still in the same outer + // loop. Note that this leaves loopinfo temporarily out of sync with the + // CFG until the actual epilogue loop is inserted. + if (auto *ParentL = L->getParentLoop()) + if (LI->getLoopFor(LatchExit) != ParentL) { + LI->removeBlock(NewExit); + ParentL->addBasicBlockToLoop(NewExit, *LI); + LI->removeBlock(EpilogPreHeader); + ParentL->addBasicBlockToLoop(EpilogPreHeader, *LI); + } + } else { // If prolog remainder // Split the original preheader twice to insert prolog remainder loop Index: llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll +++ llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -4364,32 +4364,91 @@ } ; Nested loop and inner loop is unrolled -; FIXME: we cannot unroll with epilog remainder currently, because -; the outer loop does not contain the epilog preheader and epilog exit (while -; infact it should). This causes us to choke up on LCSSA form being incorrect in -; outer loop. However, the exit block where LCSSA fails, is infact still within -; the outer loop. For now, we just bail out in presence of outer loop and epilog -; loop is generated. -; The outer loop header is the preheader for the inner loop and the inner header -; branches back to the outer loop. +; FIXME: We can't unroll with an epilog remainder block currently, because +; we don't correctly update loop info for blocks in the epilog "loop" which +; fall out of the outer loop when we break the backedge. define void @test8() { ; EPILOG-LABEL: @test8( ; EPILOG-NEXT: bb: ; EPILOG-NEXT: br label %outerloop +; EPILOG: outerloop.loopexit.loopexit: +; EPILOG-NEXT: br label %outerloop.loopexit +; EPILOG: outerloop.loopexit.loopexit1: +; EPILOG-NEXT: br label %outerloop.loopexit ; EPILOG: outerloop.loopexit: ; EPILOG-NEXT: br label %outerloop ; EPILOG: outerloop: ; EPILOG-NEXT: %i = phi i64 [ 3, %bb ], [ 0, %outerloop.loopexit ] +; EPILOG-NEXT: %0 = sub i64 100, %i +; EPILOG-NEXT: %1 = sub i64 99, %i +; EPILOG-NEXT: %xtraiter = and i64 %0, 7 +; EPILOG-NEXT: %2 = icmp ult i64 %1, 7 +; EPILOG-NEXT: br i1 %2, label %exit.unr-lcssa, label %outerloop.new +; EPILOG: outerloop.new: +; EPILOG-NEXT: %unroll_iter = sub i64 %0, %xtraiter ; EPILOG-NEXT: br label %innerH ; EPILOG: innerH: -; EPILOG-NEXT: %i3 = phi i64 [ %i4, %latch ], [ %i, %outerloop ] +; EPILOG-NEXT: %i3 = phi i64 [ %i, %outerloop.new ], [ %i4.7, %latch.7 ] +; EPILOG-NEXT: %niter = phi i64 [ %unroll_iter, %outerloop.new ], [ %niter.nsub.7, %latch.7 ] ; EPILOG-NEXT: %i4 = add nuw nsw i64 %i3, 1 -; EPILOG-NEXT: br i1 false, label %outerloop.loopexit, label %latch +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch ; EPILOG: latch: -; EPILOG-NEXT: %i6 = icmp ult i64 %i4, 100 -; EPILOG-NEXT: br i1 %i6, label %innerH, label %exit +; EPILOG-NEXT: %niter.nsub = sub i64 %niter, 1 +; EPILOG-NEXT: %i4.1 = add nuw nsw i64 %i4, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.1 +; EPILOG: exit.unr-lcssa.loopexit: +; EPILOG-NEXT: %i3.unr.ph = phi i64 [ %i4.7, %latch.7 ] +; EPILOG-NEXT: br label %exit.unr-lcssa +; EPILOG: exit.unr-lcssa: +; EPILOG-NEXT: %i3.unr = phi i64 [ %i, %outerloop ], [ %i3.unr.ph, %exit.unr-lcssa.loopexit ] +; EPILOG-NEXT: %lcmp.mod = icmp ne i64 %xtraiter, 0 +; EPILOG-NEXT: br i1 %lcmp.mod, label %innerH.epil.preheader, label %exit.loopexit +; EPILOG: innerH.epil.preheader: +; EPILOG-NEXT: br label %innerH.epil +; EPILOG: innerH.epil: +; EPILOG-NEXT: %i3.epil = phi i64 [ %i4.epil, %latch.epil ], [ %i3.unr, %innerH.epil.preheader ] +; EPILOG-NEXT: %epil.iter = phi i64 [ %xtraiter, %innerH.epil.preheader ], [ %epil.iter.sub, %latch.epil ] +; EPILOG-NEXT: %i4.epil = add nuw nsw i64 %i3.epil, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit1, label %latch.epil +; EPILOG: latch.epil: +; EPILOG-NEXT: %i6.epil = icmp ult i64 %i4.epil, 100 +; EPILOG-NEXT: %epil.iter.sub = sub i64 %epil.iter, 1 +; EPILOG-NEXT: %epil.iter.cmp = icmp ne i64 %epil.iter.sub, 0 +; EPILOG-NEXT: br i1 %epil.iter.cmp, label %innerH.epil, label %exit.epilog-lcssa, !llvm.loop !11 +; EPILOG: exit.epilog-lcssa: +; EPILOG-NEXT: br label %exit +; EPILOG: exit.loopexit: +; EPILOG-NEXT: br label %exit ; EPILOG: exit: ; EPILOG-NEXT: ret void +; EPILOG: latch.1: +; EPILOG-NEXT: %niter.nsub.1 = sub i64 %niter.nsub, 1 +; EPILOG-NEXT: %i4.2 = add nuw nsw i64 %i4.1, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.2 +; EPILOG: latch.2: +; EPILOG-NEXT: %niter.nsub.2 = sub i64 %niter.nsub.1, 1 +; EPILOG-NEXT: %i4.3 = add nuw nsw i64 %i4.2, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.3 +; EPILOG: latch.3: +; EPILOG-NEXT: %niter.nsub.3 = sub i64 %niter.nsub.2, 1 +; EPILOG-NEXT: %i4.4 = add nuw nsw i64 %i4.3, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.4 +; EPILOG: latch.4: +; EPILOG-NEXT: %niter.nsub.4 = sub i64 %niter.nsub.3, 1 +; EPILOG-NEXT: %i4.5 = add nuw nsw i64 %i4.4, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.5 +; EPILOG: latch.5: +; EPILOG-NEXT: %niter.nsub.5 = sub i64 %niter.nsub.4, 1 +; EPILOG-NEXT: %i4.6 = add nuw nsw i64 %i4.5, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.6 +; EPILOG: latch.6: +; EPILOG-NEXT: %niter.nsub.6 = sub i64 %niter.nsub.5, 1 +; EPILOG-NEXT: %i4.7 = add nuw nsw i64 %i4.6, 1 +; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.7 +; EPILOG: latch.7: +; EPILOG-NEXT: %niter.nsub.7 = sub i64 %niter.nsub.6, 1 +; EPILOG-NEXT: %niter.ncmp.7 = icmp ne i64 %niter.nsub.7, 0 +; EPILOG-NEXT: br i1 %niter.ncmp.7, label %innerH, label %exit.unr-lcssa.loopexit ; ; EPILOG-BLOCK-LABEL: @test8( ; EPILOG-BLOCK-NEXT: bb: @@ -4611,6 +4670,8 @@ define i8 addrspace(1)* @test9(i8* nocapture readonly %arg, i32 %n) { ; EPILOG-LABEL: @test9( ; EPILOG-NEXT: bb: +; EPILOG-NEXT: %0 = add i32 %n, -1 +; EPILOG-NEXT: %1 = add i32 %n, -2 ; EPILOG-NEXT: br label %outerloopHdr ; EPILOG: outerloopHdr: ; EPILOG-NEXT: %trip = add i32 %n, -1 @@ -4618,24 +4679,86 @@ ; EPILOG-NEXT: br i1 %outercnd, label %preheader, label %outerLatch ; EPILOG: preheader: ; EPILOG-NEXT: %i4 = zext i32 0 to i64 +; EPILOG-NEXT: %xtraiter = and i32 %0, 7 +; EPILOG-NEXT: %2 = icmp ult i32 %1, 7 +; EPILOG-NEXT: br i1 %2, label %outerLatch.loopexit.unr-lcssa, label %preheader.new +; EPILOG: preheader.new: +; EPILOG-NEXT: %unroll_iter = sub i32 %0, %xtraiter ; EPILOG-NEXT: br label %header ; EPILOG: header: -; EPILOG-NEXT: %phi = phi i64 [ %i4, %preheader ], [ %iv.next, %latch ] -; EPILOG-NEXT: %i7 = trunc i64 %phi to i32 -; EPILOG-NEXT: br i1 true, label %latch, label %innerexit +; EPILOG-NEXT: %phi = phi i64 [ %i4, %preheader.new ], [ %iv.next.7, %latch.7 ] +; EPILOG-NEXT: %niter = phi i32 [ %unroll_iter, %preheader.new ], [ %niter.nsub.7, %latch.7 ] +; EPILOG-NEXT: br i1 true, label %latch, label %innerexit.loopexit +; EPILOG: innerexit.loopexit: +; EPILOG-NEXT: %trip.lcssa.ph = phi i32 [ %trip, %header ], [ %trip, %latch ], [ %trip, %latch.1 ], [ %trip, %latch.2 ], [ %trip, %latch.3 ], [ %trip, %latch.4 ], [ %trip, %latch.5 ], [ %trip, %latch.6 ] +; EPILOG-NEXT: br label %innerexit +; EPILOG: innerexit.loopexit1: +; EPILOG-NEXT: %trip.lcssa.ph2 = phi i32 [ %trip, %header.epil ] +; EPILOG-NEXT: br label %innerexit ; EPILOG: innerexit: -; EPILOG-NEXT: %trip.lcssa = phi i32 [ %trip, %header ] +; EPILOG-NEXT: %trip.lcssa = phi i32 [ %trip.lcssa.ph, %innerexit.loopexit ], [ %trip.lcssa.ph2, %innerexit.loopexit1 ] ; EPILOG-NEXT: %i9 = call i8 addrspace(1)* @foo(i32 %trip.lcssa) ; EPILOG-NEXT: ret i8 addrspace(1)* %i9 ; EPILOG: latch: -; EPILOG-NEXT: %i11 = add nsw i32 %i7, 1 -; EPILOG-NEXT: %innercnd = icmp slt i32 %i11, %trip ; EPILOG-NEXT: %iv.next = add nuw nsw i64 %phi, 1 -; EPILOG-NEXT: br i1 %innercnd, label %header, label %outerLatch.loopexit +; EPILOG-NEXT: %niter.nsub = sub i32 %niter, 1 +; EPILOG-NEXT: br i1 true, label %latch.1, label %innerexit.loopexit +; EPILOG: outerLatch.loopexit.unr-lcssa.loopexit: +; EPILOG-NEXT: %phi.unr.ph = phi i64 [ %iv.next.7, %latch.7 ] +; EPILOG-NEXT: br label %outerLatch.loopexit.unr-lcssa +; EPILOG: outerLatch.loopexit.unr-lcssa: +; EPILOG-NEXT: %phi.unr = phi i64 [ %i4, %preheader ], [ %phi.unr.ph, %outerLatch.loopexit.unr-lcssa.loopexit ] +; EPILOG-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; EPILOG-NEXT: br i1 %lcmp.mod, label %header.epil.preheader, label %outerLatch.loopexit +; EPILOG: header.epil.preheader: +; EPILOG-NEXT: br label %header.epil +; EPILOG: header.epil: +; EPILOG-NEXT: %phi.epil = phi i64 [ %phi.unr, %header.epil.preheader ], [ %iv.next.epil, %latch.epil ] +; EPILOG-NEXT: %epil.iter = phi i32 [ %xtraiter, %header.epil.preheader ], [ %epil.iter.sub, %latch.epil ] +; EPILOG-NEXT: %i7.epil = trunc i64 %phi.epil to i32 +; EPILOG-NEXT: br i1 true, label %latch.epil, label %innerexit.loopexit1 +; EPILOG: latch.epil: +; EPILOG-NEXT: %i11.epil = add nsw i32 %i7.epil, 1 +; EPILOG-NEXT: %innercnd.epil = icmp slt i32 %i11.epil, %trip +; EPILOG-NEXT: %iv.next.epil = add nuw nsw i64 %phi.epil, 1 +; EPILOG-NEXT: %epil.iter.sub = sub i32 %epil.iter, 1 +; EPILOG-NEXT: %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0 +; EPILOG-NEXT: br i1 %epil.iter.cmp, label %header.epil, label %outerLatch.loopexit.epilog-lcssa, !llvm.loop !12 +; EPILOG: outerLatch.loopexit.epilog-lcssa: +; EPILOG-NEXT: br label %outerLatch.loopexit ; EPILOG: outerLatch.loopexit: ; EPILOG-NEXT: br label %outerLatch ; EPILOG: outerLatch: ; EPILOG-NEXT: br label %outerloopHdr +; EPILOG: latch.1: +; EPILOG-NEXT: %iv.next.1 = add nuw nsw i64 %iv.next, 1 +; EPILOG-NEXT: %niter.nsub.1 = sub i32 %niter.nsub, 1 +; EPILOG-NEXT: br i1 true, label %latch.2, label %innerexit.loopexit +; EPILOG: latch.2: +; EPILOG-NEXT: %iv.next.2 = add nuw nsw i64 %iv.next.1, 1 +; EPILOG-NEXT: %niter.nsub.2 = sub i32 %niter.nsub.1, 1 +; EPILOG-NEXT: br i1 true, label %latch.3, label %innerexit.loopexit +; EPILOG: latch.3: +; EPILOG-NEXT: %iv.next.3 = add nuw nsw i64 %iv.next.2, 1 +; EPILOG-NEXT: %niter.nsub.3 = sub i32 %niter.nsub.2, 1 +; EPILOG-NEXT: br i1 true, label %latch.4, label %innerexit.loopexit +; EPILOG: latch.4: +; EPILOG-NEXT: %iv.next.4 = add nuw nsw i64 %iv.next.3, 1 +; EPILOG-NEXT: %niter.nsub.4 = sub i32 %niter.nsub.3, 1 +; EPILOG-NEXT: br i1 true, label %latch.5, label %innerexit.loopexit +; EPILOG: latch.5: +; EPILOG-NEXT: %iv.next.5 = add nuw nsw i64 %iv.next.4, 1 +; EPILOG-NEXT: %niter.nsub.5 = sub i32 %niter.nsub.4, 1 +; EPILOG-NEXT: br i1 true, label %latch.6, label %innerexit.loopexit +; EPILOG: latch.6: +; EPILOG-NEXT: %iv.next.6 = add nuw nsw i64 %iv.next.5, 1 +; EPILOG-NEXT: %niter.nsub.6 = sub i32 %niter.nsub.5, 1 +; EPILOG-NEXT: br i1 true, label %latch.7, label %innerexit.loopexit +; EPILOG: latch.7: +; EPILOG-NEXT: %iv.next.7 = add nuw nsw i64 %iv.next.6, 1 +; EPILOG-NEXT: %niter.nsub.7 = sub i32 %niter.nsub.6, 1 +; EPILOG-NEXT: %niter.ncmp.7 = icmp ne i32 %niter.nsub.7, 0 +; EPILOG-NEXT: br i1 %niter.ncmp.7, label %header, label %outerLatch.loopexit.unr-lcssa.loopexit ; ; EPILOG-BLOCK-LABEL: @test9( ; EPILOG-BLOCK-NEXT: bb: