Index: llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -487,9 +487,41 @@ PreserveLCSSA, UseEpilogRemainder) && "Should be safe to unroll before checking profitability!"); #endif + // Priority goes to UnrollRuntimeMultiExit if it's supplied. - return UnrollRuntimeMultiExit.getNumOccurrences() ? UnrollRuntimeMultiExit - : false; + if (UnrollRuntimeMultiExit.getNumOccurrences()) + return UnrollRuntimeMultiExit; + + // The main pain point with multi-exit loop unrolling is that once unrolled, + // we will not be able to merge all blocks into a straight line code. + // There are branches within the unrolled loop that go to the OtherExits. + // The second point is the increase in code size, but this is true + // irrespective of multiple exits. + + // Note: Both the heuristics below are coarse grained. We are essentially + // enabling unrolling of loops that have a single side exit other than the + // normal LatchExit (i.e. exiting into a deoptimize block). + // The heuristics considered are: + // 1. low number of branches in the unrolled version. + // 2. high predictability of these extra branches. + // We avoid unrolling loops that have more than two exiting blocks. This + // limits the total number of branches in the unrolled loop to be atmost + // the unroll factor (since one of the exiting blocks is the latch block). + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + if (ExitingBlocks.size() > 2) + return false; + + // The second heuristic is that L has one exit other than the latchexit and + // that exit is a deoptimize block. We know that deoptimize blocks are rarely + // taken, which also implies the branch leading to the deoptimize block is + // highly predictable. + return (OtherExits.size() == 1 && + OtherExits[0]->getTerminatingDeoptimizeCall()); + // TODO: These can be fine-tuned further to consider code size or deopt states + // that are captured by the deoptimize exit block. + // Also, we can extend this to support more cases, if we actually + // know of kinds of multiexit loops that would benefit from unrolling. } /// Insert code in the prolog/epilog code when unrolling a loop with a Index: llvm/trunk/test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -unroll-runtime-multi-exit=false -verify-loop-info -S | FileCheck %s -check-prefix=NOUNROLL + +; this tests when unrolling multiple exit loop occurs by default (i.e. without specifying -unroll-runtime-multi-exit) + +; the second exit block is a deopt block. The loop has one exiting block other than the latch. +define i32 @test1(i32* nocapture %a, i64 %n) { +; CHECK-LABEL: test1( +; CHECK-LABEL: header.epil: +; CHECK-NEXT: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %latch.epil ], [ %indvars.iv.unr, %header.epil.preheader ] +; CHECK-LABEL: otherexit.loopexit: +; CHECK-NEXT: %sum.02.lcssa.ph = phi i32 [ %sum.02, %for.exiting_block ], [ %add, %for.exiting_block.1 ], [ %add.1, %for.exiting_block.2 ], [ %add.2, %for.exiting_block.3 ], [ %add.3, %for.exiting_block.4 ], [ %add.4, %for.exiting_block.5 ], [ %add.5, %for.exiting_block.6 ], +; CHECK-NEXT: br label %otherexit +; CHECK-LABEL: otherexit.loopexit3: +; CHECK-NEXT: br label %otherexit +; CHECK-LABEL: otherexit: +; CHECK-NEXT: %sum.02.lcssa = phi i32 [ %sum.02.lcssa.ph, %otherexit.loopexit ], [ %sum.02.epil, %otherexit.loopexit3 ] +; CHECK-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ] +; CHECK-NEXT: ret i32 %rval +; CHECK-LABEL: latch.7: +; CHECK: add i64 %indvars.iv, 8 + +; NOUNROLL: test1( +; NOUNROLL-NOT: .epil +; NOUNROLL-NOT: .prol +; NOUNROLL: otherexit: +; NOUNROLL-NEXT: %sum.02.lcssa = phi i32 [ %sum.02, %for.exiting_block ] +; NOUNROLL-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ] +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ] + br label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %otherexit, label %latch + +latch: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %latchexit, label %header + +latchexit: ; preds = %latch + %sum.0.lcssa = phi i32 [ %add, %latch ] + ret i32 %sum.0.lcssa + +otherexit: + %rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02) ] + ret i32 %rval +} + +; the exit block is not a deopt block. +define i32 @test2(i32* nocapture %a, i64 %n) { +; CHECK-LABEL: test2( +; CHECK-NOT: .epil +; CHECK-NOT: .prol +; CHECK-LABEL: otherexit: +; CHECK-NEXT: ret i32 %sum.02 + +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ] + br label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %otherexit, label %latch + +latch: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %latchexit, label %header + +latchexit: ; preds = %latch + %sum.0.lcssa = phi i32 [ %add, %latch ] + ret i32 %sum.0.lcssa + +otherexit: + %rval = phi i32 [%sum.02, %for.exiting_block ] + ret i32 %rval +} +declare i32 @llvm.experimental.deoptimize.i32(...)