Index: lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -428,19 +428,16 @@ return nullptr; } -/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits -/// is populated with all the loop exit blocks other than the LatchExit block. +/// Returns true if we can safely and profitably unroll a multi-exit/exiting +/// loop. OtherExits is populated with all the loop exit blocks other than the +/// LatchExit block. static bool -canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl &OtherExits, +canUnrollMultiExitLoop(Loop *L, SmallVectorImpl &OtherExits, BasicBlock *LatchExit, bool PreserveLCSSA, bool UseEpilogRemainder) { - // Support runtime unrolling for multiple exit blocks and multiple exiting - // blocks. - if (!UnrollRuntimeMultiExit) - return false; - // Even if runtime multi exit is enabled, we currently have some correctness - // constrains in unrolling a multi-exit loop. + // We currently have some correctness constrains in unrolling a multi-exit + // loop. // We rely on LCSSA form being preserved when the exit blocks are transformed. if (!PreserveLCSSA) return false; @@ -466,12 +463,26 @@ if (UseEpilogRemainder && L->getParentLoop()) return false; - // All constraints have been satisfied. - return true; + // All safety constraints have been satisfied. Now check for profitability. + auto isMultiExitUnrollingProfitable = [](ArrayRef OtherExits) { + + // The heuristic we have for now is that we have exactly one exit other than + // the latchexit and that exit is a deoptimize block. Deoptimize blocks are + // rarely executed and they have no successors. So branches within the + // unrolled loop that have this deopt exit block as the target is highly + // predictable. + // TODO: We can extend this to exit blocks that are targets of low frequency + // branches or make.implicit branches. + return (OtherExits.size() == 1 && + OtherExits[0]->getTerminatingDeoptimizeCall()); + }; + // Priority goes to UnrollRuntimeMultiExit if it's supplied. + return UnrollRuntimeMultiExit.getNumOccurrences() + ? UnrollRuntimeMultiExit + : isMultiExitUnrollingProfitable(OtherExits); } - /// Insert code in the prolog/epilog code when unrolling a loop with a /// run-time trip-count. /// @@ -538,7 +549,7 @@ "one of the loop latch successors should be the exit block!"); // These are exit blocks other than the target of the latch exiting block. SmallVector OtherExits; - bool isMultiExitUnrollingEnabled = canSafelyUnrollMultiExitLoop( + bool isMultiExitUnrollingEnabled = canUnrollMultiExitLoop( L, OtherExits, LatchExit, PreserveLCSSA, UseEpilogRemainder); // Support only single exit and exiting block unless multi-exit loop unrolling is enabled. if (!isMultiExitUnrollingEnabled && Index: test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -unroll-runtime-multi-exit=false -verify-loop-info -S | FileCheck %s -check-prefix=NOUNROLL + +; this tests when unrolling multiple exit loop occurs by default (i.e. without specifying -unroll-runtime-multi-exit) + +; the second exit block is a deopt block. The loop has one exiting block other than the latch. +define i32 @test1(i32* nocapture %a, i64 %n) { +; CHECK-LABEL: test1( +; CHECK-LABEL: header.epil: +; CHECK-NEXT: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %latch.epil ], [ %indvars.iv.unr, %header.epil.preheader ] +; CHECK-LABEL: otherexit.loopexit: +; CHECK-NEXT: %sum.02.lcssa.ph = phi i32 [ %sum.02, %for.exiting_block ], [ %add, %for.exiting_block.1 ], [ %add.1, %for.exiting_block.2 ], [ %add.2, %for.exiting_block.3 ], [ %add.3, %for.exiting_block.4 ], [ %add.4, %for.exiting_block.5 ], [ %add.5, %for.exiting_block.6 ], +; CHECK-NEXT: br label %otherexit +; CHECK-LABEL: otherexit.loopexit3: +; CHECK-NEXT: br label %otherexit +; CHECK-LABEL: otherexit: +; CHECK-NEXT: %sum.02.lcssa = phi i32 [ %sum.02.lcssa.ph, %otherexit.loopexit ], [ %sum.02.epil, %otherexit.loopexit3 ] +; CHECK-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ] +; CHECK-NEXT: ret i32 %rval +; CHECK-LABEL: latch.7: +; CHECK: add i64 %indvars.iv, 8 + +; NOUNROLL: test1( +; NOUNROLL-NOT: .epil +; NOUNROLL-NOT: .prol +; NOUNROLL: otherexit: +; NOUNROLL-NEXT: %sum.02.lcssa = phi i32 [ %sum.02, %for.exiting_block ] +; NOUNROLL-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ] +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ] + br label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %otherexit, label %latch + +latch: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %latchexit, label %header + +latchexit: ; preds = %latch + %sum.0.lcssa = phi i32 [ %add, %latch ] + ret i32 %sum.0.lcssa + +otherexit: + %rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02) ] + ret i32 %rval +} + +; the exit block is not a deopt block. +define i32 @test2(i32* nocapture %a, i64 %n) { +; CHECK-LABEL: test2( +; CHECK-NOT: .epil +; CHECK-NOT: .prol +; CHECK-LABEL: otherexit: +; CHECK-NEXT: ret i32 %sum.02 + +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ] + br label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %otherexit, label %latch + +latch: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %latchexit, label %header + +latchexit: ; preds = %latch + %sum.0.lcssa = phi i32 [ %add, %latch ] + ret i32 %sum.0.lcssa + +otherexit: + %rval = phi i32 [%sum.02, %for.exiting_block ] + ret i32 %rval +} +declare i32 @llvm.experimental.deoptimize.i32(...)