diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -997,26 +997,18 @@ } /// Return the minimum of all analyzeable exit counts. This is an upper bound -/// on the actual exit count. If there are not at least two analyzeable exits, -/// returns SCEVCouldNotCompute. -static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE, - DominatorTree &DT, - Loop *L) { - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - +/// on the actual exit count. +static const SCEV *getMinAnalyzeableBackedgeTakenCount( + ScalarEvolution &SE, Loop *L, + SmallVectorImpl &AnalyzableExitingBlocks) { SmallVector ExitCounts; - for (BasicBlock *ExitingBB : ExitingBlocks) { + for (BasicBlock *ExitingBB : AnalyzableExitingBlocks) { const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); - if (isa(ExitCount)) - continue; - assert(DT.dominates(ExitingBB, L->getLoopLatch()) && - "We should only have known counts for exiting blocks that " - "dominate latch!"); + assert(!isa(ExitCount) && + "Expected analyzable exits only"); ExitCounts.push_back(ExitCount); } - if (ExitCounts.size() < 2) - return SE.getCouldNotCompute(); + assert(!ExitCounts.empty() && "No analyzable exits?"); return SE.getUMinFromMismatchedTypes(ExitCounts); } @@ -1046,6 +1038,7 @@ // instruction. SmallVector ExitingBlocks; + SmallVector AnalyzableExitingBlocks; L->getExitingBlocks(ExitingBlocks); if (ExitingBlocks.empty()) @@ -1071,7 +1064,7 @@ // analyzeable after dropping widenability. { bool Invalidate = false; - + for (auto *ExitingBB : ExitingBlocks) { if (LI->getLoopFor(ExitingBB) != L) continue; @@ -1092,13 +1085,36 @@ SE->forgetLoop(L); } - // The use of umin(all analyzeable exits) instead of latch is subtle, but - // important for profitability. We may have a loop which hasn't been fully - // canonicalized just yet. If the exit we chose to widen is provably never - // taken, we want the widened form to *also* be provably never taken. We - // can't guarantee this as a current unanalyzeable exit may later become - // analyzeable, but we can at least avoid the obvious cases. - const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L); + // Collect all analyzable exits. + for (auto *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + if (isa(ExitCount)) + continue; + assert(DT->dominates(ExitingBB, Latch) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + AnalyzableExitingBlocks.push_back(ExitingBB); + } + + // Bail out if there are less than two analyzable exits. + if (AnalyzableExitingBlocks.size() < 2) + return false; + + const SCEV *MinEC = nullptr; + bool IsTwoAnalyzableExits = false; + if (AnalyzableExitingBlocks.size() == 2) { + MinEC = SE->getExitCount(L, Latch); + IsTwoAnalyzableExits = true; + } else + // The use of umin(all analyzeable exits) instead of latch is subtle, but + // important for profitability. We may have a loop which hasn't been fully + // canonicalized just yet. If the exit we chose to widen is provably never + // taken, we want the widened form to *also* be provably never taken. We + // can't guarantee this as a current unanalyzeable exit may later become + // analyzeable, but we can at least avoid the obvious cases. + MinEC = + getMinAnalyzeableBackedgeTakenCount(*SE, L, AnalyzableExitingBlocks); + if (isa(MinEC) || MinEC->getType()->isPointerTy() || !SE->isLoopInvariant(MinEC, L) || !isSafeToExpandAt(MinEC, WidenableBR, *SE)) @@ -1115,7 +1131,12 @@ bool Changed = false; Value *MinECV = nullptr; // lazily generated if needed - for (BasicBlock *ExitingBB : ExitingBlocks) { + for (BasicBlock *ExitingBB : AnalyzableExitingBlocks) { + // There is no point in optimizing the latch in case of two exits. If we + // do we will be deopting unconditionally. + if (IsTwoAnalyzableExits && ExitingBB == Latch) + continue; + // If our exiting block exits multiple loops, we can only rewrite the // innermost one. Otherwise, we're changing how many times the innermost // loop runs before it exits. @@ -1132,8 +1153,9 @@ continue; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); - if (isa(ExitCount) || - ExitCount->getType()->isPointerTy() || + assert(!isa(ExitCount) && + "Only analyzable exits are expected"); + if (ExitCount->getType()->isPointerTy() || !isSafeToExpandAt(ExitCount, WidenableBR, *SE)) continue; diff --git a/llvm/test/Transforms/LoopPredication/predicate-exits.ll b/llvm/test/Transforms/LoopPredication/predicate-exits.ll --- a/llvm/test/Transforms/LoopPredication/predicate-exits.ll +++ b/llvm/test/Transforms/LoopPredication/predicate-exits.ll @@ -11,12 +11,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]] -; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP5]], [[WIDENABLE_COND]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[COND_0:%.*]] +; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP4]], [[WIDENABLE_COND]] ; CHECK-NEXT: br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt: ; CHECK-NEXT: [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ] @@ -94,12 +92,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[LENGTH:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[LENGTH]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]] -; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP5]], [[WIDENABLE_COND]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[COND_0:%.*]] +; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP4]], [[WIDENABLE_COND]] ; CHECK-NEXT: br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt: ; CHECK-NEXT: [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ] @@ -355,12 +351,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]] -; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP5]], [[WIDENABLE_COND]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[COND_0:%.*]] +; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP4]], [[WIDENABLE_COND]] ; CHECK-NEXT: br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt: ; CHECK-NEXT: [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ] @@ -684,12 +678,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]] -; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP5]], [[WIDENABLE_COND]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[COND_0:%.*]] +; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP4]], [[WIDENABLE_COND]] ; CHECK-NEXT: br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt.loopexit: ; CHECK-NEXT: br label [[DEOPT]] @@ -769,12 +761,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]] -; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[WIDENABLE_COND]], [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[COND_0:%.*]] +; CHECK-NEXT: [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[WIDENABLE_COND]], [[TMP4]] ; CHECK-NEXT: br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt: ; CHECK-NEXT: [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ] @@ -849,13 +839,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]] -; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LENGTH:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] ; CHECK-NEXT: [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition() -; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[WIDENABLE_COND]] -; CHECK-NEXT: br i1 [[TMP5]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 +; CHECK-NEXT: [[TMP4:%.*]] = and i1 [[TMP3]], [[WIDENABLE_COND]] +; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0 ; CHECK: deopt: ; CHECK-NEXT: [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ] ; CHECK-NEXT: ret i32 [[DEOPTRET]]