diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -179,6 +179,12 @@ void deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI, MemorySSA *MSSA = nullptr); +/// Remove the backedge of the specified loop. Handles loop nests and general +/// loop structures subject to the precondition that the loop has no parent +/// loop and has a single latch block. Preserves all listed analyses. +void breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, MemorySSA *MSSA); + /// Try to promote memory values to scalars by sinking stores out of /// the loop and moving loads to before the loop. We do this by looping over /// the stores in the loop, looking for stores to Must pointers which are diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -26,6 +26,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" + using namespace llvm; #define DEBUG_TYPE "loop-delete" @@ -38,6 +39,14 @@ Deleted, }; +static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) { + if (A == LoopDeletionResult::Deleted || B == LoopDeletionResult::Deleted) + return LoopDeletionResult::Deleted; + if (A == LoopDeletionResult::Modified || B == LoopDeletionResult::Modified) + return LoopDeletionResult::Modified; + return LoopDeletionResult::Unmodified; +} + /// Determines if a loop is dead. /// /// This assumes that we've already checked for unique exit and exiting blocks, @@ -126,6 +135,34 @@ return true; } +/// If we can prove the backedge is untaken, remove it. This destroys the +/// loop, but leaves the (now trivially loop invariant) control flow and +/// side effects (if any) in place. +static LoopDeletionResult +breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, MemorySSA *MSSA, + OptimizationRemarkEmitter &ORE) { + assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); + + if (!L->getLoopLatch()) + return LoopDeletionResult::Unmodified; + + auto *BTC = SE.getBackedgeTakenCount(L); + if (!BTC->isZero()) + return LoopDeletionResult::Unmodified; + + // For non-outermost loops, the tricky case is that we can drop blocks + // out of both inner and outer loops at the same time. This results in + // new exiting block for the outer loop appearing, and possibly needing + // an lcssa phi inserted. (See loop_nest_lcssa test case in zero-btc.ll) + // TODO: We can handle a bunch of cases here without much work, revisit. + if (!L->isOutermost()) + return LoopDeletionResult::Unmodified; + + breakLoopBackedge(L, DT, SE, LI, MSSA); + return LoopDeletionResult::Deleted; +} + /// Remove a loop if it is dead. /// /// A loop is considered dead either if it does not impact the observable @@ -236,6 +273,14 @@ // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE); + + // If we can prove the backedge isn't taken, just break it and be done. This + // leaves the loop structure in place which means it can handle dispatching + // to the right exit based on whatever loop invariant structure remains. + if (Result != LoopDeletionResult::Deleted) + Result = merge(Result, breakBackedgeIfNotTaken(&L, AR.DT, AR.SE, AR.LI, + AR.MSSA, ORE)); + if (Result == LoopDeletionResult::Unmodified) return PreservedAnalyses::all(); @@ -295,6 +340,12 @@ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE); + // If we can prove the backedge isn't taken, just break it and be done. This + // leaves the loop structure in place which means it can handle dispatching + // to the right exit based on whatever loop invariant structure remains. + if (Result != LoopDeletionResult::Deleted) + Result = merge(Result, breakBackedgeIfNotTaken(L, DT, SE, LI, MSSA, ORE)); + if (Result == LoopDeletionResult::Deleted) LPM.markLoopAsDeleted(*L); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -762,6 +762,38 @@ } } +void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, MemorySSA *MSSA) { + + assert(L->isOutermost() && "Can't yet preserve LCSSA for this case"); + auto *Latch = L->getLoopLatch(); + assert(Latch && "multiple latches not yet supported"); + auto *Header = L->getHeader(); + + SE.forgetLoop(L); + + // Note: By splitting the backedge, and then explicitly making it unreachable + // we gracefully handle corner cases such as non-bottom tested loops and the + // like. We also have the benefit of being able to reuse existing well tested + // code. It might be worth special casing the common bottom tested case at + // some point to avoid code churn. + + std::unique_ptr MSSAU; + if (MSSA) + MSSAU = std::make_unique(MSSA); + + auto *BackedgeBB = SplitEdge(Latch, Header, &DT, &LI, MSSAU.get()); + + DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager); + (void)changeToUnreachable(BackedgeBB->getTerminator(), /*UseTrap*/false, + /*PreserveLCSSA*/true, &DTU, MSSAU.get()); + + // Erase (and destroy) this loop instance. Handles relinking sub-loops + // and blocks within the loop as needed. + LI.erase(L); +} + + /// Checks if \p L has single exit through latch block except possibly /// "deoptimizing" exits. Returns branch instruction terminating the loop /// latch if above check is successful, nullptr otherwise. diff --git a/llvm/test/Transforms/IndVarSimplify/exit_value_test2.ll b/llvm/test/Transforms/IndVarSimplify/exit_value_test2.ll --- a/llvm/test/Transforms/IndVarSimplify/exit_value_test2.ll +++ b/llvm/test/Transforms/IndVarSimplify/exit_value_test2.ll @@ -89,8 +89,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[UNKNOWN_NEXT:%.*]] = load volatile i32, i32* [[UNKNOWN_MEM:%.*]] -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[LEAVE:%.*]] +; CHECK-NEXT: [[UNKNOWN_NEXT:%.*]] = load volatile i32, i32* [[UNKNOWN_MEM:%.*]], align 4 +; CHECK-NEXT: br i1 false, label [[LOOP_LOOP_CRIT_EDGE:%.*]], label [[LEAVE:%.*]] +; CHECK: loop.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: leave: ; CHECK-NEXT: ret i32 [[UNKNOWN_INIT:%.*]] ; diff --git a/llvm/test/Transforms/LoopDeletion/update-scev.ll b/llvm/test/Transforms/LoopDeletion/update-scev.ll --- a/llvm/test/Transforms/LoopDeletion/update-scev.ll +++ b/llvm/test/Transforms/LoopDeletion/update-scev.ll @@ -44,7 +44,8 @@ %conv10 = zext i1 %cmp9 to i32 %and = and i32 %conv10, %g.138 %inc = add i32 %h.039, 1 - br i1 undef, label %for.inc11, label %for.body6 + %exit = icmp eq i32 %inc, 20000 + br i1 %exit, label %for.inc11, label %for.body6 for.inc11: ; preds = %for.body6 %and.lcssa = phi i32 [ %and, %for.body6 ] diff --git a/llvm/test/Transforms/LoopDeletion/zero-btc.ll b/llvm/test/Transforms/LoopDeletion/zero-btc.ll --- a/llvm/test/Transforms/LoopDeletion/zero-btc.ll +++ b/llvm/test/Transforms/LoopDeletion/zero-btc.ll @@ -9,7 +9,9 @@ ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: store i32 0, i32* @G, align 4 -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 false, label [[LOOP_LOOP_CRIT_EDGE:%.*]], label [[EXIT:%.*]] +; CHECK: loop.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -30,11 +32,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: store i32 0, i32* @G, align 4 -; CHECK-NEXT: [[IV_INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[IV_INC:%.*]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[BE_TAKEN:%.*]] = icmp ne i32 [[IV_INC]], 1 -; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LOOP_LOOP_CRIT_EDGE:%.*]], label [[EXIT:%.*]] +; CHECK: loop.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -57,13 +61,15 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: store i32 0, i32* @G, align 4 -; CHECK-NEXT: [[IV_INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[IV_INC:%.*]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[BE_TAKEN:%.*]] = icmp ne i32 [[IV_INC]], 1 -; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LATCH:%.*]], label [[EXIT:%.*]] ; CHECK: latch: -; CHECK-NEXT: br label [[LOOP]] +; CHECK-NEXT: br label [[LATCH_SPLIT:%.*]] +; CHECK: latch.split: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -88,15 +94,17 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: store i32 0, i32* @G, align 4 -; CHECK-NEXT: [[IV_INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[IV_INC:%.*]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[BE_TAKEN:%.*]] = icmp ne i32 [[IV_INC]], 1 -; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[BE_TAKEN]], label [[LATCH:%.*]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: store i32 1, i32* @G, align 4 ; CHECK-NEXT: [[COND2:%.*]] = icmp ult i32 [[IV_INC]], 30 -; CHECK-NEXT: br i1 [[COND2]], label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 [[COND2]], label [[LATCH_LOOP_CRIT_EDGE:%.*]], label [[EXIT]] +; CHECK: latch.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -127,7 +135,9 @@ ; CHECK-NEXT: br i1 true, label [[LATCH:%.*]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: store i32 1, i32* @G, align 4 -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 false, label [[LATCH_LOOP_CRIT_EDGE:%.*]], label [[EXIT]] +; CHECK: latch.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -218,7 +228,9 @@ ; CHECK-NEXT: br i1 true, label [[LATCH:%.*]], label [[EXIT1:%.*]] ; CHECK: latch: ; CHECK-NEXT: store i32 1, i32* @G, align 4 -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 false, label [[LATCH_LOOP_CRIT_EDGE:%.*]], label [[EXIT2:%.*]] +; CHECK: latch.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit1: ; CHECK-NEXT: ret void ; CHECK: exit2: @@ -254,7 +266,9 @@ ; CHECK-NEXT: [[CND:%.*]] = icmp ult i32 [[IV_INC]], 200 ; CHECK-NEXT: br i1 [[CND]], label [[INNER]], label [[LATCH:%.*]] ; CHECK: latch: -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 false, label [[LATCH_LOOP_CRIT_EDGE:%.*]], label [[EXIT:%.*]] +; CHECK: latch.loop_crit_edge: +; CHECK-NEXT: unreachable ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -317,3 +331,45 @@ exit: ret void } + +; Key point is that inner_latch drops out of the outer loop when +; the inner loop is deleted, and thus the lcssa phi needs to be +; in the inner_latch block to preserve LCSSA. We either have to +; insert the LCSSA phi, or not break the inner backedge. +define void @loop_nest_lcssa() { +; CHECK-LABEL: @loop_nest_lcssa( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 1, 2 +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer_header: +; CHECK-NEXT: br label [[INNER_HEADER:%.*]] +; CHECK: inner_header: +; CHECK-NEXT: br i1 false, label [[INNER_LATCH:%.*]], label [[OUTER_LATCH:%.*]] +; CHECK: inner_latch: +; CHECK-NEXT: br i1 false, label [[INNER_HEADER]], label [[LOOPEXIT:%.*]] +; CHECK: outer_latch: +; CHECK-NEXT: br label [[OUTER_HEADER]] +; CHECK: loopexit: +; CHECK-NEXT: [[DOTLCSSA32:%.*]] = phi i32 [ [[TMP0]], [[INNER_LATCH]] ] +; CHECK-NEXT: unreachable +; +entry: + br label %outer_header + +outer_header: + %0 = add i32 1, 2 + br label %inner_header + +inner_header: + br i1 false, label %inner_latch, label %outer_latch + +inner_latch: + br i1 false, label %inner_header, label %loopexit + +outer_latch: + br label %outer_header + +loopexit: + %.lcssa32 = phi i32 [ %0, %inner_latch ] + unreachable +}