Index: lib/Transforms/Scalar/LoopPassManager.cpp =================================================================== --- lib/Transforms/Scalar/LoopPassManager.cpp +++ lib/Transforms/Scalar/LoopPassManager.cpp @@ -42,6 +42,13 @@ break; } +#ifndef NDEBUG + // Verify the loop structure and LCSSA form before visiting the loop. + L.verifyLoop(); + assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) && + "Loops must remain in LCSSA form!"); +#endif + // Update the analysis manager as each pass runs and potentially // invalidates analyses. AM.invalidate(L, PassPA); Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -114,6 +114,16 @@ cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt UnrollRevisitChildLoops( + "unroll-revisit-child-loops", cl::Hidden, + cl::desc("This tells the unroller to enqueue child loops in the loop pass " + "manager to be re-visited after unrolling. Typically, this " + "shouldn't be needed as child loops were already visited (or the " + "child loop they are cloned from was already visited). This " + "option allows experimenting with re-running the loop pipeline " + "over them after unrolling in case this exposes further " + "opportunities.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -1117,7 +1127,7 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &) { + LPMUpdater &Updater) { const auto &FAM = AM.getResult(L, AR).getManager(); Function *F = L.getHeader()->getParent(); @@ -1128,6 +1138,15 @@ report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); + // Keep track of the previous loop structure so we can identify new loops + // created by unrolling. + Loop *ParentL = L.getParentLoop(); + SmallPtrSet OldLoops; + if (ParentL) + OldLoops.insert(ParentL->begin(), ParentL->end()); + else + OldLoops.insert(AR.LI.begin(), AR.LI.end()); + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, /*PreserveLCSSA*/ true, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, @@ -1135,5 +1154,58 @@ if (!Changed) return PreservedAnalyses::all(); + // The parent must not be damaged by unrolling! + if (ParentL) + ParentL->verifyLoop(); + + // Unrolling can do several things to introduce new loops into a loop nest: + // - Partial unrolling clones child loops within the current loop. + // - Full unrolling clones child loops within the current loop but then + // removes the current loop making all of the children appear to be new + // sibling loops. + // - Loop peeling can directly introduce new sibling loops by peeling one + // iteration. + // + // When a new loop appears as a sibling loop, either from peeling an + // iteration or fully unrolling, its nesting structure has fundamentally + // changed and we want to revisit it to reflect that. + // + // When unrolling has removed the current loop, we need to tell the + // infrastructure that it is gone. + // + // Finally, we support a debugging/testing mode where we revisit child loops + // as well. These are not expected to require further optimizations as either + // they or the loop they were cloned from have been directly visited already. + // But the debugging mode allows us to check this assumption. + bool IsCurrentLoopValid = false; + SmallVector SibLoops; + if (ParentL) + SibLoops.append(ParentL->begin(), ParentL->end()); + else + SibLoops.append(AR.LI.begin(), AR.LI.end()); + erase_if(SibLoops, [&](Loop *SibLoop) { + if (SibLoop == &L) { + IsCurrentLoopValid = true; + return true; + } + + // Otherwise erase the loop from the list if it was in the old loops. + return OldLoops.count(SibLoop) != 0; + }); + Updater.addSiblingLoops(SibLoops); + + if (!IsCurrentLoopValid) { + Updater.markLoopAsDeleted(L); + } else { + // We can only walk child loops if the current loop remained valid. + if (UnrollRevisitChildLoops) { + // Walk *all* of the child loops. This is a highly speculative mode + // anyways so look for any simplifications that arose from partial + // unrolling or peeling off of iterations. + SmallVector ChildLoops(L.begin(), L.end()); + Updater.addChildLoops(ChildLoops); + } + } + return getLoopPassPreservedAnalyses(); } Index: test/Transforms/LoopUnroll/basic.ll =================================================================== --- test/Transforms/LoopUnroll/basic.ll +++ test/Transforms/LoopUnroll/basic.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-unroll -S | FileCheck %s +; RUN: opt < %s -passes='require,loop(unroll)' -S | FileCheck %s ; This should not unroll since the address of the loop header is taken. Index: test/Transforms/LoopUnroll/full-unroll-bad-cost.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-bad-cost.ll +++ test/Transforms/LoopUnroll/full-unroll-bad-cost.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-unroll < %s | FileCheck %s +; RUN: opt < %s -passes='require,loop(unroll)' -S | FileCheck %s ; LLVM should not try to fully unroll this loop. Index: test/Transforms/LoopUnroll/full-unroll-crashers.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-crashers.ll +++ test/Transforms/LoopUnroll/full-unroll-crashers.ll @@ -1,5 +1,6 @@ ; Check that we don't crash on corner cases. ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-max-percent-threshold-boost=200 -o /dev/null +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-max-percent-threshold-boost=200 -o /dev/null target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=12 -unroll-max-percent-threshold-boost=400 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=12 -unroll-max-percent-threshold-boost=400 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the Index: test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define i64 @propagate_loop_phis() { Index: test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll +++ test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-unroll < %s | FileCheck %s +; RUN: opt -S -passes='require,loop(unroll)' < %s | FileCheck %s ; Unroll twice, with first loop exit kept ; CHECK-LABEL: @s32_max1 Index: test/Transforms/LoopUnroll/revisit.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/revisit.ll @@ -0,0 +1,150 @@ +; This test checks that nested loops are revisited in various scenarios when +; unrolling. Note that if we ever start doing outer loop peeling a test case +; for that should be added here that will look essentially like a hybrid of the +; current two cases. +; +; RUN: opt < %s -disable-output -debug-pass-manager 2>&1 \ +; RUN: -passes='require,loop(unroll)' \ +; RUN: | FileCheck %s +; +; Also run in a special mode that visits children. +; RUN: opt < %s -disable-output -debug-pass-manager -unroll-revisit-child-loops 2>&1 \ +; RUN: -passes='require,loop(unroll)' \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-CHILDREN + +; Basic test is fully unrolled and we revisit the post-unroll new sibling +; loops, including the ones that used to be child loops. +define void @full_unroll(i1* %ptr) { +; CHECK-LABEL: FunctionToLoopPassAdaptor{{.*}} on full_unroll +; CHECK-NOT: LoopUnrollPass + +entry: + br label %l0 + +l0: + %cond.0 = load volatile i1, i1* %ptr + br i1 %cond.0, label %l0.0.ph, label %exit + +l0.0.ph: + br label %l0.0 + +l0.0: + %iv = phi i32 [ %iv.next, %l0.0.latch ], [ 0, %l0.0.ph ] + %iv.next = add i32 %iv, 1 + br label %l0.0.0.ph + +l0.0.0.ph: + br label %l0.0.0 + +l0.0.0: + %cond.0.0.0 = load volatile i1, i1* %ptr + br i1 %cond.0.0.0, label %l0.0.0, label %l0.0.1.ph +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0
+; CHECK-NOT: LoopUnrollPass + +l0.0.1.ph: + br label %l0.0.1 + +l0.0.1: + %cond.0.0.1 = load volatile i1, i1* %ptr + br i1 %cond.0.0.1, label %l0.0.1, label %l0.0.latch +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1
+; CHECK-NOT: LoopUnrollPass + +l0.0.latch: + %cmp = icmp slt i32 %iv.next, 2 + br i1 %cmp, label %l0.0, label %l0.latch +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0 +; CHECK-NOT: LoopUnrollPass +; +; Unrolling occurs, so we visit what were the inner loops twice over. First we +; visit their clones, and then we visit the original loops re-parented. +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.1
+; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.1
+; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1
+; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0
+; CHECK-NOT: LoopUnrollPass + +l0.latch: + br label %l0 +; CHECK: LoopUnrollPass on Loop at depth 1 containing: %l0
+; CHECK-NOT: LoopUnrollPass + +exit: + ret void +} + +; Now we test forced runtime partial unrolling with metadata. Here we end up +; duplicating child loops without changing their structure and so they aren't by +; default visited, but will be visited with a special parameter. +define void @partial_unroll(i32 %count, i1* %ptr) { +; CHECK-LABEL: FunctionToLoopPassAdaptor{{.*}} on partial_unroll +; CHECK-NOT: LoopUnrollPass + +entry: + br label %l0 + +l0: + %cond.0 = load volatile i1, i1* %ptr + br i1 %cond.0, label %l0.0.ph, label %exit + +l0.0.ph: + br label %l0.0 + +l0.0: + %iv = phi i32 [ %iv.next, %l0.0.latch ], [ 0, %l0.0.ph ] + %iv.next = add i32 %iv, 1 + br label %l0.0.0.ph + +l0.0.0.ph: + br label %l0.0.0 + +l0.0.0: + %cond.0.0.0 = load volatile i1, i1* %ptr + br i1 %cond.0.0.0, label %l0.0.0, label %l0.0.1.ph +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0
+; CHECK-NOT: LoopUnrollPass + +l0.0.1.ph: + br label %l0.0.1 + +l0.0.1: + %cond.0.0.1 = load volatile i1, i1* %ptr + br i1 %cond.0.0.1, label %l0.0.1, label %l0.0.latch +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1
+; CHECK-NOT: LoopUnrollPass + +l0.0.latch: + %cmp = icmp slt i32 %iv.next, %count + br i1 %cmp, label %l0.0, label %l0.latch, !llvm.loop !1 +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0 +; CHECK-NOT: LoopUnrollPass +; +; Partial unrolling occurs which introduces new child loops but not new sibling +; loops. We only visit the child loops in a special mode, not by default. +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0
+; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1
+; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0.1
+; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1.1
+; CHECK-CHILDREN-NOT: LoopUnrollPass +; +; When we revisit children, we also revisit the current loop. +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 2 containing: %l0.0
+; CHECK-CHILDREN-NOT: LoopUnrollPass + +l0.latch: + br label %l0 +; CHECK: LoopUnrollPass on Loop at depth 1 containing: %l0
+; CHECK-NOT: LoopUnrollPass + +exit: + ret void +} +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.count", i32 2} Index: test/Transforms/LoopUnroll/runtime-loop.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop.ll +++ test/Transforms/LoopUnroll/runtime-loop.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Tests for unrolling loops with run-time trip counts Index: test/Transforms/LoopUnroll/runtime-loop1.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop1.ll +++ test/Transforms/LoopUnroll/runtime-loop1.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + ; This tests that setting the unroll count works Index: test/Transforms/LoopUnroll/runtime-loop2.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop2.ll +++ test/Transforms/LoopUnroll/runtime-loop2.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=true -unroll-count=8 | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=true -unroll-count=8 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + ; Choose a smaller, power-of-two, unroll count if the loop is too large. ; This test makes sure we're not unrolling 'odd' counts Index: test/Transforms/LoopUnroll/runtime-loop3.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop3.ll +++ test/Transforms/LoopUnroll/runtime-loop3.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts ; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -disable-output -stats -passes='require,loop(unroll)' -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS ; Test that nested loops can be unrolled. We need to increase threshold to do it Index: test/Transforms/LoopUnroll/runtime-loop5.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop5.ll +++ test/Transforms/LoopUnroll/runtime-loop5.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=16 | FileCheck --check-prefix=UNROLL-16 %s ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=4 | FileCheck --check-prefix=UNROLL-4 %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime=true -unroll-count=16 | FileCheck --check-prefix=UNROLL-16 %s +; RUN: opt < %s -S -passes='require,loop(unroll)' -unroll-runtime=true -unroll-count=4 | FileCheck --check-prefix=UNROLL-4 %s + ; Given that the trip-count of this loop is a 3-bit value, we cannot ; safely unroll it with a count of anything more than 8. Index: test/Transforms/LoopUnroll/unloop.ll =================================================================== --- test/Transforms/LoopUnroll/unloop.ll +++ test/Transforms/LoopUnroll/unloop.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -verify-loop-info | FileCheck %s -; RUN: opt < %s -S -passes='function(require,require,require,loop(unroll),verify)' | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(unroll),verify' | FileCheck %s ; ; Unit tests for LoopInfo::markAsRemoved. Index: test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll =================================================================== --- test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll +++ test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll @@ -1,4 +1,5 @@ ; RUN: opt -S < %s -loop-unroll -block-freq | FileCheck %s +; RUN: opt -S < %s -passes='require,loop(unroll),require' | FileCheck %s ; Crasher from PR20987. ; CHECK: define void @update_loop_info_in_subloops