diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h --- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h @@ -15,9 +15,7 @@ namespace llvm { -class Loop; -struct LoopStandardAnalysisResults; -class LPMUpdater; +class Function; /// A simple loop rotation transformation. class LoopUnrollAndJamPass : public PassInfoMixin { @@ -25,8 +23,7 @@ public: explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -970,8 +970,7 @@ // across the loop nests. // We do UnrollAndJam in a separate LPM to ensure it happens before unroll if (EnableUnrollAndJam && PTO.LoopUnrolling) { - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); + OptimizePM.addPass(LoopUnrollAndJamPass(Level)); } OptimizePM.addPass(LoopUnrollPass( LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -235,6 +235,7 @@ FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) +FUNCTION_PASS("unroll-and-jam", LoopUnrollAndJamPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) @@ -307,7 +308,6 @@ LOOP_PASS("strength-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) LOOP_PASS("irce", 
IRCEPass()) -LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass()) LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -427,51 +427,69 @@ return UnrollResult; } +/// Runs unroll-and-jam on the loops that appendLoopsToWorklist() collects +/// from \p LI, forming LCSSA for each loop before it is processed. +/// \returns true if formLCSSA or the per-loop unroll-and-jam changed anything. +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + SmallPriorityWorklist<Loop *, 4> Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + bool DidSomething = false; + // LI may contain no loops, leaving the worklist empty, so test it before the + // first pop_back_val(). + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + DidSomething |= formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis().getDomTree(); - LoopInfo *LI = &getAnalysis().getLoopInfo(); + LoopInfo &LI = getAnalysis().getLoopInfo(); ScalarEvolution &SE = getAnalysis().getSE(); const TargetTransformInfo &TTI = 
getAnalysis().getTTI(F); auto &AC = getAnalysis().getAssumptionCache(F); auto &DI = getAnalysis().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); - getLoopAnalysisUsage(AU); + AU.addRequired(); } }; @@ -492,26 +505,18 @@ return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult(*F); - // FIXME: This should probably be optional rather than required. 
- if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); + TargetTransformInfo &TTI = AM.getResult(F); + AssumptionCache &AC = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + DependenceInfo &DI = AM.getResult(F); + OptimizationRemarkEmitter &ORE = + AM.getResult(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll @@ -10,10 +10,13 @@ define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -35,7 +38,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72 store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void 
@@ -50,10 +56,13 @@ define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -75,7 +84,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -88,10 +100,13 @@ define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -113,7 +128,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72 store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -128,10 +146,13 @@ define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, 
label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -153,7 +174,10 @@ for.latch: %add7 = add nuw nsw i32 %i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -168,10 +192,13 @@ define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -193,7 +220,10 @@ for.latch: %add7 = add nuw nsw i32 %i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -206,10 +236,13 @@ define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i store i32 1, i32* %arrayidx, align 4 br label %for.inner @@ -231,7 +264,10 @@ for.latch: %add7 = add nuw nsw i32 
%i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -246,10 +282,13 @@ define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -271,7 +310,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72 store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -286,10 +328,13 @@ define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -311,7 +356,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -324,10 +372,13 @@ define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, 
label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -349,7 +400,10 @@ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72 store i32 %add, i32* %arrayidx8, align 4 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -362,10 +416,13 @@ define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -387,7 +444,10 @@ for.latch: %add7 = add nuw nsw i32 %i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -400,10 +460,13 @@ define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -425,7 +488,10 @@ for.latch: %add7 = add nuw nsw i32 %i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label 
%cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void @@ -438,10 +504,13 @@ define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) { entry: %cmp = icmp sgt i32 %N, 0 - br i1 %cmp, label %for.outer, label %cleanup + br i1 %cmp, label %for.outer.preheader, label %cleanup + +for.outer.preheader: + br label %for.outer for.outer: - %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ] + %i = phi i32 [ %add7, %for.latch ], [ 0, %for.outer.preheader ] br label %for.inner for.inner: @@ -463,7 +532,10 @@ for.latch: %add7 = add nuw nsw i32 %i, 1 %exitcond29 = icmp eq i32 %add7, %N - br i1 %exitcond29, label %cleanup, label %for.outer + br i1 %exitcond29, label %cleanup.loopexit, label %for.outer + +cleanup.loopexit: + br label %cleanup cleanup: ret void diff --git a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll @@ -44,7 +44,10 @@ %inc8 = add nuw nsw i32 %b.028, 1 %add10 = add nuw nsw i32 %i.029, 1 %exitcond30 = icmp eq i32 %add10, %I - br i1 %exitcond30, label %return, label %for.outer + br i1 %exitcond30, label %return.loopexit, label %for.outer + +return.loopexit: + br label %return return: ret void @@ -71,11 +74,14 @@ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032 %0 = load i32, i32* %arrayidx, align 4 %tobool = icmp eq i32 %0, 0 - br i1 %tobool, label %for.latch, label %for.inner + br i1 %tobool, label %for.latch, label %for.inner.preheader + +for.inner.preheader: + br label %for.inner for.inner: - %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ] - %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ] + %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ] + %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.inner.preheader ] %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030 
%1 = load i32, i32* %arrayidx6, align 4 %tobool7 = icmp eq i32 %1, 0 @@ -84,15 +90,21 @@ %sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add %inc = add nuw i32 %j.030, 1 %exitcond = icmp eq i32 %inc, %J - br i1 %exitcond, label %for.latch, label %for.inner + br i1 %exitcond, label %for.latch.loopexit, label %for.inner + +for.latch.loopexit: + br label %for.latch for.latch: - %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ] + %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.latch.loopexit ] %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032 store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4 %add13 = add nuw i32 %i.032, 1 %exitcond33 = icmp eq i32 %add13, %I - br i1 %exitcond33, label %for.end14, label %for.outer + br i1 %exitcond33, label %for.end14.loopexit, label %for.outer + +for.end14.loopexit: + br label %for.end14 for.end14: ret void @@ -190,6 +202,9 @@ %exitcond24 = icmp eq i32 %indvars.iv.next, %I br i1 %exitcond24, label %for.end9, label %for.outer +for.end9.loopexit: + br label %for.end9 + for.end9: ret void } @@ -306,6 +321,9 @@ %exitcond = icmp eq i32 %add9, %J br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6 +for.end12.loopexit: + br label %for.end12 + for.end12: ret void } @@ -359,6 +377,9 @@ %exitcond = icmp eq i32 %add13, %J br i1 %exitcond, label %for.latch, label %for.inner +for.end18.loopexit: + br label %for.end18 + for.end18: ret void } @@ -576,11 +597,14 @@ %add = add i32 %0, %sum1 %inc = add nuw i32 %j, 1 %exitcond = icmp eq i32 %inc, %J - br i1 %exitcond, label %for.inner2, label %for.inner + br i1 %exitcond, label %for.inner2.preheader, label %for.inner + +for.inner2.preheader: + br label %for.inner2 for.inner2: - %j2 = phi i32 [ 0, %for.inner ], [ %inc2, %for.inner2 ] - %sum12 = phi i32 [ 0, %for.inner ], [ %add2, %for.inner2 ] + %j2 = phi i32 [ %inc2, %for.inner2 ], [ 0, %for.inner2.preheader ] + %sum12 = phi i32 [ %add2, %for.inner2 ], [ 0, %for.inner2.preheader ] 
%arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %j2 %l0 = load i32, i32* %arrayidx2, align 4 %add2 = add i32 %l0, %sum12 @@ -622,11 +646,14 @@ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ] %add8 = add nuw i32 %i, 1 %exitcond23 = icmp eq i32 %add8, %I - br i1 %exitcond23, label %for.end.loopexit, label %for.inner + br i1 %exitcond23, label %for.end.loopexit, label %for.inner.preheader + +for.inner.preheader: + br label %for.inner for.inner: - %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ] - %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ] + %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ] + %sum1 = phi i32 [ %add, %for.inner ], [ 0, %for.inner.preheader ] %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j %0 = load i32, i32* %arrayidx, align 4 %add = add i32 %0, %sum1 @@ -667,11 +694,14 @@ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ] %add8 = add nuw i32 %i, 1 %exitcond25 = icmp eq i32 %add8, %I - br i1 %exitcond25, label %for.end.loopexit, label %for.inner + br i1 %exitcond25, label %for.end.loopexit, label %for.inner.preheader + +for.inner.preheader: + br label %for.inner for.inner: - %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ] - %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ] + %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ] + %sum1 = phi i32 [ %add, %for.inner ], [ 0, %for.inner.preheader ] %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j %0 = load i32, i32* %arrayidx, align 4 %add = add i32 %0, %sum1 diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll @@ -64,9 +64,12 @@ %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext %inc30.us = add nuw nsw i32 %y.055.us, 1 %exitcond58 = icmp eq i32 %inc30.us, %height - br i1 %exitcond58, 
label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !5 + br i1 %exitcond58, label %for.cond.cleanup.loopexit, label %for.cond1.preheader.us, !llvm.loop !5 -for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond1.preheader.lr.ph, %entry ret void } @@ -132,9 +135,12 @@ %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext %inc30.us = add nuw nsw i32 %y.055.us, 1 %exitcond58 = icmp eq i32 %inc30.us, %height - br i1 %exitcond58, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !7 + br i1 %exitcond58, label %for.cond.cleanup.loopexit, label %for.cond1.preheader.us, !llvm.loop !7 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond1.preheader.lr.ph, %entry ret void } diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -297,7 +297,10 @@ store i32 %add, i32* %arrayidx6, align 4, !tbaa !5 %add8 = add nuw nsw i32 %i, 1 %exitcond23 = icmp eq i32 %add8, 4 - br i1 %exitcond23, label %for.end, label %for.outer + br i1 %exitcond23, label %for.end.loopexit, label %for.outer + +for.end.loopexit: + br label %for.end for.end: ret void @@ -344,7 +347,10 @@ store i32 %add, i32* %arrayidx6, align 4, !tbaa !5 %add8 = add nuw nsw i32 %i, 1 %exitcond23 = icmp eq i32 %add8, 1 - br i1 %exitcond23, label %for.end, label %for.outer + br i1 %exitcond23, label %for.end.loopexit, label 
%for.outer + +for.end.loopexit: + br label %for.end for.end: ret void @@ -509,7 +515,7 @@ for.latch: store i32 %add9, i32* %arrayidx, align 4, !tbaa !5 %exitcond30 = icmp eq i32 %add, %I - br i1 %exitcond30, label %for.end, label %for.outer + br i1 %exitcond30, label %for.end.loopexit, label %for.outer for.inner: %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ] @@ -521,6 +527,9 @@ %exitcond = icmp eq i32 %add10, %J br i1 %exitcond, label %for.latch, label %for.inner +for.end.loopexit: + br label %for.end + for.end: ret void } @@ -597,7 +606,10 @@ for.cleanup: %inc = add nuw nsw i32 %x.038, 1 %exitcond41 = icmp eq i32 %inc, 5 - br i1 %exitcond41, label %for.end, label %for.outest + br i1 %exitcond41, label %for.end.loopexit, label %for.outest + +for.end.loopexit: + br label %for.end for.end: ret void