Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -92,6 +92,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" @@ -2134,8 +2135,6 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -7169,9 +7168,7 @@ INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) @@ -7543,6 +7540,8 @@ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + formLCSSARecursively(*L, *DT, LI, SE); + using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -7618,6 +7617,16 @@ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) return false; + bool Changed = false; + + // The vectorizer requires loops to be in simplified form. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop vectorizer + // will simplify all loops, regardless of whether anything ends up being + // vectorized. 
+ for (auto &L : *LI) + Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */); + // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. @@ -7629,7 +7638,6 @@ LoopsAnalyzed += Worklist.size(); // Now walk the identified inner loops. - bool Changed = false; while (!Worklist.empty()) Changed |= processLoop(Worklist.pop_back_val()); Index: llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll +++ llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; We vectorize the inner loop, so we have to put it in LCSSA form. +; However, there's no reason to touch the outer loop. + +; CHECK-LABEL: @foo +; CHECK-LABEL: for.end.inner.loopexit: +; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ] +; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4 +; CHECK-LABEL: for.end.outer.loopexit +; CHECK: store i64 %indvars.outer, i64* %O2, align 4 + + +define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) { +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer + +for.body.outer.preheader: ; preds = %entry + br label %for.body.outer + +for.body.outer: ; preds = %for.body.outer.preheader, %for.end.inner + %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ] + %cmp2 = icmp sgt i64 %m, 0 + br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner + +for.body.inner.preheader: ; preds = %for.body.outer + br label %for.body.inner + +for.body.inner: ; preds = %for.body.inner.preheader, %for.body.inner + 
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %v = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %v, i32* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv, %n + br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner + +for.end.inner.loopexit: ; preds = %for.body.inner + store i64 %indvars.iv, i64 *%O1, align 4 + br label %for.end.inner + +for.end.inner: ; preds = %for.end.inner.loopexit, %for.body.outer + %indvars.outer.next = add i64 %indvars.outer, 1 + %exitcond.outer = icmp eq i64 %indvars.outer, %m + br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer + +for.end.outer.loopexit: ; preds = %for.end.inner + store i64 %indvars.outer, i64 *%O2, align 4 + br label %for.end.outer + +for.end.outer: ; preds = %for.end.outer.loopexit, %entry + ret i64 undef +} Index: llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll +++ llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll @@ -9,13 +9,6 @@ ; Since %inc54 is the IV of the outer loop, and %0 equivalent to it, ; we get the situation described above. -; This test uses the new PM, because with the old PM, running loop-vectorize -; would explicitly run loop-simplify. Even though this loop is already in -; simplified form, loop-simplify would still clean up the phi. -; The reason this matters is that in a real optimizer pipeline, LICM can create -; such PHIs, and since it preserves loop simplified form, the cleanup has -; no chance to run. - ; Code that leads to this situation can look something like: ; ; int a, b[1], c; @@ -28,11 +21,14 @@ ; ; The PHI is an artifact of the register promotion of c. 
+; Note that we can no longer get the vectorizer to actually see such PHIs, +; because LV now simplifies the loop internally, but the test is still +; useful as a regression test, and in case loop-simplify behavior changes. + @c = external global i32, align 4 @a = external global i32, align 4 @b = external global [1 x i32], align 4 -; CHECK: LV: PHI is a recurrence with respect to an outer loop. ; CHECK: LV: Not vectorizing: Cannot prove legality. ; CHECK-LABEL: @test define void @test() {