Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,6 +92,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
 #include "llvm/Transforms/Vectorize.h"
@@ -2134,8 +2135,6 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequiredID(LoopSimplifyID);
-    AU.addRequiredID(LCSSAID);
     AU.addRequired<BlockFrequencyInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
@@ -7169,9 +7168,7 @@
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
@@ -7543,6 +7540,8 @@
     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
   }
 
+  formLCSSARecursively(*L, *DT, LI, SE);
+
   using namespace ore;
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -7618,6 +7617,16 @@
   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
     return false;
 
+  bool Changed = false;
+
+  // The vectorizer requires loops to be in simplified form.
+  // Since simplification may add new inner loops, it has to run before the
+  // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+  // vectorized.
+  for (auto &L : *LI)
+    Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
   // Build up a worklist of inner-loops to vectorize. This is necessary as
   // the act of vectorizing or partially unrolling a loop creates new loops
   // and can invalidate iterators across the loops.
@@ -7629,7 +7638,6 @@
   LoopsAnalyzed += Worklist.size();
 
   // Now walk the identified inner loops.
-  bool Changed = false;
   while (!Worklist.empty())
     Changed |= processLoop(Worklist.pop_back_val());
 
Index: llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; We vectorize the inner loop, so we have to put it in LCSSA form.
+; However, there's no reason to touch the outer loop.
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) { ; two-level loop nest; the inner loop copies A[i] to B[i], and each loop's IV is stored after that loop exits
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader:                         ; preds = %entry
+  br label %for.body.outer
+
+for.body.outer:                                   ; preds = %for.body.outer.preheader, %for.end.inner
+  %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ] ; outer-loop induction variable
+  %cmp2 = icmp sgt i64 %m, 0
+  br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader:                         ; preds = %for.body.outer
+  br label %for.body.inner
+
+for.body.inner:                                   ; preds = %for.body.inner.preheader, %for.body.inner
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ] ; inner-loop induction variable
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %v = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, %n ; exit test compares the pre-increment IV against %n
+  br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit:                           ; preds = %for.body.inner
+  store i64 %indvars.iv, i64 *%O1, align 4 ; inner IV used outside the inner loop; the FileCheck expectations above require this store to read an LCSSA phi after vectorization
+  br label %for.end.inner
+
+for.end.inner:                                    ; preds = %for.end.inner.loopexit, %for.body.outer
+  %indvars.outer.next = add i64 %indvars.outer, 1
+  %exitcond.outer = icmp eq i64 %indvars.outer, %m
+  br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit:                           ; preds = %for.end.inner
+  store i64 %indvars.outer, i64 *%O2, align 4 ; outer IV used outside the outer loop; the FileCheck expectations above require this to remain a direct use (no LCSSA phi inserted)
+  br label %for.end.outer
+
+for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
+  ret i64 undef
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
@@ -9,13 +9,6 @@
 ; Since %inc54 is the IV of the outer loop, and %0 equivalent to it,
 ; we get the situation described above.
 
-; This test uses the new PM, because with the old PM, running loop-vectorize
-; would explicitly run loop-simplify. Even though this loop is already in
-; simplified form, loop-simplify would still clean up the phi.
-; The reason this matters is that in a real optimizer pipeline, LICM can create
-; such PHIs, and since it preserves loop simplified form, the cleanup has
-; no chance to run.
-
 ; Code that leads to this situation can look something like:
 ;
 ; int a, b[1], c;
@@ -28,11 +21,14 @@
 ;
 ; The PHI is an artifact of the register promotion of c.
 
+; Note that we can no longer get the vectorizer to actually see such PHIs,
+; because LV now simplifies the loop internally, but the test is still
+; useful as a regression test, and in case loop-simplify behavior changes.
+
 @c = external global i32, align 4
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: PHI is a recurrence with respect to an outer loop.
 ; CHECK: LV: Not vectorizing: Cannot prove legality.
 ; CHECK-LABEL: @test
 define void @test() {