Index: llvm/trunk/include/llvm/Analysis/Utils/Local.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/Utils/Local.h
+++ llvm/trunk/include/llvm/Analysis/Utils/Local.h
@@ -141,6 +141,18 @@
 bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
                                         const TargetLibraryInfo *TLI = nullptr);
 
+/// Delete all of the instructions in `DeadInsts`, and all other instructions
+/// that deleting these in turn causes to be trivially dead.
+///
+/// The initial instructions in the provided vector must all have empty use
+/// lists and satisfy `isInstructionTriviallyDead`.
+///
+/// `DeadInsts` will be used as scratch storage for this routine and will be
+/// empty afterward.
+void RecursivelyDeleteTriviallyDeadInstructions(
+    SmallVectorImpl<Instruction *> &DeadInsts,
+    const TargetLibraryInfo *TLI = nullptr);
+
 /// If the specified value is an effectively dead PHI node, due to being a
 /// def-use chain of single-use nodes that either forms a cycle or is terminated
 /// by a trivially dead instruction, delete it. If that makes any of its
Index: llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/Utils/Local.h"
@@ -45,118 +46,116 @@
 
 STATISTIC(NumSimplified, "Number of redundant instructions simplified");
 
-static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
-                             AssumptionCache *AC,
-                             const TargetLibraryInfo *TLI) {
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L->getUniqueExitBlocks(ExitBlocks);
-  array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
-
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                             AssumptionCache &AC,
+                             const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+  SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+  // On the first pass over the loop body we try to simplify every instruction.
+  // On subsequent passes, we can restrict this to only simplifying instructions
+  // where the inputs have been updated. We end up needing two sets: one
+  // containing the instructions we are simplifying in *this* pass, and one for
+  // the instructions we will want to simplify in the *next* pass. We use
+  // pointers so we can swap between two stably allocated sets.
   SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
 
-  // The bit we are stealing from the pointer represents whether this basic
-  // block is the header of a subloop, in which case we only process its phis.
-  using WorklistItem = PointerIntPair<BasicBlock *, 1>;
-  SmallVector<WorklistItem, 16> VisitStack;
-  SmallPtrSet<BasicBlock *, 32> Visited;
+  // Track the PHI nodes that have already been visited during each iteration so
+  // that we can identify when it is necessary to iterate.
+  SmallPtrSet<PHINode *, 4> VisitedPHIs;
+
+  // While simplifying we may discover dead code or cause code to become dead.
+  // Keep track of all such instructions and we will delete them at the end.
+  SmallVector<Instruction *, 8> DeadInsts;
+
+  // First we want to create an RPO traversal of the loop body. By processing in
+  // RPO we can ensure that definitions are processed prior to uses (for non PHI
+  // uses) in all cases. This ensures we maximize the simplifications in each
+  // iteration over the loop and minimizes the possible causes for continuing to
+  // iterate.
+  LoopBlocksRPO RPOT(&L);
+  RPOT.perform(&LI);
 
   bool Changed = false;
-  bool LocalChanged;
-  do {
-    LocalChanged = false;
-
-    VisitStack.clear();
-    Visited.clear();
-
-    VisitStack.push_back(WorklistItem(L->getHeader(), false));
-
-    while (!VisitStack.empty()) {
-      WorklistItem Item = VisitStack.pop_back_val();
-      BasicBlock *BB = Item.getPointer();
-      bool IsSubloopHeader = Item.getInt();
-      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
-      // Simplify instructions in the current basic block.
-      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
-        Instruction *I = &*BI++;
-
-        // The first time through the loop ToSimplify is empty and we try to
-        // simplify all instructions. On later iterations ToSimplify is not
-        // empty and we only bother simplifying instructions that are in it.
-        if (!ToSimplify->empty() && !ToSimplify->count(I))
+  for (;;) {
+    for (BasicBlock *BB : RPOT) {
+      for (Instruction &I : *BB) {
+        if (auto *PI = dyn_cast<PHINode>(&I))
+          VisitedPHIs.insert(PI);
+
+        if (I.use_empty()) {
+          if (isInstructionTriviallyDead(&I, &TLI))
+            DeadInsts.push_back(&I);
           continue;
-
-        // Don't bother simplifying unused instructions.
-        if (!I->use_empty()) {
-          Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
-          if (V && LI->replacementPreservesLCSSAForm(I, V)) {
-            // Mark all uses for resimplification next time round the loop.
-            for (User *U : I->users())
-              Next->insert(cast<Instruction>(U));
-
-            I->replaceAllUsesWith(V);
-            LocalChanged = true;
-            ++NumSimplified;
-          }
-        }
-        if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
-          // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
-          // instruction, so simply incrementing the iterator does not work.
-          // When instructions get deleted re-iterate instead.
-          BI = BB->begin();
-          BE = BB->end();
-          LocalChanged = true;
         }
 
-        if (IsSubloopHeader && !isa<PHINode>(I))
-          break;
-      }
+        // We special case the first iteration which we can detect due to the
+        // empty `ToSimplify` set.
+        bool IsFirstIteration = ToSimplify->empty();
 
-      // Add all successors to the worklist, except for loop exit blocks and the
-      // bodies of subloops. We visit the headers of loops so that we can
-      // process
-      // their phis, but we contract the rest of the subloop body and only
-      // follow
-      // edges leading back to the original loop.
-      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
-           ++SI) {
-        BasicBlock *SuccBB = *SI;
-        if (!Visited.insert(SuccBB).second)
+        if (!IsFirstIteration && !ToSimplify->count(&I))
           continue;
 
-        const Loop *SuccLoop = LI->getLoopFor(SuccBB);
-        if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
-            L->contains(SuccLoop)) {
-          VisitStack.push_back(WorklistItem(SuccBB, true));
-
-          SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
-          SuccLoop->getExitBlocks(SubLoopExitBlocks);
-
-          for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
-            BasicBlock *ExitBB = SubLoopExitBlocks[i];
-            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
-              VisitStack.push_back(WorklistItem(ExitBB, false));
-          }
-
+        Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+        if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
           continue;
-        }
 
-        bool IsExitBlock =
-            std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
-        if (IsExitBlock)
-          continue;
+        for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+             UI != UE;) {
+          Use &U = *UI++;
+          auto *UserI = cast<Instruction>(U.getUser());
+          U.set(V);
+
+          // If the instruction is used by a PHI node we have already processed
+          // we'll need to iterate on the loop body to converge, so add it to
+          // the next set.
+          if (auto *UserPI = dyn_cast<PHINode>(UserI))
+            if (VisitedPHIs.count(UserPI)) {
+              Next->insert(UserPI);
+              continue;
+            }
+
+          // If we are only simplifying targeted instructions and the user is an
+          // instruction in the loop body, add it to our set of targeted
+          // instructions. Because we process defs before uses (outside of PHIs)
+          // we won't have visited it yet.
+          //
+          // We also skip any uses outside of the loop being simplified. Those
+          // should always be PHI nodes due to LCSSA form, and we don't want to
+          // try to simplify those away.
+          assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+                 "Uses outside the loop should be PHI nodes due to LCSSA!");
+          if (!IsFirstIteration && L.contains(UserI))
+            ToSimplify->insert(UserI);
+        }
 
-        VisitStack.push_back(WorklistItem(SuccBB, false));
+        assert(I.use_empty() && "Should always have replaced all uses!");
+        if (isInstructionTriviallyDead(&I, &TLI))
+          DeadInsts.push_back(&I);
+        ++NumSimplified;
+        Changed = true;
       }
     }
 
-    // Place the list of instructions to simplify on the next loop iteration
-    // into ToSimplify.
-    std::swap(ToSimplify, Next);
-    Next->clear();
+    // Delete any dead instructions found thus far now that we've finished an
+    // iteration over all instructions in all the loop blocks.
+    if (!DeadInsts.empty()) {
+      Changed = true;
+      RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI);
+    }
 
-    Changed |= LocalChanged;
-  } while (LocalChanged);
+    // If we never found a PHI that needs to be simplified in the next
+    // iteration, we're done.
+    if (Next->empty())
+      break;
+
+    // Otherwise, put the next set in place for the next iteration and reset it
+    // and the visited PHIs for that iteration.
+    std::swap(Next, ToSimplify);
+    Next->clear();
+    VisitedPHIs.clear();
+    DeadInsts.clear();
+  }
 
   return Changed;
 }
@@ -174,21 +173,20 @@
   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
     if (skipLoop(L))
       return false;
-    DominatorTreeWrapperPass *DTWP =
-        getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-    DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-    AssumptionCache *AC =
-        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    AssumptionCache &AC =
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
             *L->getHeader()->getParent());
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetLibraryInfo &TLI =
+        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
-    return SimplifyLoopInst(L, DT, LI, AC, TLI);
+    return simplifyLoopInst(*L, DT, LI, AC, TLI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.setPreservesCFG();
     getLoopAnalysisUsage(AU);
@@ -200,7 +198,7 @@
 PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
                                             LoopStandardAnalysisResults &AR,
                                             LPMUpdater &) {
-  if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+  if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI))
     return PreservedAnalyses::all();
 
   auto PA = getLoopPassPreservedAnalyses();
Index: llvm/trunk/lib/Transforms/Utils/Local.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/Local.cpp
+++ llvm/trunk/lib/Transforms/Utils/Local.cpp
@@ -434,18 +434,31 @@
 
   SmallVector<Instruction*, 16> DeadInsts;
   DeadInsts.push_back(I);
+  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
 
-  do {
-    I = DeadInsts.pop_back_val();
-    salvageDebugInfo(*I);
+  return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+    SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI) {
+  // Process the dead instruction list until empty.
+  while (!DeadInsts.empty()) {
+    Instruction &I = *DeadInsts.pop_back_val();
+    assert(I.use_empty() && "Instructions with uses are not dead.");
+    assert(isInstructionTriviallyDead(&I, TLI) &&
+           "Live instruction found in dead worklist!");
+
+    // Don't lose the debug info while deleting the instructions.
+    salvageDebugInfo(I);
 
     // Null out all of the instruction's operands to see if any operand becomes
     // dead as we go.
-    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-      Value *OpV = I->getOperand(i);
-      I->setOperand(i, nullptr);
+    for (Use &OpU : I.operands()) {
+      Value *OpV = OpU.get();
+      OpU.set(nullptr);
 
-      if (!OpV->use_empty()) continue;
+      if (!OpV->use_empty())
+        continue;
 
       // If the operand is an instruction that became dead as we nulled out the
       // operand, and if it is 'trivially' dead, delete it in a future loop
@@ -455,10 +468,8 @@
           DeadInsts.push_back(OpI);
     }
 
-    I->eraseFromParent();
-  } while (!DeadInsts.empty());
-
-  return true;
+    I.eraseFromParent();
+  }
 }
 
 /// areAllUsesEqual - Check whether the uses of a value are all the same.
Index: llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll
+++ llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -passes=loop-instsimplify | FileCheck %s
+
+; Test very basic folding and propagation occurs within a loop body. This should
+; collapse to the loop iteration structure and the LCSSA PHI node.
+define i32 @test1(i32 %n, i32 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.add = add nsw i32 %x, 0
+  %x.sub = sub i32 %x.add, 0
+  %x.and = and i32 %x.sub, -1
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.and, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test basic loop structure that still has a simplification feed a prior PHI.
+define i32 @test2(i32 %n, i32 %x) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+  %x.next = add nsw i32 %x.loop, 0
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test a diamond CFG with inner PHI nodes.
+define i32 @test3(i32 %n, i32 %x) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[X_CMP:%.*]] = icmp slt i32 [[I]], 42
+; CHECK-NEXT:    br i1 [[X_CMP]], label [[LOOP_LHS:%.*]], label [[LOOP_RHS:%.*]]
+; CHECK:       loop.lhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.rhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.phi, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  %x.cmp = icmp slt i32 %i, 42
+  br i1 %x.cmp, label %loop.lhs, label %loop.rhs
+
+loop.lhs:
+  %x.l.add = add nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.rhs:
+  %x.r.sub = sub nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.latch:
+  %x.phi = phi i32 [ %x.l.add, %loop.lhs ], [ %x.r.sub, %loop.rhs ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}
+
+; Test an inner loop that is only simplified when processing the outer loop, and
+; an outer loop only simplified when processing the inner loop.
+define i32 @test4(i32 %n, i32 %m, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_INNER:%.*]]
+; CHECK:       loop.inner:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER]] ]
+; CHECK-NEXT:    [[J_NEXT]] = add nsw i32 [[J]], 1
+; CHECK-NEXT:    [[J_CMP:%.*]] = icmp slt i32 [[J_NEXT]], [[M:%.*]]
+; CHECK-NEXT:    br i1 [[J_CMP]], label [[LOOP_INNER]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.inner.lcssa, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  br label %loop.inner
+
+loop.inner:
+  %j = phi i32 [ 0, %loop ], [ %j.next, %loop.inner ]
+  %x.inner.loop = phi i32 [ %x.add, %loop ], [ %x.inner.add, %loop.inner ]
+  %x.inner.add = add nsw i32 %x.inner.loop, 0
+  %j.next = add nsw i32 %j, 1
+  %j.cmp = icmp slt i32 %j.next, %m
+  br i1 %j.cmp, label %loop.inner, label %loop.latch
+
+loop.latch:
+  %x.inner.lcssa = phi i32 [ %x.inner.loop, %loop.inner ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}