Index: include/llvm/Analysis/LoopUnrollAnalyzer.h
===================================================================
--- include/llvm/Analysis/LoopUnrollAnalyzer.h
+++ include/llvm/Analysis/LoopUnrollAnalyzer.h
@@ -89,6 +89,7 @@
   bool visitLoad(LoadInst &I);
   bool visitCastInst(CastInst &I);
   bool visitCmpInst(CmpInst &I);
+  bool visitPHINode(PHINode &PN);
 };
 }
 #endif
Index: lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- lib/Analysis/LoopUnrollAnalyzer.cpp
+++ lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -189,3 +189,11 @@
 
   return Base::visitCmpInst(I);
 }
+
+bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) {
+  // The loop induction PHI nodes are definitionally free.
+  if (PN.getParent() == L->getHeader())
+    return true;
+
+  return Base::visitPHINode(PN);
+}
Index: lib/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll
===================================================================
--- /dev/null
+++ lib/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=80 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16
+
+; If a load becomes a constant after loop unrolling, we sometimes can simplify
+; CFG. This test verifies that we handle such cases.
+; After one operand in an instruction is constant-folded and the
+; instruction is simplified, the other operand might become dead.
+; In this test we have::
+; for i in 1..10:
+;   r += A[i] * B[i]
+; A[i] is 0 almost at every iteration, so there is no need in loading B[i] at
+; all.
+
+
+; CHECK-LABEL: @unroll_dce
+; CHECK-NOT:   br i1 %exitcond, label %for.end, label %for.body
+define i32 @unroll_dce(i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ]
+  %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ]
+  %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0
+  %x1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0
+  %x2 = load i32, i32* %arrayidx2, align 4
+  %mul = mul i32 %x1, %x2
+  %r.1 = add i32 %mul, %r.0
+  %iv.1 = add nuw nsw i64 %iv.0, 1
+  %exitcond = icmp eq i64 %iv.1, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %r.1
+}
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -171,6 +171,40 @@
 }
 
 namespace {
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
+  Instruction *I;
+  int Iteration : 30;
+  unsigned IsFree : 1;
+  unsigned IsCounted : 1;
+};
+
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+  typedef DenseMapInfo<Instruction *> PtrInfo;
+  typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
+  static inline UnrolledInstState getEmptyKey() {
+    return {PtrInfo::getEmptyKey(), 0, 0, 0};
+  }
+  static inline UnrolledInstState getTombstoneKey() {
+    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
+  }
+  static inline unsigned getHashValue(const UnrolledInstState &S) {
+    return PairInfo::getHashValue({S.I, S.Iteration});
+  }
+  static inline bool isEqual(const UnrolledInstState &LHS,
+                             const UnrolledInstState &RHS) {
+    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
+  }
+};
+}
+
+namespace {
 struct EstimatedUnrollCost {
   /// \brief The estimated cost after unrolling.
   int UnrolledCost;
@@ -204,18 +238,25 @@
   assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
          "The unroll iterations max is too large!");
 
+  // Only analyze inner loops. We can't properly estimate cost of nested loops
+  // and we won't visit inner loops again anyway.
+  if (!L->empty())
+    return None;
+
   // Don't simulate loops with a big or unknown tripcount
   if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
       TripCount > UnrollMaxIterationsCountToAnalyze)
     return None;
 
   SmallSetVector<BasicBlock *, 16> BBWorklist;
+  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
   DenseMap<Value *, Constant *> SimplifiedValues;
   SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
 
   // The estimated cost of the unrolled form of the loop. We try to estimate
   // this by simplifying as much as we can while computing the estimate.
   int UnrolledCost = 0;
+
   // We also track the estimated dynamic (that is, actually executed) cost in
   // the rolled form. This helps identify cases when the savings from unrolling
   // aren't just exposing dead control flows, but actual reduced dynamic
@@ -223,6 +264,95 @@
   // unrolling.
   int RolledDynamicCost = 0;
 
+  // We track the simplification of each instruction in each iteration. We use
+  // this to recursively merge costs into the unrolled cost on-demand so that
+  // we don't count the cost of any dead code. This is essentially a map from
+  // <instrution, int> to <bool, bool>, but stored as a densely packed struct.
+  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+  // A small worklist used to accumulate cost of instructions from each
+  // observable and reached root in the loop.
+  SmallVector<Instruction *, 16> CostWorklist;
+
+  // PHI-used worklist used between iterations while accumulating cost.
+  SmallVector<Instruction *, 4> PHIUsedList;
+
+  // Helper function to accumulate cost for instructions in the loop.
+  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+    assert(Iteration >= 0 && "Cannot have a negative iteration!");
+    assert(CostWorklist.empty() && "Must start with an empty cost list");
+    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+    CostWorklist.push_back(&RootI);
+    for (;; --Iteration) {
+      do {
+        Instruction *I = CostWorklist.pop_back_val();
+
+        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+        if (CostIter == InstCostMap.end())
+          // If an input to a PHI node comes from a dead path through the loop
+          // we may have no cost data for it here. What that actually means is
+          // that it is free.
+          continue;
+        auto &Cost = *CostIter;
+        if (Cost.IsCounted)
+          // Already counted this instruction.
+          continue;
+
+        // Mark that we are counting the cost of this instruction now.
+        Cost.IsCounted = true;
+
+        // If this is a PHI node in the loop header, just add it to the PHI set.
+        if (auto *PhiI = dyn_cast<PHINode>(I))
+          if (PhiI->getParent() == L->getHeader()) {
+            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+                                  "inherently simplify during unrolling.");
+            if (Iteration == 0)
+              continue;
+
+            // Push the incoming value from the backedge into the PHI used list
+            // if it is an in-loop instruction. We'll use this to populate the
+            // cost worklist for the next iteration (as we count backwards).
+            if (auto *OpI = dyn_cast<Instruction>(
+                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+              if (L->contains(OpI))
+                PHIUsedList.push_back(OpI);
+            continue;
+          }
+
+        // First accumulate the cost of this instruction.
+        if (!Cost.IsFree) {
+          UnrolledCost += TTI.getUserCost(I);
+          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
+                       << "): ");
+          DEBUG(I->dump());
+        }
+
+        // We must count the cost of every operand which is not free,
+        // recursively. If we reach a loop PHI node, simply add it to the set
+        // to be considered on the next iteration (backwards!).
+        for (Value *Op : I->operands()) {
+          // Check whether this operand is free due to being a constant or
+          // outside the loop.
+          auto *OpI = dyn_cast<Instruction>(Op);
+          if (!OpI || !L->contains(OpI))
+            continue;
+
+          // Otherwise accumulate its cost.
+          CostWorklist.push_back(OpI);
+        }
+      } while (!CostWorklist.empty());
+
+      if (PHIUsedList.empty())
+        // We've exhausted the search.
+        break;
+
+      assert(Iteration > 0 &&
+             "Cannot track PHI-used values past the first iteration!");
+      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+      PHIUsedList.clear();
+    }
+  };
+
   // Ensure that we don't violate the loop structure invariants relied on by
   // this analysis.
   assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
@@ -277,22 +407,31 @@
       // it.  We don't change the actual IR, just count optimization
       // opportunities.
       for (Instruction &I : *BB) {
-        int InstCost = TTI.getUserCost(&I);
+        // Track this instruction's expected baseline cost when executing the
+        // rolled loop form.
+        RolledDynamicCost += TTI.getUserCost(&I);
 
         // Visit the instruction to analyze its loop cost after unrolling,
-        // and if the visitor returns false, include this instruction in the
-        // unrolled cost.
-        if (!Analyzer.visit(I))
-          UnrolledCost += InstCost;
-        else {
-          DEBUG(dbgs() << "  " << I
-                       << " would be simplified if loop is unrolled.\n");
-          (void)0;
-        }
+        // and if the visitor returns true, mark the instruction as free after
+        // unrolling and continue.
+        bool IsFree = Analyzer.visit(I);
+        bool Inserted = InstCostMap.insert({&I, (int)Iteration, IsFree,
+                                            /*IsCounted*/ false})
+                            .second;
+        (void)Inserted;
+        assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+        if (IsFree)
+          continue;
 
-        // Also track this instructions expected cost when executing the rolled
-        // loop form.
-        RolledDynamicCost += InstCost;
+        // If the instruction might have a side-effect recursively account for
+        // the cost of it and all the instructions leading up to it.
+        if (I.mayHaveSideEffects())
+          AddCostRecursively(I, Iteration);
+
+        // Can't properly modeal a cost of a call.
+        if(isa<CallInst>(&I))
+          return None;
 
         // If unrolled body turns out to be too big, bail out.
         if (UnrolledCost > MaxUnrolledLoopSize) {
@@ -321,6 +460,8 @@
                   cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
             if (L->contains(Succ))
               BBWorklist.insert(Succ);
+            else
+              ExitWorklist.insert({BB, Succ});
             continue;
           }
         }
@@ -336,6 +477,8 @@
                        .getCaseSuccessor();
           if (L->contains(Succ))
             BBWorklist.insert(Succ);
+          else
+            ExitWorklist.insert({BB, Succ});
           continue;
         }
       }
@@ -344,6 +487,8 @@
       for (BasicBlock *Succ : successors(BB))
         if (L->contains(Succ))
           BBWorklist.insert(Succ);
+        else
+          ExitWorklist.insert({BB, Succ});
     }
 
     // If we found no optimization opportunities on the first iteration, we
@@ -354,6 +499,23 @@
       return None;
     }
   }
+
+  while (!ExitWorklist.empty()) {
+    BasicBlock *ExitingBB, *ExitBB;
+    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+    for (Instruction &I : *ExitBB) {
+      auto *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+
+      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+      if (auto *OpI = dyn_cast<Instruction>(Op))
+        if (L->contains(OpI))
+          AddCostRecursively(*OpI, TripCount - 1);
+    }
+  }
+
   DEBUG(dbgs() << "Analysis finished:\n"
                << "UnrolledCost: " << UnrolledCost << ", "
                << "RolledDynamicCost: " << RolledDynamicCost << "\n");
Index: test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll
===================================================================
--- test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll
+++ test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10  -unroll-percent-dynamic-cost-saved-threshold=50 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10  -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
Index: unittests/Analysis/UnrollAnalyzer.cpp
===================================================================
--- unittests/Analysis/UnrollAnalyzer.cpp
+++ unittests/Analysis/UnrollAnalyzer.cpp
@@ -133,6 +133,7 @@
       "  br label %outer.loop\n"
       "outer.loop:\n"
       "  %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %outer.loop.latch ]\n"
+      "  %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
       "  br label %inner.loop\n"
       "inner.loop:\n"
       "  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]\n"
@@ -140,7 +141,6 @@
       "  %exitcond.inner = icmp eq i64 %iv.inner.next, 1000\n"
       "  br i1 %exitcond.inner, label %outer.loop.latch, label %inner.loop\n"
       "outer.loop.latch:\n"
-      "  %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
       "  %exitcond.outer = icmp eq i64 %iv.outer.next, 40\n"
       "  br i1 %exitcond.outer, label %exit, label %outer.loop\n"
       "exit:\n"
@@ -161,11 +161,15 @@
   BasicBlock *InnerBody = &*FI++;
 
   BasicBlock::iterator BBI = Header->begin();
-  Instruction *Y1 = &*BBI++;
+  BBI++;
+  Instruction *Y1 = &*BBI;
   BBI = InnerBody->begin();
-  Instruction *Y2 = &*BBI++;
+  BBI++;
+  Instruction *Y2 = &*BBI;
   // Check that we can simplify IV of the outer loop, but can't simplify the IV
   // of the inner loop if we only know the iteration number of the outer loop.
+  //
+  //  Y1 is %iv.outer.next, Y2 is %iv.inner.next
   auto I1 = SimplifiedValuesVector[0].find(Y1);
   EXPECT_TRUE(I1 != SimplifiedValuesVector[0].end());
   auto I2 = SimplifiedValuesVector[0].find(Y2);