Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -357,6 +357,11 @@
   // post-unrolling.
   DenseMap<Value *, Constant *> SimplifiedValues;
 
+  // Likewise, keep track of every instruction that becomes dead. Only
+  // membership matters for these (there is no replacement value to record),
+  // so a set is used here instead of a map.
+  SmallPtrSet<Instruction *, 16> DeadInstructions;
+
   // To avoid requesting SCEV info on every iteration, request it once, and
   // for each value that would become ConstAddress+Constant after loop
   // unrolling, save the corresponding data.
@@ -525,6 +530,7 @@
     // we literally have to go through all loop's iterations.
     for (Iteration = 0; Iteration < TripCount; ++Iteration) {
       SimplifiedValues.clear();
+      DeadInstructions.clear();
       BBWorklist.clear();
       BBWorklist.insert(L->getHeader());
       // Note that we *must not* cache the size, this loop grows the worklist.
@@ -557,6 +563,25 @@
         UnrolledLoopSize = UINT_MAX;
         return;
       }
+
+      // Scan visited blocks, header included, bottom-up for dead instructions.
+      for (unsigned Idx = BBWorklist.size(); Idx-- != 0;) {
+        BasicBlock *BB = BBWorklist[Idx];
+        for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
+          // Skip already-classified instructions. Also skip user-less ones
+          // (all_of is vacuously true for them) and any with side effects.
+          if (SimplifiedValues.count(&*I) || DeadInstructions.count(&*I) ||
+              I->use_empty() || I->mayHaveSideEffects())
+            continue;
+          if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
+                auto *UI = cast<Instruction>(U);
+                return SimplifiedValues.count(UI) || DeadInstructions.count(UI);
+              })) {
+            NumberOfOptimizedInstructions += TTI.getUserCost(&*I);
+            DeadInstructions.insert(&*I);
+          }
+        }
+      }
     }
 
     // If we can overflow computing percentage of optimized instructions, just