Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,11 @@
   "enable-lsr-phielim", cl::Hidden, cl::init(true),
   cl::desc("Enable LSR phi elimination"));
 
+// This flag adds the instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+  "lsr-insns-cost", cl::Hidden, cl::init(false),
+  cl::desc("Enable LSR phi elimination"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -323,6 +328,8 @@
 
   bool unscale();
 
+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;
 
@@ -457,6 +464,16 @@
   return true;
 }
 
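+/// Check whether this formula is a single base register with no immediate
+/// offsets and no scale (for an ICmpZero use, such an IV ends at zero).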
+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -882,6 +899,7 @@
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
+  unsigned Insns;
   unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
@@ -892,8 +910,8 @@
 
 public:
   Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+      ImmCost(0), SetupCost(0), ScaleCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -902,9 +920,9 @@
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+    return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+      || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
            & ImmCost & SetupCost & ScaleCost) == ~0u);
   }
 #endif
@@ -1151,6 +1169,9 @@
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned PrevAddRecCost = AddRecCost;
+  unsigned PrevNumRegs = NumRegs;
+  unsigned PrevNumBaseAdds = NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1170,6 +1191,18 @@
       return;
   }
 
+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction (at least a fill).
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (NumRegs > TTIRegNum) {
+    // If the cost already exceeded TTIRegNum, then only the newly added
+    // registers can add new instructions.
+    if (PrevNumRegs > TTIRegNum)
+      Insns += (NumRegs - PrevNumRegs);
+    else
+      Insns += (NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
@@ -1178,10 +1211,8 @@
     NumBaseAdds +=
         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
   NumBaseAdds += (F.UnfoldedOffset != 0);
-
   // Accumulate non-free scaling amounts.
   ScaleCost += getScalingFactorCost(TTI, LU, F);
-
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
     int64_t O = Fixup.Offset;
@@ -1198,11 +1229,24 @@
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
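+  // If an ICmpZero formula does not end at zero, the compare cannot be
+  // folded into a plain add or sub; the final value of the AddRec must be
+  // compared explicitly, which takes one extra instruction.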
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    Insns++;
+  // Each new AddRec adds 1 instruction to the calculation.
+  Insns += (AddRecCost - PrevAddRecCost);
+
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    Insns += NumBaseAdds - PrevNumBaseAdds;
   assert(isValid() && "invalid cost");
 }
 
 /// Set this cost to a losing value.
 void Cost::Lose() {
+  Insns = ~0u;
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1214,6 +1258,10 @@
 
 /// Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
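+  // When the instruction-count heuristic is enabled, prefer the solution
+  // with fewer estimated instructions before any register-based criteria.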
+  if (InsnsCost && Insns != Other.Insns)
+    return Insns < Other.Insns;
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1222,6 +1270,7 @@
 }
 
 void Cost::print(raw_ostream &OS) const {
+  OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
Index: test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -stats 2>&1 | grep "asm-printer" | grep 7
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that LSR optimizes the compare for a static counter.
+; That means that instead of generating the following:
+;   movl %ecx, (%rdx,%rax,4)
+;   incq %rax
+;   cmpq $1024, %rax
+; LSR should optimize out the cmp:
+;   movl %ecx, 4096(%rdx,%rax)
+;   addq $4, %rax
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -stats 2>&1 | grep "asm-printer" | grep 11
+
+; Check that LSR prefers fewer instructions over fewer registers.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}