Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -140,6 +140,13 @@
     cl::desc("Narrow LSR complex solution using"
              " expectation of registers number"));
 
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+    "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+    cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+             " with the same ScaledReg and Scale"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1902,6 +1909,7 @@
   void NarrowSearchSpaceByDetectingSupersets();
   void NarrowSearchSpaceByCollapsingUnrolledCode();
   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
   void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
@@ -4306,6 +4314,104 @@
   }
 }
 
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic is to keep as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that it is more likely to find a better solution
+/// from a formulae set with more Scale and ScaledReg variations than
+/// from a formulae set with the same Scale and ScaledReg. The
+/// picking-winner-regs heuristic will often keep the formulae with the
+/// same Scale and ScaledReg and filter out the others, and we want to
+/// avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n"
+                  "Narrowing the search space by choosing the best Formula "
+                  "from the Formulae with the same Scale and ScaledReg.\n");
+
+  // Map the "Scale * ScaledReg" pair to the best formula of the current
+  // LSRUse.
+  typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+  BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+  bool ChangedFormulae = false;
+#endif
+  DenseSet<const SCEV *> VisitedRegs;
+  SmallPtrSet<const SCEV *, 16> Regs;
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+    // Return true if Formula FA is better than Formula FB.
+    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+      // First we will try to choose the Formula with fewer new registers.
+      // For a register used by the current Formula, the more the register
+      // is shared among LSRUses, the less we increase the register number
+      // counter of the formula.
+      size_t FARegNum = 0;
+      for (const SCEV *Reg : FA.BaseRegs) {
+        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+        FARegNum += (NumUses - UsedByIndices.count() + 1);
+      }
+      size_t FBRegNum = 0;
+      for (const SCEV *Reg : FB.BaseRegs) {
+        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+        FBRegNum += (NumUses - UsedByIndices.count() + 1);
+      }
+      if (FARegNum != FBRegNum)
+        return FARegNum < FBRegNum;
+
+      // If the new register numbers are the same, choose the Formula with
+      // the lower Cost.
+      Cost CostFA, CostFB;
+      Regs.clear();
+      CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+      Regs.clear();
+      CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+      return CostFA.isLess(CostFB, TTI);
+    };
+
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+         ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+      if (!F.ScaledReg)
+        continue;
+      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+      if (P.second)
+        continue;
+
+      Formula &Best = LU.Formulae[P.first->second];
+      if (IsBetterThan(F, Best))
+        std::swap(F, Best);
+      DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
+            dbgs() << "\n"
+                      "    in favor of formula ";
+            Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+      ChangedFormulae = true;
+#endif
+      LU.DeleteFormula(F);
+      --FIdx;
+      --NumForms;
+      Any = true;
+    }
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+
+    // Reset this to prepare for the next use.
+    BestFormulae.clear();
+  }
+
+  DEBUG(if (ChangedFormulae) {
+    dbgs() << "\n"
+              "After filtering out undesirable candidates:\n";
+    print_uses(dbgs());
+  });
+}
+
 /// The function delete formulas with high registers number expectation.
 /// Assuming we don't know the value of each formula (already delete
 /// all inefficient), generate probability of not selecting for each
@@ -4516,6 +4622,8 @@
   NarrowSearchSpaceByDetectingSupersets();
   NarrowSearchSpaceByCollapsingUnrolledCode();
   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  if (FilterSameScaledReg)
+    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
   if (LSRExpNarrow)
     NarrowSearchSpaceByDeletingCostlyFormulas();
   else
Index: test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
===================================================================
--- test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
 ; Test case for the recoloring of broken hints.
 ; This is tricky to have something reasonably small to kick this optimization since
 ; it requires that spliting and spilling occur.
Index: test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -14,8 +14,8 @@
 ; current LSR cost model.
 ; CHECK-NOT: = ptrtoint i8* undef to i64
 ; CHECK: .lr.ph
-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
-; CHECK: sub i64 [[TMP]], %tmp6
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1
+; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}}
 ; CHECK: ret void
 define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
 bb:
Index: test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
@@ -0,0 +1,163 @@
+; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%class.A = type { i8, i8, [5 x i32], i64, i64, i64 }
+%class.ZippyScatteredWriter = type { i8, i8*, i8* }
+@e = local_unnamed_addr global %class.A { i8 0, i8 0, [5 x i32] zeroinitializer, i64 1, i64 0, i64 1 }, align 8
+@f = local_unnamed_addr global %class.ZippyScatteredWriter* null, align 8
+declare void @_ZN20ZippyScatteredWriter5m_fn2Ev(%class.ZippyScatteredWriter*) local_unnamed_addr
+
+define void @foo() local_unnamed_addr {
+entry:
+  %tmp = load %class.ZippyScatteredWriter*, %class.ZippyScatteredWriter** @f, align 8
+  %op_ptr_.i.i = getelementptr inbounds %class.ZippyScatteredWriter, %class.ZippyScatteredWriter* %tmp, i64 0, i32 1
+  %sub.ptr.rhs.cast.i.i = ptrtoint %class.ZippyScatteredWriter* %tmp to i64
+  %op_limit_.i.i = getelementptr inbounds %class.ZippyScatteredWriter, %class.ZippyScatteredWriter* %tmp, i64 0, i32 2
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i, %entry
+  %tmp1 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 3), align 8
+  %and.i = and i64 %tmp1, 1792
+  %tmp2 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 4), align 8
+  %add.i = add i64 %and.i, %tmp2
+  %tmp3 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 5), align 8
+  %tmp4 = load i8*, i8** %op_ptr_.i.i, align 8
+  %add.ptr.i.i = getelementptr inbounds i8, i8* %tmp4, i64 %tmp3
+  %sub.i.i = add i64 %add.i, -1
+  %sub.ptr.lhs.cast.i.i = ptrtoint i8* %tmp4 to i64
+  %sub.ptr.sub.i.i = sub i64 %sub.ptr.lhs.cast.i.i, %sub.ptr.rhs.cast.i.i
+  %cmp.i.i = icmp ult i64 %sub.i.i, %sub.ptr.sub.i.i
+  br i1 %cmp.i.i, label %land.rhs.i.i, label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i
+
+land.rhs.i.i:                                     ; preds = %for.cond.i
+  %tmp5 = load i8*, i8** %op_limit_.i.i, align 8
+  %cmp3.i.i = icmp ugt i8* %add.ptr.i.i, %tmp5
+  br i1 %cmp3.i.i, label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i, label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %land.rhs.i.i
+  %idx.neg.i.i = sub i64 0, %add.i
+  %add.ptr5.i.i = getelementptr inbounds i8, i8* %tmp4, i64 %idx.neg.i.i
+  %sub.ptr.rhs.cast.i.i.i = ptrtoint i8* %add.ptr5.i.i to i64
+  %tmp6 = sub i64 0, %sub.ptr.rhs.cast.i.i.i
+  %add.ptr.i.i.i = inttoptr i64 %tmp6 to i8*
+  %add.ptr1.i.i.i = getelementptr inbounds i8, i8* %tmp5, i64 -6
+  br label %while.cond.i.i.i
+
+; Without filtering out non-optimal formulae with the same ScaledReg and Scale, the
+; strategy of narrowing the LSR search space by picking winner regs will generate one
+; extra lsr.iv and a non-optimal result.
+; CHECK-LABEL: @foo(
+; CHECK: while.cond.i.i.i:
+; CHECK-NOT: %lsr.iv = phi i64
+; CHECK-NEXT: %p1.addr.0.i.i.i = phi i8*
+; CHECK-NEXT: %b.0.i.i.i = phi i8*
+
+while.cond.i.i.i:                                 ; preds = %while.body.i.i.i, %if.then.i.i
+  %p1.addr.0.i.i.i = phi i8* [ %add.ptr5.i.i, %if.then.i.i ], [ %add.ptr4.i.i.i, %while.body.i.i.i ]
+  %b.0.i.i.i = phi i8* [ %add.ptr.i.i.i, %if.then.i.i ], [ %add.ptr5.i.i.i, %while.body.i.i.i ]
+  %cmp.i.i.i = icmp ugt i8* %b.0.i.i.i, %add.ptr1.i.i.i
+  br i1 %cmp.i.i.i, label %while.end.i.i.i, label %while.body.i.i.i
+
+while.body.i.i.i:                                 ; preds = %while.cond.i.i.i
+  %a.sroa.0.0..sroa_cast2.i.i.i.i = bitcast i8* %p1.addr.0.i.i.i to i64*
+  %a.sroa.0.0.copyload.i.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i.i.i.i = bitcast i8* %b.0.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i.i.i.i, align 1
+  %add.ptr2.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 8
+  %add.ptr3.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 8
+  %a.sroa.0.0..sroa_cast2.i39.i.i.i = bitcast i8* %add.ptr2.i.i.i to i64*
+  %a.sroa.0.0.copyload.i40.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i39.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i41.i.i.i = bitcast i8* %add.ptr3.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i40.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i41.i.i.i, align 1
+  %add.ptr4.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 16
+  %add.ptr5.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 16
+  %cmp6.i.i.i = icmp ult i8* %add.ptr5.i.i.i, %add.ptr.i.i
+  br i1 %cmp6.i.i.i, label %while.cond.i.i.i, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30
+
+while.end.i.i.i:                                  ; preds = %while.cond.i.i.i
+  %add.ptr8.i.i.i = getelementptr inbounds i8, i8* %tmp5, i64 -8
+  %cmp9.i.i.i = icmp ugt i8* %b.0.i.i.i, %add.ptr8.i.i.i
+  br i1 %cmp9.i.i.i, label %if.end16.i.i.i, label %if.then13.i.i.i
+
+if.then13.i.i.i:                                  ; preds = %while.end.i.i.i
+  %a.sroa.0.0..sroa_cast2.i42.i.i.i = bitcast i8* %p1.addr.0.i.i.i to i64*
+  %a.sroa.0.0.copyload.i43.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i42.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i44.i.i.i = bitcast i8* %b.0.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i43.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i44.i.i.i, align 1
+  %add.ptr14.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 8
+  %add.ptr15.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 8
+  br label %if.end16.i.i.i
+
+if.end16.i.i.i:                                   ; preds = %if.then13.i.i.i, %while.end.i.i.i
+  %p1.addr.1.i.i.i = phi i8* [ %add.ptr14.i.i.i, %if.then13.i.i.i ], [ %p1.addr.0.i.i.i, %while.end.i.i.i ]
+  %b.1.i.i.i = phi i8* [ %add.ptr15.i.i.i, %if.then13.i.i.i ], [ %b.0.i.i.i, %while.end.i.i.i ]
+  %b.1.i.i.i13 = ptrtoint i8* %b.1.i.i.i to i64
+  %cmp4.i.i.i.i = icmp ult i8* %b.1.i.i.i, %add.ptr.i.i
+  br i1 %cmp4.i.i.i.i, label %while.body.i.i.i.i.preheader, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+while.body.i.i.i.i.preheader:                     ; preds = %if.end16.i.i.i
+  %tmp7 = sub i64 %tmp3, %b.1.i.i.i13
+  %scevgep = getelementptr i8, i8* %tmp4, i64 %tmp7
+  %exitcount.ptrcnt.to.int = ptrtoint i8* %scevgep to i64
+  %min.iters.check = icmp ult i8* %scevgep, inttoptr (i64 32 to i8*)
+  br i1 %min.iters.check, label %while.body.i.i.i.i.preheader29, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %while.body.i.i.i.i.preheader
+  %n.vec = and i64 %exitcount.ptrcnt.to.int, -32
+  %cmp.zero = icmp eq i64 %n.vec, 0
+  br i1 %cmp.zero, label %while.body.i.i.i.i.preheader29, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %tmp8 = sub i64 %tmp3, %b.1.i.i.i13
+  %scevgep14 = getelementptr i8, i8* %tmp4, i64 %tmp8
+  %scevgep1415 = ptrtoint i8* %scevgep14 to i64
+  %scevgep16 = getelementptr i8, i8* %p1.addr.1.i.i.i, i64 %scevgep1415
+  %bound0 = icmp ult i8* %b.1.i.i.i, %scevgep16
+  %bound1 = icmp ult i8* %p1.addr.1.i.i.i, %add.ptr.i.i
+  %memcheck.conflict = and i1 %bound0, %bound1
+  %ind.end = getelementptr i8, i8* %p1.addr.1.i.i.i, i64 %n.vec
+  %ind.end18 = getelementptr i8, i8* %b.1.i.i.i, i64 %n.vec
+  br i1 %memcheck.conflict, label %while.body.i.i.i.i.preheader29, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %tmp9 = add i64 %n.vec, -32
+  %tmp10 = lshr exact i64 %tmp9, 5
+  %tmp11 = add nuw nsw i64 %tmp10, 1
+  %xtraiter = and i64 %tmp11, 3
+  %tmp12 = icmp ult i64 %tmp9, 96
+  br i1 %tmp12, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, label %vector.body.preheader.new
+
+vector.body.preheader.new:                        ; preds = %vector.body.preheader
+  %cmp.n = icmp eq i64 %n.vec, %exitcount.ptrcnt.to.int
+  br i1 %cmp.n, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i, label %while.body.i.i.i.i.preheader29
+
+while.body.i.i.i.i.preheader29:                   ; preds = %vector.body.preheader.new, %vector.memcheck, %min.iters.checked, %while.body.i.i.i.i.preheader
+  %p1.addr.06.i.i.i.i.ph = phi i8* [ %ind.end, %vector.body.preheader.new ], [ %p1.addr.1.i.i.i, %while.body.i.i.i.i.preheader ], [ %p1.addr.1.i.i.i, %min.iters.checked ], [ %p1.addr.1.i.i.i, %vector.memcheck ]
+  %p2.addr.05.i.i.i.i.ph = phi i8* [ %ind.end18, %vector.body.preheader.new ], [ %b.1.i.i.i, %while.body.i.i.i.i.preheader ], [ %b.1.i.i.i, %min.iters.checked ], [ %b.1.i.i.i, %vector.memcheck ]
+  br label %while.body.i.i.i.i
+
+while.body.i.i.i.i:                               ; preds = %while.body.i.i.i.i, %while.body.i.i.i.i.preheader29
+  %p1.addr.06.i.i.i.i = phi i8* [ %incdec.ptr.i.i.i.i, %while.body.i.i.i.i ], [ %p1.addr.06.i.i.i.i.ph, %while.body.i.i.i.i.preheader29 ]
+  %p2.addr.05.i.i.i.i = phi i8* [ %incdec.ptr1.i.i.i.i, %while.body.i.i.i.i ], [ %p2.addr.05.i.i.i.i.ph, %while.body.i.i.i.i.preheader29 ]
+  %incdec.ptr.i.i.i.i = getelementptr inbounds i8, i8* %p1.addr.06.i.i.i.i, i64 1
+  %t43 = load i8, i8* %p1.addr.06.i.i.i.i, align 1
+  %incdec.ptr1.i.i.i.i = getelementptr inbounds i8, i8* %p2.addr.05.i.i.i.i, i64 1
+  store i8 %t43, i8* %p2.addr.05.i.i.i.i, align 1
+  %exitcond.i.i.i.i = icmp eq i8* %incdec.ptr1.i.i.i.i, %add.ptr.i.i
+  br i1 %exitcond.i.i.i.i, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, label %while.body.i.i.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit:    ; preds = %while.body.i.i.i.i, %vector.body.preheader
+  br label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30:  ; preds = %while.body.i.i.i
+  br label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i:             ; preds = %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30, %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, %vector.body.preheader.new, %if.end16.i.i.i
+  store i8* %add.ptr.i.i, i8** %op_ptr_.i.i, align 8
+  br label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i
+
+_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i:        ; preds = %_Z15IncrementalCopyPKcPcS1_.exit.i.i, %land.rhs.i.i, %for.cond.i
+  tail call void @_ZN20ZippyScatteredWriter5m_fn2Ev(%class.ZippyScatteredWriter* nonnull %tmp)
+  br label %for.cond.i
+}
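
For readers outside the LLVM tree, here is a minimal standalone C++ sketch of the
keep-the-best-formula-per-(ScaledReg, Scale) idiom that the new
NarrowSearchSpaceByFilterFormulaWithSameScaledReg implements. It is an illustration,
not the patch's code: the Formula struct, its plain-int ScaledReg, and the single
scalar Cost field are simplified stand-ins for the real LSR types (SCEV pointers,
and the two-step register-count-then-RateFormula comparison in IsBetterThan). The
deletion step mirrors LU.DeleteFormula's swap-to-back-and-pop behavior.

// Standalone sketch of the per-(ScaledReg, Scale) filtering loop above.
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct Formula {
  int ScaledReg;  // stand-in for `const SCEV *ScaledReg`
  int64_t Scale;
  unsigned Cost;  // stand-in for the RateFormula/isLess comparison
};

// Keep only the cheapest formula for each (ScaledReg, Scale) key,
// mirroring the insert-or-compare-and-swap loop in the patch.
static void filterSameScaledReg(std::vector<Formula> &Formulae) {
  std::map<std::pair<int, int64_t>, size_t> Best;
  for (size_t FIdx = 0, NumForms = Formulae.size(); FIdx != NumForms;
       ++FIdx) {
    Formula &F = Formulae[FIdx];
    auto P = Best.insert({{F.ScaledReg, F.Scale}, FIdx});
    if (P.second)
      continue;  // first formula seen for this key; keep it for now
    Formula &BestF = Formulae[P.first->second];
    if (F.Cost < BestF.Cost)
      std::swap(F, BestF);  // F is better: move it into the surviving slot
    // Delete the loser the way LU.DeleteFormula does: swap it to the back,
    // pop it, and revisit the element that now sits at FIdx.
    std::swap(F, Formulae.back());
    Formulae.pop_back();
    --FIdx;
    --NumForms;
  }
}

int main() {
  std::vector<Formula> Formulae = {
      {1, 4, 10}, {1, 4, 7}, {2, 8, 3}, {1, 4, 9}};
  filterSameScaledReg(Formulae);
  // Expect one survivor per key: {1, 4, 7} and {2, 8, 3}.
  for (const Formula &F : Formulae)
    std::printf("ScaledReg %d Scale %lld Cost %u\n", F.ScaledReg,
                (long long)F.Scale, F.Cost);
  return 0;
}

To observe the heuristic's effect on the new test case, run its RUN line once with
-lsr-filter-same-scaled-reg=true and once with =false: per the comment in the test,
with filtering disabled the picking-winner-regs fallback keeps same-Scale/ScaledReg
formulae, and the extra %lsr.iv phi that the CHECK-NOT line guards against appears
in while.cond.i.i.i.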