Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1022,6 +1022,9 @@
   int64_t MinOffset;
   int64_t MaxOffset;
 
+  /// Number of uses that share the same base address.
+  unsigned SameBaseAddress;
+
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop;
@@ -1049,7 +1052,7 @@
   LSRUse(KindType K, MemAccessTy AT)
       : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
-        AllFixupsOutsideLoop(true), RigidFormula(false),
+        SameBaseAddress(0), AllFixupsOutsideLoop(true), RigidFormula(false),
         WidestFixupType(nullptr) {}
 
   LSRFixup &getNewFixup() {
@@ -1183,9 +1186,11 @@
   ScaleCost += getScalingFactorCost(TTI, LU, F);
 
   // Tally up the non-zero immediates.
+  int64_t AvgOffset = 0;
   for (const LSRFixup &Fixup : LU.Fixups) {
     int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
+    AvgOffset += O;
     if (F.BaseGV)
       ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
@@ -1198,6 +1203,17 @@
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
+  if (LU.Kind == LSRUse::Address && LU.SameBaseAddress) {
+    if (LU.Fixups.size())
+      AvgOffset /= LU.Fixups.size();
+    int64_t Offset = AvgOffset + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64 * LU.SameBaseAddress;
+    else if (F.BaseOffset != 0)
+      ImmCost +=
+          APInt(64, Offset, true).getMinSignedBits() * LU.SameBaseAddress;
+  }
   assert(isValid() && "invalid cost");
 }
@@ -1708,7 +1724,7 @@
   SmallPtrSet<Use*, MaxChains> IVIncSet;
 
   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+  bool FindIVUserForInst(Instruction *Cond, IVStrideUse *&CondUse);
   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
   void OptimizeLoopTermCond();
@@ -1743,6 +1759,8 @@
   void CollectLoopInvariantFixupsAndFormulae();
 
+  void CountSameBaseUses();
+
   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                               unsigned Depth = 0);
@@ -1932,7 +1950,7 @@
 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForInst(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2148,7 +2166,7 @@
     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
-    if (!FindIVUserForCond(Cond, CondUse))
+    if (!FindIVUserForInst(Cond, CondUse))
       continue;
 
     // If the trip count is computed in terms of a max (due to ScalarEvolution
@@ -3206,6 +3224,28 @@
   }
 }
 
+/// Set LU.SameBaseAddress to the number of Incs for each profitable Chain.
+void
+LSRInstance::CountSameBaseUses() {
+  for (const IVChain &Chain : IVChainVec) {
+    // Get an LSRUse.
+    if (Chain.Incs.size() < 2)
+      continue;
+    IVStrideUse *IUse;
+    if (!FindIVUserForInst(Chain.Incs[0].UserInst, IUse))
+      continue;
+    const SCEV *S = IU.getExpr(*IUse);
+    ExtractImmediate(S, SE);
+    LSRUse::SCEVUseKindPair P = LSRUse::SCEVUseKindPair(S, LSRUse::Address);
+    if (UseMap.count(P)) {
+      // A use already existed with this base.
+      size_t LUIdx = UseMap.find(P)->second;
+      LSRUse &LU = Uses[LUIdx];
+      LU.SameBaseAddress += (Chain.Incs.size() - 1);
+    }
+  }
+}
+
 /// Split S into subexpressions which can be pulled out into separate
 /// registers. If C is non-null, multiply each subexpression by C.
 ///
@@ -4883,6 +4923,9 @@
   CollectFixupsAndInitialFormulae();
   CollectLoopInvariantFixupsAndFormulae();
 
+  // Count address uses that share the same base address.
+  CountSameBaseUses();
+
   assert(!Uses.empty() && "IVUsers reported at least one use");
   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
         print_uses(dbgs()));
Index: test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
+++ test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s
+; This test checks that the immediate cost is counted for both loads (%tmp and %tmp1).
+; If it is counted only for the first load, LSR will prefer to leave the
+; setup add in place.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @foo(i32 %n, i8* nocapture readonly %a) {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %add.ptr = getelementptr inbounds i8, i8* %a, i64 -1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %inptr.014 = phi i8* [ %incdec.ptr1, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %s.013 = phi i32 [ %add3, %for.body ], [ 0, %for.body.preheader ]
+  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %inptr.014, i64 1
+  %tmp = load i8, i8* %inptr.014, align 1
+  %conv = sext i8 %tmp to i32
+  %add = add nsw i32 %conv, %s.013
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %inptr.014, i64 2
+  %tmp1 = load i8, i8* %incdec.ptr, align 1
+  %conv2 = sext i8 %tmp1 to i32
+  %add3 = add nsw i32 %add, %conv2
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  ret i32 %s.0.lcssa
+}