Index: include/llvm/Analysis/IVUsers.h
===================================================================
--- include/llvm/Analysis/IVUsers.h
+++ include/llvm/Analysis/IVUsers.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_ANALYSIS_IVUSERS_H
 #define LLVM_ANALYSIS_IVUSERS_H
 
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionNormalization.h"
@@ -99,6 +100,7 @@
   DominatorTree *DT;
   ScalarEvolution *SE;
   SmallPtrSet<Instruction *, 16> Processed;
+  DenseMap<Value *, SmallPtrSet<Value *, 2>> IVOperandsMap;
 
   /// IVUses - A list of all tracked IV uses of induction variable expressions
   /// we are interested in.
@@ -114,7 +116,8 @@
   IVUsers(IVUsers &&X)
       : L(std::move(X.L)), AC(std::move(X.AC)), DT(std::move(X.DT)),
         SE(std::move(X.SE)), Processed(std::move(X.Processed)),
-        IVUses(std::move(X.IVUses)), EphValues(std::move(X.EphValues)) {
+        IVOperandsMap(std::move(X.IVOperandsMap)), IVUses(std::move(X.IVUses)),
+        EphValues(std::move(X.EphValues)) {
     for (IVStrideUse &U : IVUses)
       U.Parent = this;
   }
@@ -152,6 +155,13 @@
     return Processed.count(Inst);
   }
 
+  /// Return the IV operands used by \p Use, or null if \p Use is not tracked.
+  SmallPtrSetImpl<Value *> *getIVsUsedBy(Value *Use) {
+    if (!IVOperandsMap.count(Use))
+      return nullptr;
+    return &IVOperandsMap[Use];
+  }
+
   void releaseMemory();
 
   void print(raw_ostream &OS, const Module * = nullptr) const;
Index: lib/Analysis/IVUsers.cpp
===================================================================
--- lib/Analysis/IVUsers.cpp
+++ lib/Analysis/IVUsers.cpp
@@ -225,6 +225,9 @@
     if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests))
       return false;
 
+    // Track the IV operand I feeding this user.
+    IVOperandsMap[User].insert(I);
+
     // Descend recursively, but not into PHI nodes outside the current loop.
     // It's important to see the entire expression outside the loop to get
     // choices that depend on addressing mode use right, although we won't
@@ -349,6 +352,7 @@
 void IVUsers::releaseMemory() {
   Processed.clear();
   IVUses.clear();
+  IVOperandsMap.clear();
 }
 
 IVUsersWrapperPass::IVUsersWrapperPass() : LoopPass(ID) {
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -65,6 +65,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
@@ -123,12 +124,19 @@
 #define DEBUG_TYPE "loop-reduce"
 
+STATISTIC(NumCheaperInput, "Number of loops where the LSR input was cheaper");
+
 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
 /// bail out. This threshold is far beyond the number of users that LSR can
 /// conceivably solve, so it should not affect generated code, but catches the
 /// worst cases before LSR burns too much compile time and stack space.
 static const unsigned MaxIVUsers = 200;
 
+static cl::opt<bool>
+    EnableInitialCostCheck("enable-lsr-input-cost-check", cl::Hidden,
+                           cl::init(true),
+                           cl::desc("Enable the LSR input cost check"));
+
 // Temporary flag to cleanup congruent phis after LSR phi expansion.
 // It's currently disabled until we can determine whether it's truly useful or
 // not. The flag should be removed after the v3.0 release.
@@ -1020,6 +1028,8 @@
 
   void Lose();
 
+  void ApplyWeight();
+
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
@@ -1404,6 +1414,20 @@
   C.ScaleCost = std::numeric_limits<unsigned>::max();
 }
 
+/// Apply some weight to this cost.
+void Cost::ApplyWeight() {
+  // FIXME: For now, simply increase Insns and NumRegs by one and ignore the
+  // other cost criteria.
+  ++C.Insns;
+  ++C.NumRegs;
+  C.AddRecCost = std::numeric_limits<unsigned>::max();
+  C.NumIVMuls = std::numeric_limits<unsigned>::max();
+  C.NumBaseAdds = std::numeric_limits<unsigned>::max();
+  C.ImmCost = std::numeric_limits<unsigned>::max();
+  C.SetupCost = std::numeric_limits<unsigned>::max();
+  C.ScaleCost = std::numeric_limits<unsigned>::max();
+}
+
 /// Choose the lower cost.
 bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
   if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
@@ -1946,6 +1970,35 @@
   std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
                                     MemAccessTy AccessTy);
 
+  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+                                    MemAccessTy AccessTy,
+                                    SmallVectorImpl<LSRUse> &LsrUses,
+                                    UseMapTy &UseMap);
+
+  LSRUse &GetOrCreateLSRUse(Instruction *UserInst, Value *IVOp, const SCEV *&S,
+                            LSRUse::KindType Kind, MemAccessTy &AccessTy,
+                            PostIncLoopSet &TmpPostIncLoops, UseMapTy &UseMap,
+                            SmallVectorImpl<LSRUse> &LsrUses, size_t &LUIdx);
+
+  bool FormInputGEPFormula(Value *Op, LSRUse &LU, Formula &F);
+
+  bool FormInputLSRUsesAndFormulae(
+      Instruction *UserInst, Value *OperandValToReplace,
+      DenseSet<std::pair<Instruction *, Value *>> &Visited,
+      SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap);
+
+  bool FormInputLSRUseAndFormula(
+      Instruction *UserInst, Value *Op,
+      DenseSet<std::pair<Instruction *, Value *>> &Visited,
+      SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap);
+
+  bool CollectInputFormulae(SmallVectorImpl<LSRUse> &LSRUses);
+
+  bool IsInputCostCheaperThanSolutionCost(Cost &SolutionCost);
+
+  bool IsInputCostStillCheap(Cost &SolutionCost, Cost &InputCost);
+
   void DeleteUse(LSRUse &LU, size_t LUIdx);
 
   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
@@ -1953,7 +2006,8 @@
   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
-  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F,
+                     bool CountReg = true);
 
   void CollectLoopInvariantFixupsAndFormulae();
 
@@ -1996,7 +2050,9 @@
             const Cost &CurCost,
             const SmallPtrSet<const SCEV *, 16> &CurRegs,
             DenseSet<const SCEV *> &VisitedRegs) const;
-  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+  void Solve(SmallVectorImpl<const Formula *> &Solution,
+             Cost &SolutionCost) const;
 
   BasicBlock::iterator
     HoistInsertPosition(BasicBlock::iterator IP,
@@ -2532,6 +2588,14 @@
 std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
                                                LSRUse::KindType Kind,
                                                MemAccessTy AccessTy) {
+  return getUse(Expr, Kind, AccessTy, Uses, UseMap);
+}
+
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+                                               LSRUse::KindType Kind,
+                                               MemAccessTy AccessTy,
+                                               SmallVectorImpl<LSRUse> &LUs,
+                                               UseMapTy &UMap) {
   const SCEV *Copy = Expr;
   int64_t Offset = ExtractImmediate(Expr, SE);
@@ -2543,21 +2607,22 @@
   }
 
   std::pair<UseMapTy::iterator, bool> P =
-      UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+      UMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+
   if (!P.second) {
     // A use already existed with this base.
     size_t LUIdx = P.first->second;
-    LSRUse &LU = Uses[LUIdx];
+    LSRUse &LU = LUs[LUIdx];
     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
       // Reuse this use.
       return std::make_pair(LUIdx, Offset);
   }
 
   // Create a new use.
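+  // (The new use is appended to whichever vector was passed in, so this one
+  // path serves both the real Uses list and the temporary input-cost uses
+  // collected by CollectInputFormulae().)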
-  size_t LUIdx = Uses.size();
+  size_t LUIdx = LUs.size();
   P.first->second = LUIdx;
-  Uses.push_back(LSRUse(Kind, AccessTy));
-  LSRUse &LU = Uses[LUIdx];
+  LUs.push_back(LSRUse(Kind, AccessTy));
+  LSRUse &LU = LUs[LUIdx];
   LU.MinOffset = Offset;
   LU.MaxOffset = Offset;
@@ -3204,6 +3269,126 @@
   }
 }
 
+bool LSRInstance::FormInputLSRUsesAndFormulae(
+    Instruction *UserInst, Value *OperandValToReplace,
+    DenseSet<std::pair<Instruction *, Value *>> &Visited,
+    SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap) {
+  SmallVector<std::pair<Instruction *, Value *>, 4> Worklist;
+  Worklist.push_back(std::make_pair(UserInst, OperandValToReplace));
+
+  while (!Worklist.empty()) {
+    std::pair<Instruction *, Value *> Element = Worklist.pop_back_val();
+    Value *Op = Element.second;
+    Instruction *Inst = Element.first;
+
+    if (!FormInputLSRUseAndFormula(Inst, Op, Visited, InputUses, InputUseMap))
+      return false;
+
+    if (!IU.getIVsUsedBy(Op)) {
+      assert(isa<PHINode>(Op) &&
+             "Expect only a PHI at the root of an IV operand chain");
+      continue;
+    }
+    for (auto IV : *IU.getIVsUsedBy(Op))
+      Worklist.push_back(std::make_pair(cast<Instruction>(Op), IV));
+  }
+  return true;
+}
+
+bool LSRInstance::FormInputLSRUseAndFormula(
+    Instruction *UserInst, Value *Op,
+    DenseSet<std::pair<Instruction *, Value *>> &Visited,
+    SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap) {
+  if (!Visited.insert(std::make_pair(UserInst, Op)).second)
+    return true;
+
+  LSRUse::KindType Kind = LSRUse::Basic;
+  MemAccessTy AccessTy;
+  if (UserInst && isAddressUse(TTI, UserInst, Op)) {
+    Kind = LSRUse::Address;
+    AccessTy = getAccessType(TTI, UserInst, Op);
+  }
+
+  const SCEV *S = SE.getSCEV(Op);
+  size_t LUIdx;
+  PostIncLoopSet TmpPostIncLoops;
+  LSRUse &LU = GetOrCreateLSRUse(UserInst, Op, S, Kind, AccessTy,
+                                 TmpPostIncLoops, InputUseMap, InputUses,
+                                 LUIdx);
+
+  Formula F;
+
+  if (isa<GEPOperator>(Op)) {
+    // TODO: For now we only consider GEPs directly used in addressing.
+    if (!isAddressUse(TTI, UserInst, Op))
+      return false;
+    // Since we need to model the situation where all IV operands are kept, we
+    // should break down the GEP's SCEV in terms of its IV operands so that we
+    // can reuse the Regs from the IV operands in the GEP. For example, in the
+    // IR below,
+    //
+    //   %iv = phi i32 [ 0, %Preheader ], [ %iv.next, %LoopBody ]
+    //   %gep = getelementptr i32, i32* %Base, i32 %iv
+    //   %iv.next = add i32 %iv, 1
+    //
+    // we cannot simply use the original initial formula (reg({%Base,+,4}))
+    // from SE.getSCEV(). Instead, we have to break down the formula in terms
+    // of the IV operand, like reg(%Base) + 4*reg({0,+,1}), so that the cost
+    // model can see that reg({0,+,1}) is shared.
+    if (!FormInputGEPFormula(Op, LU, F))
+      return false;
+  } else
+    F.initialMatch(S, L, SE);
+
+  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+    return false;
+
+  InsertFormula(LU, LUIdx, F, /*CountReg=*/false);
+
+  // TODO: For now, expect only one formula per LU. If we encounter multiple
+  // formulae, it must be because other GEPs in the same LU were created in a
+  // different form by FormInputGEPFormula(). We do not handle such cases yet.
+  if (LU.Formulae.size() != 1)
+    return false;
+
+  return true;
+}
+
+LSRUse &LSRInstance::GetOrCreateLSRUse(
+    Instruction *UserInst, Value *IVOp, const SCEV *&S, LSRUse::KindType Kind,
+    MemAccessTy &AccessTy, PostIncLoopSet &TmpPostIncLoops, UseMapTy &LsrUseMap,
+    SmallVectorImpl<LSRUse> &LsrUses, size_t &LUIdx) {
+  // Get or create an LSRUse.
+  std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy, LsrUses, LsrUseMap);
+  LUIdx = P.first;
+  int64_t Offset = P.second;
+  LSRUse &LU = LsrUses[LUIdx];
+
+  // Record the fixup.
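+  // (An LSRFixup records one concrete use site: the user instruction, the IV
+  // operand that would be rewritten, and the immediate offset that getUse()
+  // folded out of the expression.)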
+  LSRFixup &LF = LU.getNewFixup();
+  LF.UserInst = UserInst;
+  LF.OperandValToReplace = IVOp;
+  LF.PostIncLoops = TmpPostIncLoops;
+  LF.Offset = Offset;
+  LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
+  if (!LU.WidestFixupType ||
+      SE.getTypeSizeInBits(LU.WidestFixupType) <
+          SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+    LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+  return LU;
+}
+
+bool LSRInstance::CollectInputFormulae(SmallVectorImpl<LSRUse> &InputUses) {
+  DenseSet<std::pair<Instruction *, Value *>> Visited;
+  UseMapTy InputUseMap;
+
+  for (const IVStrideUse &U : IU)
+    if (!FormInputLSRUsesAndFormulae(U.getUser(), U.getOperandValToReplace(),
+                                     Visited, InputUses, InputUseMap))
+      return false;
+  return true;
+}
+
 void LSRInstance::CollectFixupsAndInitialFormulae() {
   for (const IVStrideUse &U : IU) {
     Instruction *UserInst = U.getUser();
@@ -3262,24 +3447,10 @@
       Factors.insert(-1);
     }
 
-    // Get or create an LSRUse.
-    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
-    size_t LUIdx = P.first;
-    int64_t Offset = P.second;
-    LSRUse &LU = Uses[LUIdx];
-
-    // Record the fixup.
-    LSRFixup &LF = LU.getNewFixup();
-    LF.UserInst = UserInst;
-    LF.OperandValToReplace = U.getOperandValToReplace();
-    LF.PostIncLoops = TmpPostIncLoops;
-    LF.Offset = Offset;
-    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
-
-    if (!LU.WidestFixupType ||
-        SE.getTypeSizeInBits(LU.WidestFixupType) <
-            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
-      LU.WidestFixupType = LF.OperandValToReplace->getType();
+    size_t LUIdx;
+    LSRUse &LU =
+        GetOrCreateLSRUse(UserInst, U.getOperandValToReplace(), S, Kind,
+                          AccessTy, TmpPostIncLoops, UseMap, Uses, LUIdx);
 
     // If this is the first use of this LSRUse, give it a formula.
     if (LU.Formulae.empty()) {
@@ -3327,7 +3498,8 @@
 
 /// If the given formula has not yet been inserted, add it to the list, and
 /// return true. Return false otherwise.
-bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F,
+                                bool CountReg) {
   // Do not insert formula that we will not be able to expand.
   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
          "Formula is illegal");
@@ -3335,7 +3507,95 @@
   if (!LU.InsertFormula(F, *L))
     return false;
 
-  CountRegisters(F, LUIdx);
+  if (CountReg)
+    CountRegisters(F, LUIdx);
   return true;
 }
 
+bool LSRInstance::FormInputGEPFormula(Value *Op, LSRUse &LU, Formula &F) {
+  assert(IU.getIVsUsedBy(Op) && "Expect at least one IV operand in the GEP");
+  GEPOperator *GEP = cast<GEPOperator>(Op);
+  SmallVector<const SCEV *, 4> NonIVOpExprs;
+  SmallVector<std::pair<const SCEV *, const SCEV *>, 4> IVOpExprs;
+  const SCEV *BaseExpr = SE.getSCEV(GEP->getPointerOperand());
+  Type *IntPtrTy = SE.getEffectiveSCEVType(BaseExpr->getType());
+  SCEV::NoWrapFlags Wrap =
+      GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+  Type *CurTy = ArrayType::get(GEP->getSourceElementType(), 0);
+
+  if (!IU.getIVsUsedBy(Op)->count(GEP->getPointerOperand()))
+    NonIVOpExprs.push_back(BaseExpr);
+  else
+    IVOpExprs.push_back(std::make_pair(SE.getOne(IntPtrTy), BaseExpr));
+
+  // Note that this loop is inspired by SE.getGEPExpr().
+  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index) {
+    Value *GEPIndex = *Index;
+    const SCEV *IndexExpr = SE.getSCEV(GEPIndex);
+
+    // Compute the (potentially symbolic) offset in bytes for this index.
+    if (StructType *STy = dyn_cast<StructType>(CurTy)) {
+      // For a struct, add the member offset.
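+      // (A struct index is always a ConstantInt in well-formed IR, so the
+      // cast below cannot fail; the field's byte offset comes from the
+      // target's struct layout via getOffsetOfExpr().)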
+      ConstantInt *FieldIndex = cast<SCEVConstant>(IndexExpr)->getValue();
+      unsigned FieldNo = FieldIndex->getZExtValue();
+      const SCEV *FieldOffset = SE.getOffsetOfExpr(IntPtrTy, STy, FieldNo);
+
+      // The field offset will be added as a part of NonIVOps.
+      NonIVOpExprs.push_back(FieldOffset);
+
+      // Update CurTy to the type of the field at FieldIndex.
+      CurTy = STy->getTypeAtIndex(FieldIndex);
+    } else {
+      // Update CurTy to its element type.
+      CurTy = cast<SequentialType>(CurTy)->getElementType();
+      // For an array, find the element offset, explicitly scaled.
+      const SCEV *ElementSize = SE.getSizeOfExpr(IntPtrTy, CurTy);
+      // Getelementptr indices are signed.
+      IndexExpr = SE.getTruncateOrSignExtend(IndexExpr, IntPtrTy);
+
+      // Multiply the index by the element size to compute the element offset.
+      const SCEV *LocalOffset = SE.getMulExpr(IndexExpr, ElementSize, Wrap);
+
+      if (!IU.getIVsUsedBy(Op)->count(GEPIndex))
+        NonIVOpExprs.push_back(LocalOffset);
+      else
+        IVOpExprs.push_back(std::make_pair(ElementSize, IndexExpr));
+    }
+  }
+
+  const SCEV *NonIVOps;
+  if (NonIVOpExprs.empty())
+    NonIVOps = SE.getZero(IntPtrTy);
+  else
+    NonIVOps = SE.getAddExpr(NonIVOpExprs);
+
+  Formula TempF;
+  TempF.initialMatch(NonIVOps, L, SE);
+
+  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, because
+  // we want to place the IV in ScaledReg.
+  TempF.unscale();
+
+  for (auto IVOpExpr : IVOpExprs) {
+    const SCEV *ElementSize = IVOpExpr.first;
+    const SCEV *IVOp = IVOpExpr.second;
+    int64_t Scale = cast<SCEVConstant>(ElementSize)->getAPInt().getSExtValue();
+
+    if (!TempF.BaseRegs.empty() && !TempF.ScaledReg) {
+      TempF.Scale = Scale;
+      TempF.ScaledReg = IVOp;
+    } else {
+      const SCEV *Offset = SE.getMulExpr(IVOp, ElementSize, Wrap);
+      TempF.BaseRegs.push_back(Offset);
+      TempF.HasBaseReg = true;
+    }
+  }
+
+  if (!TempF.isCanonical(*L))
+    return false;
+
+  F = TempF;
   return true;
 }
 
@@ -4841,9 +5101,9 @@
 /// Choose one formula from each use. Return the results in the given Solution
 /// vector.
-void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution,
+                        Cost &SolutionCost) const {
   SmallVector<const Formula *, 8> Workspace;
-  Cost SolutionCost;
   SolutionCost.Lose();
   Cost CurCost;
   SmallPtrSet<const SCEV *, 16> CurRegs;
@@ -4874,6 +5134,43 @@
   assert(Solution.size() == Uses.size() && "Malformed solution!");
 }
 
+bool LSRInstance::IsInputCostStillCheap(Cost &SolutionCost, Cost &InputCost) {
+  if (!InputCost.isLess(SolutionCost, TTI))
+    return false;
+
+  // By giving some weight to the input cost, we conservatively skip using
+  // LSR's selected solution.
+  Cost WeightedInputCost = InputCost;
+  WeightedInputCost.ApplyWeight();
+  return WeightedInputCost.isLess(SolutionCost, TTI);
+}
+
+bool LSRInstance::IsInputCostCheaperThanSolutionCost(Cost &SolutionCost) {
+  SmallVector<LSRUse, 16> InputUses;
+  if (!CollectInputFormulae(InputUses) || InputUses.empty())
+    return false;
+
+  DenseSet<const SCEV *> VisitedRegs;
+  SmallPtrSet<const SCEV *, 16> Regs;
+  Cost InputCost;
+  for (size_t i = 0, e = InputUses.size(); i != e; ++i) {
+    const LSRUse &LU = InputUses[i];
+    assert(LU.Formulae.size() == 1 && "Expect only one formula");
+    InputCost.RateFormula(TTI, LU.Formulae[0], Regs, VisitedRegs, L, SE, DT,
+                          LU);
+
+    if (!IsInputCostStillCheap(SolutionCost, InputCost))
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "\n"
+                       "The input requires ";
+             InputCost.print(dbgs()); dbgs() << ":\n"
+                                                "The chosen solution requires ";
+             SolutionCost.print(dbgs()); dbgs() << ":\n";);
+  return true;
+}
+
 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
 /// far as we can go while still being dominated by the input positions. This
 /// helps canonicalize the insert position, which encourages sharing.
@@ -5414,7 +5711,8 @@
   NarrowSearchSpaceUsingHeuristics();
 
   SmallVector<const Formula *, 8> Solution;
-  Solve(Solution);
+  Cost SolutionCost;
+  Solve(Solution, SolutionCost);
 
   // Release memory that is no longer needed.
   Factors.clear();
@@ -5424,6 +5722,13 @@
   if (Solution.empty())
     return;
 
+  if (EnableInitialCostCheck &&
+      IsInputCostCheaperThanSolutionCost(SolutionCost)) {
+    ++NumCheaperInput;
+    LLVM_DEBUG(dbgs() << "Skip using LSR's solution.\n");
+    return;
+  }
+
 #ifndef NDEBUG
   // Formulae should be legal.
   for (const LSRUse &LU : Uses) {
Index: test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; RUN: opt -loop-reduce -S -enable-lsr-input-cost-check=false < %s | FileCheck %s
 ;
 ; LTO of clang, which mistakenly uses no TargetLoweringInfo, causes a
 ; miscompile. ReuseOrCreateCast replaces a ptrtoint operand with undef.
Index: test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
@@ -0,0 +1,91 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=aarch64 -lsr-insns-cost=true -debug-only=loop-reduce 2>&1 | FileCheck %s
+
+; ModuleID = 'loophmmer.c'
+source_filename = "loophmmer.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-dcg-linux-gnu"
+
+; void test(int M, int *P, int *P2, int *P3, int *P4,
+;           int *P5, int *P6, int *P7, int *P8) {
+;   int t = 0;
+;   for (int k = 1; k <= M; ++k) {
+;     P[k-1] = P[k-1] + P2[k-1];
+;     P[k-1] = P3[k-1] + P4[k-1];
+;     P[k-1] = P6[k-1] + P7[k-1];
+;
+;     if (k < M) {
+;       P6[k] = P8[k] + P[k];
+;       P[k] = P4[k] + P5[k];
+;     }
+;   }
+; }
+
+; CHECK: Skip using LSR's solution
+; CHECK-LABEL: test:
+; CHECK: lsl [[K_IDX:x[0-9]+]], [[K:x[0-9]+]], #2
+; CHECK: sub [[K_1_IDX:x[0-9]+]], [[K_IDX]], #4
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, [[K_1_IDX]]]
+; CHECK: add [[K]], [[K]], #1
+
+define void @test(i32 %M, i32* %P, i32* readonly %P2, i32* readonly %P3, i32* readonly %P4, i32* readonly %P5, i32* %P6, i32* readonly %P7, i32* readonly %P8) {
+entry:
+  %cmp69 = icmp slt i32 %M, 1
+  br i1 %cmp69, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = sext i32 %M to i64
+  %1 = add i32 %M, 1
+  %wide.trip.count = zext i32 %1 to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 1, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %2 = add nsw i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %P2, i64 %2
+  %4 = load i32, i32* %arrayidx3, align 4
+  %add = add nsw i32 %4, %3
+  store i32 %add, i32* %arrayidx, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %P3, i64 %2
+  %5 = load i32, i32* %arrayidx9, align 4
+  %arrayidx12 = getelementptr inbounds i32, i32* %P4, i64 %2
+  %6 = load i32, i32* %arrayidx12, align 4
+  %add13 = add nsw i32 %6, %5
+  store i32 %add13, i32* %arrayidx, align 4
+
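+  ; Third group: P[k-1] = P6[k-1] + P7[k-1], still indexed by %2 = %iv - 1.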
+  %arrayidx19 = getelementptr inbounds i32, i32* %P6, i64 %2
+  %7 = load i32, i32* %arrayidx19, align 4
+  %arrayidx22 = getelementptr inbounds i32, i32* %P7, i64 %2
+  %8 = load i32, i32* %arrayidx22, align 4
+  %add23 = add nsw i32 %8, %7
+  store i32 %add23, i32* %arrayidx, align 4
+  %cmp27 = icmp slt i64 %indvars.iv, %0
+  br i1 %cmp27, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx29 = getelementptr inbounds i32, i32* %P8, i64 %indvars.iv
+  %9 = load i32, i32* %arrayidx29, align 4
+  %arrayidx31 = getelementptr inbounds i32, i32* %P, i64 %indvars.iv
+  %10 = load i32, i32* %arrayidx31, align 4
+  %add32 = add nsw i32 %10, %9
+  %arrayidx34 = getelementptr inbounds i32, i32* %P6, i64 %indvars.iv
+  store i32 %add32, i32* %arrayidx34, align 4
+  %arrayidx36 = getelementptr inbounds i32, i32* %P4, i64 %indvars.iv
+  %11 = load i32, i32* %arrayidx36, align 4
+  %arrayidx38 = getelementptr inbounds i32, i32* %P5, i64 %indvars.iv
+  %12 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %12, %11
+  store i32 %add39, i32* %arrayidx31, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
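---
Reviewer note: the heart of the change is the two-step comparison in
IsInputCostStillCheap(). The standalone sketch below models that comparison
outside of LLVM; ModelCost, applyWeight(), and inputStillCheap() are invented
names for illustration, and only the lexicographic fallback of Cost::isLess()
is modeled here (the real comparison also consults TargetTransformInfo and
the -lsr-insns-cost flag). It compiles as plain C++11.

#include <iostream>
#include <limits>
#include <tuple>

// Stand-in for the slice of LSR's Cost that this patch touches.
struct ModelCost {
  unsigned Insns = 0, NumRegs = 0, AddRecCost = 0, NumIVMuls = 0,
           NumBaseAdds = 0, ScaleCost = 0, ImmCost = 0, SetupCost = 0;

  // Lexicographic comparison, mirroring the tuple fallback of Cost::isLess().
  bool isLess(const ModelCost &O) const {
    return std::tie(Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds,
                    ScaleCost, ImmCost, SetupCost) <
           std::tie(O.Insns, O.NumRegs, O.AddRecCost, O.NumIVMuls,
                    O.NumBaseAdds, O.ScaleCost, O.ImmCost, O.SetupCost);
  }

  // Mirrors Cost::ApplyWeight(): penalize Insns and NumRegs by one and make
  // every other criterion maximally bad, so the weighted input only wins
  // when it is strictly better on the first two criteria.
  void applyWeight() {
    ++Insns;
    ++NumRegs;
    AddRecCost = NumIVMuls = NumBaseAdds = ImmCost = SetupCost = ScaleCost =
        std::numeric_limits<unsigned>::max();
  }
};

// Mirrors LSRInstance::IsInputCostStillCheap(): the input must beat the
// solution both before and after the conservative weighting.
bool inputStillCheap(const ModelCost &Solution, ModelCost Input) {
  if (!Input.isLess(Solution))
    return false;
  Input.applyWeight();
  return Input.isLess(Solution);
}

int main() {
  ModelCost Solution, CheapInput, MarginalInput;
  Solution.Insns = 10;      Solution.NumRegs = 6;
  CheapInput.Insns = 8;     CheapInput.NumRegs = 4;
  MarginalInput.Insns = 10; MarginalInput.NumRegs = 5;

  // A clearly cheaper input survives the weighting, so the loop would be
  // left untouched...
  std::cout << inputStillCheap(Solution, CheapInput) << '\n';    // prints 1
  // ...but an input that wins only by a hair does not, and LSR's solution
  // would be used after all.
  std::cout << inputStillCheap(Solution, MarginalInput) << '\n'; // prints 0
  return 0;
}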