Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -305,6 +305,11 @@
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) const;
 
+  /// \brief Return true when the AddRec can be folded into another insn. This
+  /// is possible when the target supports pre/postinc and the stride of the
+  /// AddRec is an acceptable constant.
+  bool isAddRecFoldable(int64_t stride) const;
+
   /// \brief Return true if the target works with masked instruction
   /// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
   /// AVX-512 architecture will also allow masks for non-consecutive memory
@@ -537,6 +542,7 @@
   virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
+  virtual bool isAddRecFoldable(int64_t stride) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale) = 0;
@@ -647,6 +653,9 @@
   bool isLegalICmpImmediate(int64_t Imm) override {
     return Impl.isLegalICmpImmediate(Imm);
   }
+  bool isAddRecFoldable(int64_t stride) override {
+    return Impl.isAddRecFoldable(stride);
+  }
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) override {
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -206,6 +206,8 @@
   bool isLegalICmpImmediate(int64_t Imm) { return false; }
 
+  bool isAddRecFoldable(int64_t stride) { return false; }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     // Guess that reg+reg addressing is allowed. This heuristic is taken from
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -124,6 +124,10 @@
     return getTLI()->isLegalICmpImmediate(imm);
   }
 
+  bool isAddRecFoldable(int64_t stride) {
+    return getTLI()->isAddRecFoldable(stride);
+  }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     TargetLoweringBase::AddrMode AM;
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1462,6 +1462,11 @@
     return true;
   }
 
+  /// \brief Return true when the AddRec can be folded into another insn. This
+  /// is possible when the target supports pre/postinc and the stride of the
+  /// AddRec is an acceptable constant.
+  virtual bool isAddRecFoldable(int64_t stride) const { return false; }
+
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
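
Note (not part of the patch): the AArch64 and ARM overrides further down simply return true for every stride. A target whose pre/postinc forms only accept a limited offset range could instead use the stride argument to filter. A minimal sketch of such an override; the MyTargetLowering name and the +/-255 window are hypothetical, for illustration only:

// Sketch only: accept strides that fit a hypothetical post-increment
// immediate field; the exact range is an assumption, not part of this patch.
bool MyTargetLowering::isAddRecFoldable(int64_t stride) const {
  return stride != 0 && stride >= -255 && stride <= 255;
}
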
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -97,6 +97,10 @@
   return TTIImpl->isLegalICmpImmediate(Imm);
 }
 
+bool TargetTransformInfo::isAddRecFoldable(int64_t stride) const {
+  return TTIImpl->isAddRecFoldable(stride);
+}
+
 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg,
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -307,6 +307,7 @@
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
+  bool isAddRecFoldable(int64_t stride) const override;
 
   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6766,6 +6766,12 @@
   return isLegalAddImmediate(Immed);
 }
 
+/// AArch64 supports pre/postinc, so the stride of the AddRec may be folded
+/// into another load/store insn.
+bool AArch64TargetLowering::isAddRecFoldable(int64_t stride) const {
+  return true;
+}
+
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -294,6 +294,10 @@
     /// the immediate into a register.
     bool isLegalICmpImmediate(int64_t Imm) const override;
 
+    /// isAddRecFoldable - Return true if the constant stride of an AddRec can
+    /// be folded into a preceding load/store insn via a pre/postinc addressing mode.
+    bool isAddRecFoldable(int64_t stride) const override;
+
    /// isLegalAddImmediate - Return true if the specified immediate is legal
     /// add immediate, that is the target has add instructions which can
     /// add a register and the immediate without having to materialize
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10299,6 +10299,10 @@
   return Imm >= 0 && Imm <= 255;
 }
 
+/// ARM supports pre/postinc, so the stride of the AddRec may be folded
+/// into another load/store insn.
+bool ARMTargetLowering::isAddRecFoldable(int64_t stride) const { return true; }
+
 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
 /// *or sub* immediate, that is the target has add or sub instructions which can
 /// add a register with the immediate without having to materialize the
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -103,6 +103,23 @@
 static bool StressIVChain = false;
 #endif
 
+enum LSRModel { BasicMod, AggrModUnlimitedReg, AggrModEstReg, AggrModFewReg };
+
+static cl::opt<LSRModel> LSRCostModel(
+    "lsrmodel", cl::Hidden, cl::init(BasicMod),
+    cl::desc("Choose the LSR cost model:"),
+    cl::values(
+        clEnumValN(BasicMod, "basic", "enable basic cost model"),
+        clEnumValN(
+            AggrModUnlimitedReg, "aggr-unlimited",
+            "enable aggressive cost model and assume unlimited hard regs"),
+        clEnumValN(AggrModEstReg, "aggr-estimate",
+                   "enable aggressive cost model and estimate reg pressure"),
+        clEnumValN(
+            AggrModFewReg, "aggr-few",
+            "enable aggressive cost model and assume very few hard regs"),
+        clEnumValEnd));
+
 namespace {
 
 /// RegSortData - This class holds data which is used to order reuse candidates.
@@ -847,36 +864,72 @@
 /// accurate cost model.
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F);
-// Get the cost of the scaling factor used in F for LU.
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F);
 
 namespace {
 
+/// Types of the cost models.
+enum CostTypes { BasicCostType, AggressiveCostType };
+
 /// Cost - This class is used to measure and compare candidate formulae.
 class Cost {
+protected:
+  unsigned NumRegs;
+  unsigned SetupCost;
+
+  /// Identifies the concrete cost model class of this object.
+  unsigned CostType;
+
+public:
+  static Cost &Create();
+  static void Delete(Cost &);
+
+  virtual ~Cost() {}
+  virtual bool operator<(const Cost &Other) const = 0;
+  virtual void operator=(const Cost &Other) = 0;
+  virtual void Lose() = 0;
+#ifndef NDEBUG
+  virtual bool isValid() = 0;
+#endif
+  virtual void clear() = 0;
+  virtual bool isLoser() = 0;
+  virtual void
+  RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+              SmallPtrSetImpl<const SCEV *> &Regs,
+              const DenseSet<const SCEV *> &VisitedRegs, const Loop *L,
+              const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE,
+              DominatorTree &DT, const LSRUse &LU,
+              SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr) = 0;
+
+  virtual void print(raw_ostream &OS) const = 0;
+  void dump();
+  unsigned getCostType() const { return CostType; }
+};
+
+class BasicCost : public Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
-  unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
   unsigned NumBaseAdds;
   unsigned ImmCost;
-  unsigned SetupCost;
   unsigned ScaleCost;
 
 public:
-  Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+  void clear();
+
+  BasicCost() {
+    CostType = BasicCostType;
+    clear();
+  }
 
-  bool operator<(const Cost &Other) const;
+  bool operator<(const Cost &Other) const override;
+  void operator=(const Cost &Other) override;
 
   void Lose();
 
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
-  bool isValid() {
+  virtual bool isValid() {
     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
@@ -889,38 +942,124 @@
     return NumRegs == ~0u;
   }
 
-  void RateFormula(const TargetTransformInfo &TTI,
-                   const Formula &F,
-                   SmallPtrSetImpl<const SCEV *> &Regs,
-                   const DenseSet<const SCEV *> &VisitedRegs,
-                   const Loop *L,
-                   const SmallVectorImpl<int64_t> &Offsets,
-                   ScalarEvolution &SE, DominatorTree &DT,
-                   const LSRUse &LU,
-                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+  virtual void RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
+                           const DenseSet<const SCEV *> &VisitedRegs,
+                           const Loop *L,
+                           const SmallVectorImpl<int64_t> &Offsets,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           const LSRUse &LU,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  virtual void print(raw_ostream &OS) const;
 
-  void print(raw_ostream &OS) const;
-  void dump() const;
+  static inline bool classof(const Cost *CT) {
+    return CT->getCostType() == BasicCostType;
+  }
 
 private:
-  void RateRegister(const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT);
-  void RatePrimaryRegister(const SCEV *Reg,
-                           SmallPtrSetImpl<const SCEV *> &Regs,
-                           const Loop *L,
-                           ScalarEvolution &SE, DominatorTree &DT,
-                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+  void RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs,
+                    const Loop *L, ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs,
+                           const Loop *L, ScalarEvolution &SE,
+                           DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+  unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                const LSRUse &LU, const Formula &F);
+};
+
+class AggressiveCost : public Cost {
+  /// Cost in number of instructions.
+  unsigned InstNumCost;
+  /// Cost of registers which may spill.
+  unsigned SpillRegsCost;
+  /// Instruction complexity cost.
+  unsigned InstCmplxCost;
+
+public:
+  void clear();
+
+  AggressiveCost() {
+    CostType = AggressiveCostType;
+    clear();
+  }
+
+  bool operator<(const Cost &Other) const override;
+  void operator=(const Cost &Other) override;
+
+  void Lose();
+
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    return (
+        ((InstNumCost | SpillRegsCost | InstCmplxCost | SetupCost) != ~0u) ||
+        ((InstNumCost & SpillRegsCost & InstCmplxCost & SetupCost) == ~0u));
+  }
+#endif
+
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return InstNumCost == ~0u;
+  }
+
+  virtual void RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
+                           const DenseSet<const SCEV *> &VisitedRegs,
+                           const Loop *L,
+                           const SmallVectorImpl<int64_t> &Offsets,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           const LSRUse &LU,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  virtual void print(raw_ostream &OS) const;
+
+  static inline bool classof(const Cost *CT) {
+    return CT->getCostType() == AggressiveCostType;
+  }
+
+private:
+  void RateRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU,
+                    bool &IsIndexedLoadStore);
+  void RatePrimaryRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                           SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs,
+                           const LSRUse &LU, bool &IsIndexedLoadStore);
+  unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                const LSRUse &LU, const Formula &F);
+  void UpdateSpillRegsCost(const TargetTransformInfo &TTI);
+  unsigned EvaluateBaseParts(const Formula &F, ScalarEvolution &SE,
+                             const Loop *L);
 };
 
 }
 
+Cost &Cost::Create() {
+  if (LSRCostModel == BasicMod)
+    return *(new BasicCost());
+  else
+    return *(new AggressiveCost());
+}
+
+void Cost::Delete(Cost &cost) { delete &cost; }
+
+void BasicCost::clear() {
+  NumRegs = 0;
+  AddRecCost = 0;
+  NumIVMuls = 0;
+  NumBaseAdds = 0;
+  ImmCost = 0;
+  SetupCost = 0;
+  ScaleCost = 0;
+}
+
 /// RateRegister - Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs,
-                        const Loop *L,
-                        ScalarEvolution &SE, DominatorTree &DT) {
+void BasicCost::RateRegister(const SCEV *Reg,
+                             SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                             ScalarEvolution &SE, DominatorTree &DT) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, don't second-guess its addrec phi
     // nodes. LSR isn't currently smart enough to reason about more than one
@@ -965,11 +1104,11 @@
 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 /// that refers to one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
-                               SmallPtrSetImpl<const SCEV *> &Regs,
-                               const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+void BasicCost::RatePrimaryRegister(const SCEV *Reg,
+                                    SmallPtrSetImpl<const SCEV *> &Regs,
+                                    const Loop *L, ScalarEvolution &SE,
+                                    DominatorTree &DT,
+                                    SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
@@ -981,15 +1120,14 @@
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
-                       SmallPtrSetImpl<const SCEV *> &Regs,
-                       const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
-                       ScalarEvolution &SE, DominatorTree &DT,
-                       const LSRUse &LU,
-                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+void BasicCost::RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                            SmallPtrSetImpl<const SCEV *> &Regs,
+                            const DenseSet<const SCEV *> &VisitedRegs,
+                            const Loop *L,
+                            const SmallVectorImpl<int64_t> &Offsets,
+                            ScalarEvolution &SE, DominatorTree &DT,
+                            const LSRUse &LU,
+                            SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -1039,7 +1177,7 @@
 }
 
 /// Lose - Set this cost to a losing value.
-void Cost::Lose() {
+void BasicCost::Lose() {
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1050,7 +1188,9 @@
 }
 
 /// operator< - Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
+bool BasicCost::operator<(const Cost &CT) const {
+  assert(isa<BasicCost>(CT) && "Expected a BasicCost object");
+  const BasicCost &Other = static_cast<const BasicCost &>(CT);
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1058,7 +1198,20 @@
                   Other.SetupCost);
 }
 
-void Cost::print(raw_ostream &OS) const {
+void BasicCost::operator=(const Cost &CT) {
+  assert(isa<BasicCost>(CT) && "Expected a BasicCost object");
+  const BasicCost &Other = static_cast<const BasicCost &>(CT);
+
+  NumRegs = Other.NumRegs;
+  AddRecCost = Other.AddRecCost;
+  NumIVMuls = Other.NumIVMuls;
+  NumBaseAdds = Other.NumBaseAdds;
+  ImmCost = Other.ImmCost;
+  SetupCost = Other.SetupCost;
+  ScaleCost = Other.ScaleCost;
+}
+
+void BasicCost::print(raw_ostream &OS) const {
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
@@ -1075,11 +1228,174 @@
     OS << ", plus " << SetupCost << " setup cost";
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Cost::dump() const {
-  print(errs()); errs() << '\n';
-}
-#endif
+void AggressiveCost::clear() {
+  NumRegs = 0;
+  SetupCost = 0;
+  InstNumCost = 0;
+  SpillRegsCost = 0;
+  InstCmplxCost = 0;
+}
+
+/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
+/// before, rate it. Optional LoserRegs provides a way to declare any formula
+/// that refers to one of those regs an instant loser.
+void AggressiveCost::RatePrimaryRegister(
+    const TargetTransformInfo &TTI, const SCEV *Reg,
+    SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE,
+    DominatorTree &DT, SmallPtrSetImpl<const SCEV *> *LoserRegs,
+    const LSRUse &LU, bool &IsIndexedLoadStore) {
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Lose();
+    return;
+  }
+  if (Regs.insert(Reg).second) {
+    RateRegister(TTI, Reg, Regs, L, SE, DT, LU, IsIndexedLoadStore);
+    if (LoserRegs && isLoser())
+      LoserRegs->insert(Reg);
+  }
+}
+
+unsigned AggressiveCost::EvaluateBaseParts(const Formula &F,
+                                           ScalarEvolution &SE, const Loop *L) {
+  unsigned variant = 0;
+  unsigned invariant = 0;
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+                                                     E = F.BaseRegs.end();
+       I != E; ++I) {
+    if (SE.isLoopInvariant(*I, L))
+      invariant++;
+    else
+      variant++;
+  }
+  if (F.BaseGV || F.BaseOffset)
+    invariant++;
+  if (F.ScaledReg)
+    variant++;
+  return variant + (invariant ? 1 : 0);
+}
+
+void AggressiveCost::RateFormula(const TargetTransformInfo &TTI,
+                                 const Formula &F,
+                                 SmallPtrSetImpl<const SCEV *> &Regs,
+                                 const DenseSet<const SCEV *> &VisitedRegs,
+                                 const Loop *L,
+                                 const SmallVectorImpl<int64_t> &Offsets,
+                                 ScalarEvolution &SE, DominatorTree &DT,
+                                 const LSRUse &LU,
+                                 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  // Records whether a pre/postinc addressing mode is used for the formula.
+  bool IsIndexedLoadStore = false;
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(TTI, ScaledReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
+    if (isLoser())
+      return;
+  }
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+                                                     E = F.BaseRegs.end();
+       I != E; ++I) {
+    const SCEV *BaseReg = *I;
+    if (VisitedRegs.count(BaseReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(TTI, BaseReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = EvaluateBaseParts(F, SE, L);
+  unsigned NumBaseAdds = 0;
+  if (NumBaseParts > 1)
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F) &&
+                             !IsIndexedLoadStore));
+  NumBaseAdds += (F.UnfoldedOffset != 0);
+  InstNumCost += NumBaseAdds;
+
+  // Accumulate non-free scaling amounts.
+  InstCmplxCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  unsigned ImmCost = 0;
+  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
+                                                E = Offsets.end();
+       I != E; ++I) {
+    int64_t Offset = (uint64_t)*I + F.BaseOffset;
+    if (Offset != 0 && IsIndexedLoadStore)
+      ImmCost += 1;
+  }
+  InstNumCost += ImmCost;
+
+  UpdateSpillRegsCost(TTI);
+  assert(isValid() && "invalid cost");
+}
+
+/// Lose - Set this cost to a losing value.
+void AggressiveCost::Lose() {
+  NumRegs = ~0u;
+  InstNumCost = ~0u;
+  SpillRegsCost = ~0u;
+  InstCmplxCost = ~0u;
+  SetupCost = ~0u;
+}
+
+/// UpdateSpillRegsCost - Update the SpillRegsCost according to the
+/// LSRCostModel and the register pressure.
+void AggressiveCost::UpdateSpillRegsCost(const TargetTransformInfo &TTI) {
+  if (LSRCostModel == AggrModEstReg)
+    SpillRegsCost = (NumRegs > TTI.getNumberOfRegisters(false))
+                        ? NumRegs - TTI.getNumberOfRegisters(false)
+                        : 0;
+  else if (LSRCostModel == AggrModUnlimitedReg)
+    SpillRegsCost = 0;
+  else
+    SpillRegsCost = NumRegs;
+}
+
+/// operator< - Choose the lower cost.
+bool AggressiveCost::operator<(const Cost &CT) const {
+  assert(isa<AggressiveCost>(CT) && "Expected an AggressiveCost object");
+  const AggressiveCost &Other = static_cast<const AggressiveCost &>(CT);
+  unsigned InstCost = InstNumCost + SpillRegsCost;
+  unsigned OtherInstCost = Other.InstNumCost + Other.SpillRegsCost;
+  return std::tie(InstCost, InstCmplxCost, SetupCost) <
+         std::tie(OtherInstCost, Other.InstCmplxCost, Other.SetupCost);
+}
+
+void AggressiveCost::operator=(const Cost &CT) {
+  assert(isa<AggressiveCost>(CT) && "Expected an AggressiveCost object");
+  const AggressiveCost &Other = static_cast<const AggressiveCost &>(CT);
+
+  NumRegs = Other.NumRegs;
+  InstNumCost = Other.InstNumCost;
+  SpillRegsCost = Other.SpillRegsCost;
+  InstCmplxCost = Other.InstCmplxCost;
+  SetupCost = Other.SetupCost;
+}
+
+void AggressiveCost::print(raw_ostream &OS) const {
+  OS << NumRegs << " Reg" << (NumRegs == 1 ? "" : "s");
+  if (InstNumCost != 0)
+    OS << ", plus " << InstNumCost << " InstNumCost";
+  if (SpillRegsCost != 0)
+    OS << ", plus " << SpillRegsCost << " SpillRegsCost";
+  if (InstCmplxCost != 0)
+    OS << ", plus " << InstCmplxCost << " InstCmplxCost";
+  if (SetupCost != 0)
+    OS << ", plus " << SetupCost << " SetupCost";
+}
 
 namespace {
 
@@ -1269,6 +1585,90 @@
 }
 
+/// RateRegister - Tally up interesting quantities from the given register.
+void AggressiveCost::RateRegister(const TargetTransformInfo &TTI,
+                                  const SCEV *Reg,
+                                  SmallPtrSetImpl<const SCEV *> &Regs,
+                                  const Loop *L, ScalarEvolution &SE,
+                                  DominatorTree &DT, const LSRUse &LU,
+                                  bool &IsIndexedLoadStore) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run on outer
+    // loops, and cannot be expected to change sibling loops.
+    if (AR->getLoop() != L) {
+      // If the AddRec exists, consider its register free and leave it alone.
+      if (isExistingPhi(AR, SE))
+        return;
+
+      // Otherwise, do not consider this formula at all.
+      Lose();
+      return;
+    }
+
+    // Usually we need a separate add/sub insn for every AddRec. But
+    // if the target supports a pre/postinc addressing mode, the following
+    // pattern (using AArch64 as an example):
+    //   adrp x10, a
+    //   add x10, x10, :lo12:a
+    // .LBB0_2:
+    //   add x12, x10, x9, lsl #2   // add insn for AddRec used by store
+    //   add x9, x9, #1
+    //   str w11, [x12, #48]
+    //   cmp x9, x8
+    //   b.lt .LBB0_2
+    // can be optimized to this form using the postinc addressing mode:
+    //   adrp x10, a
+    //   add x10, x10, :lo12:a
+    //   add x10, x10, #48
+    // .LBB0_2:
+    //   str w11, [x10], #4
+    //   add x9, x9, #1
+    //   cmp x9, x8
+    //   b.lt .LBB0_2
+    // and no separate add insn is required for the AddRec. If a pre/postinc
+    // addressing mode is used, we record it in IsIndexedLoadStore because
+    // it usually means the load/store cannot fold other regs or offsets
+    // into the same insn.
+    if (AR->isAffine() && isa<SCEVConstant>(AR->getOperand(1))) {
+      int64_t stride = dyn_cast_or_null<SCEVConstant>(AR->getOperand(1))
+                           ->getValue()
+                           ->getValue()
+                           .getSExtValue();
+      if (stride != 0 && TTI.isAddRecFoldable(stride) &&
+          (LU.Kind == LSRUse::Address))
+        IsIndexedLoadStore = true;
+      else
+        InstNumCost += 1;
+    } else {
+      InstNumCost += 1;
+    }
+
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+      if (!Regs.count(AR->getOperand(1))) {
+        RateRegister(TTI, AR->getOperand(1), Regs, L, SE, DT, LU,
+                     IsIndexedLoadStore);
+        if (isLoser())
+          return;
+      }
+    }
+  }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) && !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+
+  InstNumCost += isa<SCEVMulExpr>(Reg) && SE.hasComputableLoopEvolution(Reg, L);
+}
+
 /// HasFormula - Test whether this use as a formula which has the same
 /// registers as the given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1492,8 +1892,9 @@
                                      F.Scale);
 }
 
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F) {
+/// Get the cost of the scaling factor used in F for LU.
+unsigned BasicCost::getScalingFactorCost(const TargetTransformInfo &TTI,
+                                         const LSRUse &LU, const Formula &F) {
   if (!F.Scale)
     return 0;
 
@@ -1530,6 +1931,51 @@
   llvm_unreachable("Invalid LSRUse Kind!");
 }
 
+unsigned AggressiveCost::getScalingFactorCost(const TargetTransformInfo &TTI,
+                                              const LSRUse &LU,
+                                              const Formula &F) {
+  if (!F.Scale)
+    return 0;
+
+  // If the use is not completely folded in that instruction, we will have to
+  // pay an extra cost only for scale != 1.
+  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                            LU.AccessTy, F)) {
+    InstNumCost += 1;
+    return 0;
+  }
+
+  switch (LU.Kind) {
+  case LSRUse::Address: {
+    // Check the scaling factor cost with both the min and max offsets.
+    int ScaleCostMinOffset = TTI.getScalingFactorCost(
+        LU.AccessTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        F.Scale);
+    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+        LU.AccessTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        F.Scale);
+
+    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+           "Legal addressing mode has an illegal cost!");
+    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+  }
+  case LSRUse::ICmpZero:
+  case LSRUse::Basic:
+  case LSRUse::Special:
+    // For ICmpZero, the loop can be kept as a count-down loop only when
+    // F.getNumRegs() == 1 and there is no other offset, or else we
+    // will need a separate cmp/test instruction.
+    // For Basic and Special uses, no complex addressing mode is allowed, so
+    // we need an extra instruction when F contains more than one reg or
+    // F contains another offset.
+    if (!(F.getNumRegs() == 1 && F.BaseOffset == 0 && F.UnfoldedOffset == 0))
+      InstNumCost += 1;
+    return 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, Type *AccessTy,
                              GlobalValue *BaseGV, int64_t BaseOffset,
@@ -1768,6 +2214,9 @@
                     DenseSet<const SCEV *> &VisitedRegs) const;
   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
 
+  void DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                    Cost &SolutionCost, std::string HeadStr) const;
+
   BasicBlock::iterator
     HoistInsertPosition(BasicBlock::iterator IP,
                         const SmallVectorImpl<Instruction *> &Inputs) const;
@@ -3921,6 +4370,8 @@
     BestFormulaeTy;
   BestFormulaeTy BestFormulae;
 
+  Cost &CostF = Cost::Create();
+  Cost &CostBest = Cost::Create();
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
 
     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
@@ -3937,8 +4388,8 @@
       // avoids the need to recompute this information across formulae using the
       // same bad AddRec. Passing LoserRegs is also essential unless we remove
      // the corresponding bad register from the Regs set.
-      Cost CostF;
       Regs.clear();
+      CostF.clear();
       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
                         &LoserRegs);
       if (CostF.isLoser()) {
@@ -3973,8 +4424,8 @@
         Formula &Best = LU.Formulae[P.first->second];
 
-        Cost CostBest;
         Regs.clear();
+        CostBest.clear();
        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
                              DT, LU);
         if (CostF < CostBest)
@@ -4001,6 +4452,10 @@
     BestFormulae.clear();
   }
 
+  // Release the temporary cost objects.
+  Cost::Delete(CostF);
+  Cost::Delete(CostBest);
+
   DEBUG(if (ChangedFormulae) {
           dbgs() << "\n"
                     "After filtering out undesirable candidates:\n";
@@ -4296,12 +4751,14 @@
   // reference that register in order to be considered. This prunes out
   // unprofitable searching.
   SmallSetVector<const SCEV *, 4> ReqRegs;
-  for (const SCEV *S : CurRegs)
-    if (LU.Regs.count(S))
-      ReqRegs.insert(S);
+  if (LSRCostModel == BasicMod) {
+    for (const SCEV *S : CurRegs)
+      if (LU.Regs.count(S))
+        ReqRegs.insert(S);
+  }
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
-  Cost NewCost;
+  Cost &NewCost = Cost::Create();
   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
        E = LU.Formulae.end(); I != E; ++I) {
     const Formula &F = *I;
@@ -4341,27 +4798,23 @@
       if (F.getNumRegs() == 1 && Workspace.size() == 1)
         VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
     } else {
-      DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-            dbgs() << ".\n Regs:";
-            for (const SCEV *S : NewRegs)
-              dbgs() << ' ' << *S;
-            dbgs() << '\n');
-
       SolutionCost = NewCost;
       Solution = Workspace;
+      DumpSolution(Solution, SolutionCost, "New best at ");
     }
     Workspace.pop_back();
    }
  }
+  Cost::Delete(NewCost);
 }
 
 /// Solve - Choose one formula from each use. Return the results in the given
 /// Solution vector.
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
-  Cost SolutionCost;
+  Cost &SolutionCost = Cost::Create();
   SolutionCost.Lose();
-  Cost CurCost;
+  Cost &CurCost = Cost::Create();
   SmallPtrSet<const SCEV *, 16> CurRegs;
   DenseSet<const SCEV *> VisitedRegs;
   Workspace.reserve(Uses.size());
@@ -4371,13 +4824,23 @@
                CurRegs, VisitedRegs);
   if (Solution.empty()) {
     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
-    return;
+    goto ret;
   }
 
   // Ok, we've now made all our decisions.
-  DEBUG(dbgs() << "\n"
-                  "The chosen solution requires "; SolutionCost.print(dbgs());
-        dbgs() << ":\n";
+  DumpSolution(Solution, SolutionCost, "The chosen solution requires ");
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+
+ret:
+  Cost::Delete(SolutionCost);
+  Cost::Delete(CurCost);
+}
+
+/// DumpSolution - Dump the existing solution.
+void LSRInstance::DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                               Cost &SolutionCost, std::string HeadStr) const {
+  DEBUG(dbgs() << "\n" << HeadStr; SolutionCost.print(dbgs()); dbgs() << ":\n";
         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
           dbgs() << "  ";
           Uses[i].print(dbgs());
@@ -4386,8 +4849,6 @@
           Solution[i]->print(dbgs());
           dbgs() << '\n';
         });
-
-  assert(Solution.size() == Uses.size() && "Malformed solution!");
 }
 
 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
Index: test/Transforms/LoopStrengthReduce/new_cost_1.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/new_cost_1.ll
+++ test/Transforms/LoopStrengthReduce/new_cost_1.ll
@@ -0,0 +1,45 @@
+; This test makes sure that the loop uses a post-increment addressing mode to reduce insns.
+; RUN: opt < %s -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck %s
+
+; Make sure that two new loop induction variables are used.
+; CHECK-LABEL: @foo(
+; CHECK: getelementptr{{.*}}lsr.iv, i64 0, i64 1
+; CHECK: getelementptr{{.*}}lsr.iv2, i64 0, i64 1
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnu"
+
+@N = common global i32 0, align 4
+@a = common global [1000 x i32] zeroinitializer, align 4
+@b = common global [1000 x i32] zeroinitializer, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  %0 = load i32, i32* @N, align 4
+  %cmp9 = icmp sgt i32 %0, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv = sext i32 %0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.010 = phi i64 [ 0, %for.body.lr.ph ], [ %add3, %for.body ]
+  %add = add nuw nsw i64 %i.010, 12
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %add
+  store i32 15, i32* %arrayidx, align 4
+  %sub = add nsw i64 %i.010, -2
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %sub
+  store i32 15, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i64 %i.010, 1
+  %cmp = icmp slt i64 %add3, %conv
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
Index: test/Transforms/LoopStrengthReduce/new_cost_2.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/new_cost_2.ll
+++ test/Transforms/LoopStrengthReduce/new_cost_2.ll
@@ -0,0 +1,111 @@
+; This test makes sure that the loop uses a post-increment addressing mode to reduce insns.
+; RUN: opt < %s -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck %s
+
+; Make sure no lsr variable is generated. All the address references will share the
+; basic induction variable.
+; CHECK-NOT: lsr
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.G = type { i8 }
+%"class.G::H" = type { i8 }
+%class.C = type { %class.tuple }
+%class.tuple = type { i8 }
+%"class.internal::MatrixNaive" = type { i32, %class.J, %class.J.2, %class.J.2 }
+%class.J = type { %struct.F }
+%struct.F = type { %struct.anon }
+%struct.anon = type { %class.J.2* }
+%class.J.2 = type { %struct.F.3 }
+%struct.F.3 = type { %struct.anon.4 }
+%struct.anon.4 = type { float* }
+
+$_Z6MatrixI2S1IfLi27ELi51EEEvv = comdat any
+
+$_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi = comdat any
+
+@Benchmark = global %class.G zeroinitializer, align 1
+
+declare i32* @_Z8CallbackIPFvvEEPiT_(void ()*) #0
+
+; Function Attrs: noreturn uwtable
+define linkonce_odr void @_Z6MatrixI2S1IfLi27ELi51EEEvv() #1 comdat {
+entry:
+  tail call void @_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi(i32 undef)
+  unreachable
+}
+
+declare void @_ZN1G1HC1EPi(%"class.G::H"*, i32*) #0
+
+declare void @_ZN1GC1ENS_1HE(%class.G*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+; Function Attrs: noreturn uwtable
+define linkonce_odr void @_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi(i32) #1 comdat align 2 {
+entry:
+  %operation = alloca %class.C, align 1
+  %1 = getelementptr inbounds %class.C, %class.C* %operation, i64 0, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %1) #2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %call1 = call %"class.internal::MatrixNaive"* @_ZN1CIN8internal11MatrixNaiveIfLi27ELi51EEEEptEv(%class.C* %operation)
+  %matrix_.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 1
+  %call.i = call i64 @_ZN1JIS_If1AIfEES0_IS2_EE4sizeEv(%class.J* %matrix_.i)
+  %Execute_num_rows.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 0
+  %2 = load i32, i32* %Execute_num_rows.i, align 4
+  %tobool32.i = icmp eq i32 %2, 0
+  br i1 %tobool32.i, label %for.cond, label %for.body.lr.ph.i
+
+for.body.lr.ph.i:                                 ; preds = %for.cond
+  %conv.i.le = trunc i64 %call.i to i32
+  %3 = bitcast %class.J* %matrix_.i to i64**
+  %cmp29.i = icmp sgt i32 %conv.i.le, 0
+  %_M_start.i27.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 3, i32 0, i32 0, i32 0
+  %_M_start.i24.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 2, i32 0, i32 0, i32 0
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.cond.cleanup6.i, %for.body.lr.ph.i
+  %sum.033.i = phi float [ undef, %for.body.lr.ph.i ], [ %sum.1.lcssa.i, %for.cond.cleanup6.i ]
+  br i1 %cmp29.i, label %for.body7.lr.ph.i, label %for.cond.cleanup6.i
+
+for.body7.lr.ph.i:                                ; preds = %for.body.i
+  %4 = load i64*, i64** %3, align 8
+  %5 = load i64, i64* %4, align 8
+  %6 = inttoptr i64 %5 to float*
+  %7 = load float*, float** %_M_start.i24.i, align 8
+  br label %for.body7.i
+
+for.cond.cleanup6.i.loopexit:                     ; preds = %for.body7.i
+  br label %for.cond.cleanup6.i
+
+for.cond.cleanup6.i:                              ; preds = %for.cond.cleanup6.i.loopexit, %for.body.i
+  %sum.1.lcssa.i = phi float [ %sum.033.i, %for.body.i ], [ %add.i, %for.cond.cleanup6.i.loopexit ]
+  %8 = load float*, float** %_M_start.i27.i, align 8
+  store float %sum.1.lcssa.i, float* %8, align 4
+  br label %for.body.i
+
+for.body7.i:                                      ; preds = %for.body7.i, %for.body7.lr.ph.i
+  %indvars.iv.i = phi i64 [ 0, %for.body7.lr.ph.i ], [ %indvars.iv.next.i, %for.body7.i ]
+  %sum.130.i = phi float [ %sum.033.i, %for.body7.lr.ph.i ], [ %add.i, %for.body7.i ]
+  %add.ptr.i26.i = getelementptr inbounds float, float* %6, i64 %indvars.iv.i
+  %9 = load float, float* %add.ptr.i26.i, align 4
+  %add.ptr.i.i = getelementptr inbounds float, float* %7, i64 %indvars.iv.i
+  %10 = load float, float* %add.ptr.i.i, align 4
+  %mul.i = fmul float %9, %10
+  %add.i = fadd float %sum.130.i, %mul.i
+  %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %conv.i.le
+  br i1 %exitcond, label %for.cond.cleanup6.i.loopexit, label %for.body7.i
+}
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+declare %"class.internal::MatrixNaive"* @_ZN1CIN8internal11MatrixNaiveIfLi27ELi51EEEEptEv(%class.C*) #0
+
+declare i64 @_ZN1JIS_If1AIfEES0_IS2_EE4sizeEv(%class.J*) #0
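
Usage note (not part of the patch): the cost model is selected with the -lsrmodel option introduced in LoopStrengthReduce.cpp above, with the values basic, aggr-unlimited, aggr-estimate, and aggr-few. Assuming opt is built with this patch applied, the tests can be reproduced with an invocation mirroring their RUN lines:

  opt < new_cost_1.ll -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck new_cost_1.ll

The option defaults to basic (cl::init(BasicMod)), so the existing LSR cost model remains in effect unless the flag is passed explicitly.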