Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -233,6 +233,17 @@
   /// incurs significant execution cost.
   bool isLoweredToCall(const Function *F) const;

+  struct LSRCost {
+    unsigned Insns;
+    unsigned NumRegs;
+    unsigned AddRecCost;
+    unsigned NumIVMuls;
+    unsigned NumBaseAdds;
+    unsigned ImmCost;
+    unsigned SetupCost;
+    unsigned ScaleCost;
+  };
+
   /// Parameters that control the generic loop unrolling transformation.
   struct UnrollingPreferences {
     /// The cost threshold for the unrolled loop. Should be relative to the
@@ -347,6 +358,10 @@
                              bool HasBaseReg, int64_t Scale,
                              unsigned AddrSpace = 0) const;

+  /// \brief Return true if LSR cost C1 is lower than LSR cost C2.
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2) const;
+
   /// \brief Return true if the target supports masked load/store
   /// AVX2 and AVX-512 targets allow masks for consecutive load and store
   bool isLegalMaskedStore(Type *DataType) const;
@@ -708,6 +723,8 @@
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale,
                                      unsigned AddrSpace) = 0;
+  virtual bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                              TargetTransformInfo::LSRCost &C2) = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -875,6 +892,10 @@
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
                                       Scale, AddrSpace);
   }
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2) override {
+    return Impl.isLSRCostLower(C1, C2);
+  }
   bool isLegalMaskedStore(Type *DataType) override {
     return Impl.isLegalMaskedStore(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -218,6 +218,13 @@
     return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1);
   }

+  bool isLSRCostLower(TTI::LSRCost &C1, TTI::LSRCost &C2) {
+    return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds,
+                    C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+           std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds,
+                    C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  }
+
   bool isLegalMaskedStore(Type *DataType) { return false; }

   bool isLegalMaskedLoad(Type *DataType) { return false; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -129,6 +129,10 @@
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
   }

+  bool isLSRCostLower(TTI::LSRCost C1, TTI::LSRCost C2) {
+    return TargetTransformInfoImplBase::isLSRCostLower(C1, C2);
+  }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -123,6 +123,10 @@
                                         Scale, AddrSpace);
 }

+bool TargetTransformInfo::isLSRCostLower(LSRCost &C1, LSRCost &C2) const {
+  return TTIImpl->isLSRCostLower(C1, C2);
+}
+
 bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedStore(DataType);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -87,6 +87,8 @@
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1863,6 +1863,17 @@
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }

+bool X86TTIImpl::isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                                TargetTransformInfo::LSRCost &C2) {
+  // X86-specific ordering: the instruction count has first priority.
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+                  C2.NumIVMuls, C2.NumBaseAdds,
+                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -323,6 +323,8 @@

   bool unscale();

+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;

@@ -457,6 +459,14 @@
   return true;
 }

+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -855,6 +865,15 @@
   return Changed;
 }

+/// Returns true if A and B have the same constant value.
+///
+static bool hasSameConstValue(const SCEV *A, const SCEV *B) {
+  if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(A))
+    if (const SCEVConstant *BC = dyn_cast<SCEVConstant>(B))
+      return APInt::isSameValue(AC->getAPInt(), BC->getAPInt());
+  return false;
+}
+
 namespace {

 class LSRUse;
@@ -882,36 +901,39 @@
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
-  unsigned NumRegs;
-  unsigned AddRecCost;
-  unsigned NumIVMuls;
-  unsigned NumBaseAdds;
-  unsigned ImmCost;
-  unsigned SetupCost;
-  unsigned ScaleCost;
+  TargetTransformInfo::LSRCost C;

 public:
-  Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+  Cost() {
+    C.Insns = 0;
+    C.NumRegs = 0;
+    C.AddRecCost = 0;
+    C.NumIVMuls = 0;
+    C.NumBaseAdds = 0;
+    C.ImmCost = 0;
+    C.SetupCost = 0;
+    C.ScaleCost = 0;
+  }

-  bool operator<(const Cost &Other) const;
+  bool isLower(Cost &Other, const TargetTransformInfo &TTI);

   void Lose();

 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
-             | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
-           & ImmCost & SetupCost & ScaleCost) == ~0u);
+    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls
+             | C.NumBaseAdds | C.ImmCost | C.SetupCost
+             | C.ScaleCost) != ~0u)
+      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls
+           & C.NumBaseAdds & C.ImmCost & C.SetupCost
+           & C.ScaleCost) == ~0u);
   }
 #endif

   bool isLoser() {
     assert(isValid() && "invalid cost");
-    return NumRegs == ~0u;
+    return C.NumRegs == ~0u;
   }

   void RateFormula(const TargetTransformInfo &TTI,
@@ -1095,8 +1117,7 @@
         Lose();
         return;
       }
-      AddRecCost += 1; /// TODO: This should be a function of the stride.
-
+      C.AddRecCost += 1; /// TODO: This should be a function of the stride.
       // Add the step value register, if it needs one.
       // TODO: The non-affine case isn't precisely modeled here.
       if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
@@ -1107,7 +1128,7 @@
       }
     }
   }
-  ++NumRegs;
+  ++C.NumRegs;

   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
@@ -1116,9 +1137,9 @@
       !(isa<SCEVAddRecExpr>(Reg) &&
        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++SetupCost;
+    ++C.SetupCost;

-  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                SE.hasComputableLoopEvolution(Reg, L);
 }

@@ -1151,6 +1172,9 @@
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned AddRecCost = C.AddRecCost;
+  unsigned NumRegs = C.NumRegs;
+  unsigned NumBaseAdds = C.NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1170,72 +1194,90 @@
       return;
   }

+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction.
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (C.NumRegs > TTIRegNum) {
+    // The cost already exceeded TTIRegNum, so only newly added registers can
+    // add new instructions.
+    if (NumRegs > TTIRegNum)
+      C.Insns += (C.NumRegs - NumRegs);
+    else
+      C.Insns += (C.NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
     // Do not count the base and a possible second register if the target
     // allows to fold 2 registers.
-    NumBaseAdds +=
+    C.NumBaseAdds +=
       NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
-  NumBaseAdds += (F.UnfoldedOffset != 0);
-
+  C.NumBaseAdds += (F.UnfoldedOffset != 0);
   // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
-
+  C.ScaleCost += getScalingFactorCost(TTI, LU, F);
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
     int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
     if (F.BaseGV)
-      ImmCost += 64; // Handle symbolic values conservatively.
+      C.ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
     else if (Offset != 0)
-      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+      C.ImmCost += APInt(64, Offset, true).getMinSignedBits();

     // Check with target if this offset with this instruction is
     // specifically not supported.
     if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
-      NumBaseAdds++;
+      C.NumBaseAdds++;
   }
+
+  // Each new AddRec adds 1 instruction to the calculation.
+  C.Insns += (C.AddRecCost - AddRecCost);
+  // An ICmpZero use adds no instruction if the formula ends with zero.
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    C.Insns++;
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    C.Insns += C.NumBaseAdds - NumBaseAdds;
   assert(isValid() && "invalid cost");
 }

 /// Set this cost to a losing value.
 void Cost::Lose() {
-  NumRegs = ~0u;
-  AddRecCost = ~0u;
-  NumIVMuls = ~0u;
-  NumBaseAdds = ~0u;
-  ImmCost = ~0u;
-  SetupCost = ~0u;
-  ScaleCost = ~0u;
+  C.Insns = ~0u;
+  C.NumRegs = ~0u;
+  C.AddRecCost = ~0u;
+  C.NumIVMuls = ~0u;
+  C.NumBaseAdds = ~0u;
+  C.ImmCost = ~0u;
+  C.SetupCost = ~0u;
+  C.ScaleCost = ~0u;
 }

 /// Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
-  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
-                  ImmCost, SetupCost) <
-         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
-                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
-                  Other.SetupCost);
+bool Cost::isLower(Cost &Other, const TargetTransformInfo &TTI) {
+  return TTI.isLSRCostLower(C, Other.C);
 }

 void Cost::print(raw_ostream &OS) const {
-  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
-  if (AddRecCost != 0)
-    OS << ", with addrec cost " << AddRecCost;
-  if (NumIVMuls != 0)
-    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
-  if (NumBaseAdds != 0)
-    OS << ", plus " << NumBaseAdds << " base add"
-       << (NumBaseAdds == 1 ? "" : "s");
-  if (ScaleCost != 0)
-    OS << ", plus " << ScaleCost << " scale cost";
-  if (ImmCost != 0)
-    OS << ", plus " << ImmCost << " imm cost";
-  if (SetupCost != 0)
-    OS << ", plus " << SetupCost << " setup cost";
+  OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+  if (C.AddRecCost != 0)
+    OS << ", with addrec cost " << C.AddRecCost;
+  if (C.NumIVMuls != 0)
+    OS << ", plus " << C.NumIVMuls << " IV mul"
+       << (C.NumIVMuls == 1 ? "" : "s");
+  if (C.NumBaseAdds != 0)
+    OS << ", plus " << C.NumBaseAdds << " base add"
+       << (C.NumBaseAdds == 1 ? "" : "s");
+  if (C.ScaleCost != 0)
+    OS << ", plus " << C.ScaleCost << " scale cost";
+  if (C.ImmCost != 0)
+    OS << ", plus " << C.ImmCost << " imm cost";
+  if (C.SetupCost != 0)
+    OS << ", plus " << C.SetupCost << " setup cost";
 }

 LLVM_DUMP_METHOD
@@ -1763,6 +1805,7 @@
   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateCrossUseConstantOffsets();
+  void GenerateCrossUseICmpZero();
   void GenerateAllReuseFormulae();

   void FilterOutUndesirableDedicatedRegisters();
@@ -3696,6 +3739,85 @@
   print(errs()); errs() << '\n';
 }

+/// Look for ICmpZero AddRecExprs that end with zero and try to reuse them in
+/// other formulas.
+/// For the following:
+///   ICmpZero {-40,+,4}
+///   Address  {%a,+,4}
+/// the algorithm will add one Address formula:
+///   ICmpZero {-40,+,4}
+///   Address  {%a} + {0,+,4}
+///            40 + {%a} + {-40,+,4}
+///
+void LSRInstance::GenerateCrossUseICmpZero() {
+  SmallVector<const SCEV *, 4> Sequence;
+  // Get all ICmpZero registers that end with zero.
+  for (LSRUse &LU : Uses) {
+    if (LU.Kind != LSRUse::ICmpZero)
+      continue;
+    for (const Formula &F : LU.Formulae) {
+      if (!F.hasZeroEnd())
+        continue;
+      const SCEVAddRecExpr *Reg = dyn_cast<SCEVAddRecExpr>(F.BaseRegs[0]);
+      if (!Reg || !isa<SCEVConstant>(Reg->getStart()))
+        continue;
+      Sequence.push_back(F.BaseRegs[0]);
+    }
+  }
+  if (Sequence.empty())
+    return;
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (LU.Kind == LSRUse::ICmpZero)
+      continue;
+    // If we find an AddRecExpr register in the LSR use that has the same step,
+    // try to make it match by shifting its constant start.
+    for (const SCEV *CmpReg : Sequence) {
+      const SCEVAddRecExpr *RegAR = cast<SCEVAddRecExpr>(CmpReg);
+      const SCEVConstant *RegStart = cast<SCEVConstant>(RegAR->getStart());
+      for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+        Formula F = LU.Formulae[L];
+        F.unscale();
+        Formula NewF = F;
+        bool Changed = false;
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEVAddRecExpr *BaseRegAR =
+              dyn_cast<SCEVAddRecExpr>(F.BaseRegs[N]);
+          if (!BaseRegAR)
+            continue;
+          if (!hasSameConstValue(BaseRegAR->getStepRecurrence(SE),
+                                 RegAR->getStepRecurrence(SE)))
+            continue;
+          const SCEVConstant *BaseRegStart =
+              dyn_cast<SCEVConstant>(BaseRegAR->getStart());
+          if (!BaseRegStart)
+            continue;
+          int64_t RegDiff = BaseRegStart->getAPInt().getSExtValue() -
+                            RegStart->getAPInt().getSExtValue();
+          Type *IntTy = SE.getEffectiveSCEVType(F.BaseRegs[N]->getType());
+          const SCEV *NegRegDiff =
+              SE.getSCEV(ConstantInt::get(IntTy, -RegDiff));
+          NewF.BaseOffset += RegDiff;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset +
+                                         RegDiff))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + RegDiff;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegRegDiff, F.BaseRegs[N]);
+          Changed = true;
+        }
+        if (!Changed)
+          continue;
+        NewF.canonicalize();
+        (void)InsertFormula(LU, LUIdx, NewF);
+      }
+    }
+  }
+}
+
 /// Look for registers which are a constant distance apart and try to form reuse
 /// opportunities between them.
 void LSRInstance::GenerateCrossUseConstantOffsets() {
@@ -3885,7 +4007,7 @@
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
-
+  GenerateCrossUseICmpZero();
   GenerateCrossUseConstantOffsets();

   DEBUG(dbgs() << "\n"
@@ -3961,7 +4083,7 @@
       Cost CostBest;
       Regs.clear();
       CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
-      if (CostF < CostBest)
+      if (CostF.isLower(CostBest, TTI))
        std::swap(F, Best);
       DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
             dbgs() << "\n"
@@ -4288,7 +4410,7 @@
     NewCost = CurCost;
     NewRegs = CurRegs;
     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
-    if (NewCost < SolutionCost) {
+    if (NewCost.isLower(SolutionCost, TTI)) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
Index: test/CodeGen/X86/2006-05-11-InstrSched.ll
===================================================================
--- test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN:     grep "asm-printer" | grep 35
+; RUN:     grep "asm-printer" | grep 33
 target datalayout = "e-p:32:32"

 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Index: test/CodeGen/X86/atom-fixup-lea3.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea3.ll
+++ test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl

 ; Test for the FixupLEAs pre-emit pass.
 ; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
 ;  return sum;
 ;}

-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@
   %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
   %inc1 = add nsw i32 %j.09, 1
   %arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+  store i32 %0, i32* %m, align 4
+  store i32 %sum.010, i32* %m, align 4
+  store i32 %0, i32* %m, align 4
   %1 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %1
   store i32 %add, i32* %m, align 4
Index: test/CodeGen/X86/avoid_complex_am.ll
===================================================================
--- test/CodeGen/X86/avoid_complex_am.ll
+++ test/CodeGen/X86/avoid_complex_am.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"

-define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c, i32 %n) {
 ; CHECK: @mulDouble
 entry:
   br label %for.body
@@ -30,9 +30,7 @@
   %arrayidx4 = getelementptr inbounds double, double* %a, i64 %indvars.iv
   store double %mul, double* %arrayidx4, align 8
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 1 = 19.
-; CHECK: icmp eq i32 {{%[^,]+}}, 19
-  %exitcond = icmp eq i32 %lftr.wideiv, 20
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %for.end, label %for.body

 for.end:                                          ; preds = %for.body
Index: test/CodeGen/X86/compact-unwind.ll
===================================================================
--- test/CodeGen/X86/compact-unwind.ll
+++ test/CodeGen/X86/compact-unwind.ll
@@ -66,12 +66,12 @@

 ; NOFP-CU:      Entry at offset 0x20:
 ; NOFP-CU-NEXT:   start:                0x1d _test1
-; NOFP-CU-NEXT:   length:               0x42
+; NOFP-CU-NEXT:   length:               0x4b
 ; NOFP-CU-NEXT:   compact encoding:     0x02040c0a

 ; NOFP-FROM-ASM:      Entry at offset 0x20:
 ; NOFP-FROM-ASM-NEXT:   start:                0x1d _test1
-; NOFP-FROM-ASM-NEXT:   length:               0x42
+; NOFP-FROM-ASM-NEXT:   length:               0x4b
 ; NOFP-FROM-ASM-NEXT:   compact encoding:     0x02040c0a

 define void @test1(%class.ImageLoader* %image) optsize ssp uwtable {
Index: test/CodeGen/X86/full-lsr.ll
===================================================================
--- test/CodeGen/X86/full-lsr.ll
+++ test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s

 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal

 ; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl

 entry:
 	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.

-; STATIC: movl    $-64, [[ECX:%e..]]
+; STATIC: movl    $-64, [[EAX:%e..]]

-; STATIC: movl    [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl    $16, [[ECX]]
+; STATIC: movl    %{{.+}}, _state+76([[EAX]])
+; STATIC: addl    $16, [[EAX]]
 ; STATIC: jne

-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same applies in PIC mode.

-; PIC: cmpl $64
+; PIC: movl    $-64, [[EAX:%e..]]
+
+; PIC: movl    %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl    $16, [[EAX]]
+; PIC: jne

 @state = external global [0 x i32]              ; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]          ; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/masked-iv-safe.ll
===================================================================
--- test/CodeGen/X86/masked-iv-safe.ll
+++ test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@

 ; CHECK-LABEL: count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@

 ; CHECK-LABEL: count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@

 ; CHECK-LABEL: count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@

 ; CHECK-LABEL: count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@

 ; CHECK-LABEL: another_count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@

 ; CHECK-LABEL: another_count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@

 ; CHECK-LABEL: another_count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@

 ; CHECK-LABEL: another_count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, 64(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, 68(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, 72(
+; TOPDOWN: movl %{{.*}}, 76(
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, 64(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, 68(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, 72(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, 76(
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, 64(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, 68(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, 72(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, 76(
 ; ILPMAX-LABEL: %for.end

 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -162,10 +162,10 @@
 ; Consequently, we should *not* form any chains.
 ;
 ; X64: foldedidx:
-; X64: movzbl -3(
+; X64: movzbl 400(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -277,7 +277,7 @@
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
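
The new isLSRCostLower hook turns the LSR cost comparison into a target decision: the default implementation keeps the previous lexicographic order (NumRegs first, Insns ignored), while the X86 override above compares Insns first. As an illustration only, not part of the patch above, a target that cares mostly about register pressure could override the hook as sketched below; the class name HypotheticalTTIImpl and the chosen field order are assumptions made for this sketch.

    // Sketch of a hypothetical target override of TTI::isLSRCostLower.
    // HypotheticalTTIImpl is assumed to derive from BasicTTIImplBase<...>
    // the same way X86TTIImpl does; <tuple> must be available for std::tie.
    bool HypotheticalTTIImpl::isLSRCostLower(TargetTransformInfo::LSRCost &C1,
                                             TargetTransformInfo::LSRCost &C2) {
      // Rank register pressure above everything else, then fall back to the
      // remaining components in the default order.
      return std::tie(C1.NumRegs, C1.Insns, C1.AddRecCost, C1.NumIVMuls,
                      C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
             std::tie(C2.NumRegs, C2.Insns, C2.AddRecCost, C2.NumIVMuls,
                      C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
    }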