Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,8 @@
 
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty);
+  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                     TargetTransformInfo::LSRCost &C2);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2088,6 +2088,17 @@
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
 
+bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                               TargetTransformInfo::LSRCost &C2) {
+  // X86 specific here are "instruction number 1st priority".
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+                  C2.NumIVMuls, C2.NumBaseAdds,
+                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -924,6 +924,15 @@
   return Changed;
 }
 
+/// Returns true if A and B have the same constant value.
+///
+static bool hasSameConstValue(const SCEV *A, const SCEV *B) {
+  if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(A))
+    if (const SCEVConstant *BC = dyn_cast<SCEVConstant>(B))
+      return APInt::isSameValue(AC->getAPInt(), BC->getAPInt());
+  return false;
+}
+
 namespace {
 
 class LSRUse;
@@ -1891,6 +1900,7 @@
   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateCrossUseConstantOffsets();
+  void GenerateCrossUseICmpZero();
   void GenerateAllReuseFormulae();
 
   void FilterOutUndesirableDedicatedRegisters();
@@ -3845,6 +3855,85 @@
 }
 #endif
 
+/// Look for ICmp AddRecExprs that end with zero and try to reuse them in
+/// other formulas.
+/// For the following:
+///   ICmpZero {-40,+,4}
+///   Address  {%a,+,4}
+/// The algorithm will add one Address Formula:
+///   ICmpZero {-40,+,4}
+///   Address  {%a} + {0,+,4}
+///            40 + {%a} + {-40,+,4}
+///
+void LSRInstance::GenerateCrossUseICmpZero() {
+  SmallVector<const SCEV *, 4> Sequence;
+  // Get all ICmpZero registers that end with zero.
+  for (LSRUse &LU : Uses) {
+    if (LU.Kind != LSRUse::ICmpZero)
+      continue;
+    for (const Formula &F : LU.Formulae) {
+      if (!F.hasZeroEnd())
+        continue;
+      const SCEVAddRecExpr *Reg = dyn_cast<SCEVAddRecExpr>(F.BaseRegs[0]);
+      if (!Reg || !isa<SCEVConstant>(Reg->getStart()))
+        continue;
+      Sequence.push_back(F.BaseRegs[0]);
+    }
+  }
+  if (Sequence.empty())
+    return;
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (LU.Kind == LSRUse::ICmpZero)
+      continue;
+    // If we find an AddRecExpr register in the LSR use with the same step,
+    // try to make it match by shifting its constant start.
+    for (const SCEV *CmpReg : Sequence) {
+      const SCEVAddRecExpr *RegAR = cast<SCEVAddRecExpr>(CmpReg);
+      const SCEVConstant *RegStart = cast<SCEVConstant>(RegAR->getStart());
+      for (size_t K = 0, KE = LU.Formulae.size(); K != KE; ++K) {
+        Formula F = LU.Formulae[K];
+        F.unscale();
+        Formula NewF = F;
+        bool Changed = false;
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEVAddRecExpr *BaseRegAR =
+              dyn_cast<SCEVAddRecExpr>(F.BaseRegs[N]);
+          if (!BaseRegAR)
+            continue;
+          if (!hasSameConstValue(BaseRegAR->getStepRecurrence(SE),
+                                 RegAR->getStepRecurrence(SE)))
+            continue;
+          const SCEVConstant *BaseRegStart =
+              dyn_cast<SCEVConstant>(BaseRegAR->getStart());
+          if (!BaseRegStart)
+            continue;
+          int64_t RegDiff = BaseRegStart->getAPInt().getSExtValue() -
+                            RegStart->getAPInt().getSExtValue();
+          Type *IntTy = SE.getEffectiveSCEVType(F.BaseRegs[N]->getType());
+          const SCEV *NegRegDiff =
+              SE.getSCEV(ConstantInt::get(IntTy, -RegDiff));
+          NewF.BaseOffset += RegDiff;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset +
+                                         RegDiff))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + RegDiff;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegRegDiff, F.BaseRegs[N]);
+          Changed = true;
+        }
+        if (!Changed)
+          continue;
+        NewF.canonicalize(*L);
+        (void)InsertFormula(LU, LUIdx, NewF);
+      }
+    }
+  }
+}
+
 /// Look for registers which are a constant distance apart and try to form reuse
 /// opportunities between them.
 void LSRInstance::GenerateCrossUseConstantOffsets() {
@@ -4034,7 +4123,7 @@
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
-
+  GenerateCrossUseICmpZero();
   GenerateCrossUseConstantOffsets();
 
   DEBUG(dbgs() << "\n"
Index: test/CodeGen/X86/2006-05-11-InstrSched.ll
===================================================================
--- test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN:   grep "asm-printer" | grep 35
+; RUN:   grep "asm-printer" | grep 33
 target datalayout = "e-p:32:32"
 
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Index: test/CodeGen/X86/atom-fixup-lea3.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea3.ll
+++ test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl
 
 ; Test for the FixupLEAs pre-emit pass.
 ; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
 ;  return sum;
 ;}
 
-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@
   %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
   %inc1 = add nsw i32 %j.09, 1
   %arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+  store i32 %0, i32* %m, align 4
+  store i32 %sum.010, i32* %m, align 4
+  store i32 %0, i32* %m, align 4
   %1 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %1
   store i32 %add, i32* %m, align 4
Index: test/CodeGen/X86/avoid_complex_am.ll
===================================================================
--- test/CodeGen/X86/avoid_complex_am.ll
+++ test/CodeGen/X86/avoid_complex_am.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
 
-define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c, i32 %n) {
 ; CHECK: @mulDouble
 entry:
   br label %for.body
@@ -30,9 +30,7 @@
   %arrayidx4 = getelementptr inbounds double, double* %a, i64 %indvars.iv
   store double %mul, double* %arrayidx4, align 8
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 1 = 19.
-; CHECK: icmp eq i32 {{%[^,]+}}, 19
-  %exitcond = icmp eq i32 %lftr.wideiv, 20
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
Index: test/CodeGen/X86/compact-unwind.ll
===================================================================
--- test/CodeGen/X86/compact-unwind.ll
+++ test/CodeGen/X86/compact-unwind.ll
@@ -66,12 +66,12 @@
 
 ; NOFP-CU: Entry at offset 0x20:
 ; NOFP-CU-NEXT: start: 0x1d _test1
-; NOFP-CU-NEXT: length: 0x42
+; NOFP-CU-NEXT: length: 0x4b
 ; NOFP-CU-NEXT: compact encoding: 0x02040c0a
 
 ; NOFP-FROM-ASM: Entry at offset 0x20:
 ; NOFP-FROM-ASM-NEXT: start: 0x1d _test1
-; NOFP-FROM-ASM-NEXT: length: 0x42
+; NOFP-FROM-ASM-NEXT: length: 0x4b
 ; NOFP-FROM-ASM-NEXT: compact encoding: 0x02040c0a
 
 define void @test1(%class.ImageLoader* %image) optsize ssp uwtable {
Index: test/CodeGen/X86/full-lsr.ll
===================================================================
--- test/CodeGen/X86/full-lsr.ll
+++ test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal
 
 ; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl
 
 entry:
 	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.
 
-; STATIC: movl $-64, [[ECX:%e..]]
+; STATIC: movl $-64, [[EAX:%e..]]
 
-; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl $16, [[ECX]]
+; STATIC: movl %{{.+}}, _state+76([[EAX]])
+; STATIC: addl $16, [[EAX]]
 ; STATIC: jne
 
-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same for PIC mode.
 
-; PIC: cmpl $64
+; PIC: movl $-64, [[EAX:%e..]]
+
+; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl $16, [[EAX]]
+; PIC: jne
 
 @state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/masked-iv-safe.ll
===================================================================
--- test/CodeGen/X86/masked-iv-safe.ll
+++ test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@
 
 ; CHECK-LABEL: count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@
 
 ; CHECK-LABEL: count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@
 
 ; CHECK-LABEL: count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@
 
 ; CHECK-LABEL: another_count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@
 
 ; CHECK-LABEL: another_count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@
 
 ; CHECK-LABEL: another_count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@
 
 ; CHECK-LABEL: another_count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, 64(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, 68(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, 72(
+; TOPDOWN: movl %{{.*}}, 76(
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, 64(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, 68(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, 72(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, 76(
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, 64(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, 68(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, 72(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, 76(
 ; ILPMAX-LABEL: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -162,10 +162,10 @@
 ; Consequently, we should *not* form any chains.
 ;
 ; X64: foldedidx:
-; X64: movzbl -3(
+; X64: movzbl 400(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -277,7 +277,7 @@
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
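
Note (illustrative, not part of the patch): below is a minimal IR sketch, with made-up
function and value names, of the kind of loop the new GenerateCrossUseICmpZero reuse
targets. With ten i32 elements, LSR can express the exit test as an ICmpZero use of
{-40,+,4} and the store address as an Address use of {%a,+,4}, matching the example in
the doc comment above; the new code lets the Address use share the compare's register
as 40 + {%a} + {-40,+,4} instead of keeping a separate induction variable.

define void @icmp_zero_reuse(i32* %a) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  ; Address use: strided i32 stores through %a.
  %addr = getelementptr inbounds i32, i32* %a, i32 %iv
  store i32 0, i32* %addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  ; Exit test that LSR can turn into an ICmpZero use.
  %exitcond = icmp eq i32 %iv.next, 10
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}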