Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -471,6 +471,11 @@
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2) const;
 
+  /// Return true if the target can macro-fuse a compare with a branch.
+  /// Loop-strength-reduction (LSR) uses this to skip counting the extra
+  /// compare instruction when it costs the instructions in a loop.
+  bool canMacroFuseCmp() const;
+
   /// \brief Return true if the target supports masked load/store
   /// AVX2 and AVX-512 targets allow masks for consecutive load and store
   bool isLegalMaskedStore(Type *DataType) const;
@@ -974,6 +979,7 @@
                                      Instruction *I) = 0;
   virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                              TargetTransformInfo::LSRCost &C2) = 0;
+  virtual bool canMacroFuseCmp() = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -1192,6 +1198,9 @@
                      TargetTransformInfo::LSRCost &C2) override {
     return Impl.isLSRCostLess(C1, C2);
   }
+  bool canMacroFuseCmp() override {
+    return Impl.canMacroFuseCmp(); // Pure forwarder to Impl.
+  }
   bool isLegalMaskedStore(Type *DataType) override {
     return Impl.isLegalMaskedStore(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -246,6 +246,8 @@
                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
   }
 
+  bool canMacroFuseCmp() { return false; } // Default: no cmp/branch fusion.
+
   bool isLegalMaskedStore(Type *DataType) { return false; }
 
   bool isLegalMaskedLoad(Type *DataType) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -155,6 +155,10 @@
   return TTIImpl->isLSRCostLess(C1, C2);
 }
 
+bool TargetTransformInfo::canMacroFuseCmp() const {
+  return TTIImpl->canMacroFuseCmp(); // Delegate to the held TTIImpl.
+}
+
 bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedStore(DataType);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -120,6 +120,7 @@
                     Type *Ty);
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2);
+  bool canMacroFuseCmp();
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2482,6 +2482,10 @@
                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
 
+bool X86TTIImpl::canMacroFuseCmp() {
+  return ST->hasMacroFusion(); // Reports ST's macro-fusion capability.
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   // The backend can't handle a single element vector.
   if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1343,14 +1343,15 @@
 
   // If ICmpZero formula ends with not 0, it could not be replaced by
   // just add or sub. We'll need to compare final result of AddRec.
-  // That means we'll need an additional instruction.
+  // That means we'll need an additional instruction. But if the target can
+  // macro-fuse a compare with a branch, don't count this extra instruction.
   // For -10 + {0, +, 1}:
   // i = i + 1;
   // cmp i, 10
   //
   // For {-10, +, 1}:
   // i = i + 1;
-  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
     C.Insns++;
   // Each new AddRec adds 1 instruction to calculation.
   C.Insns += (C.AddRecCost - PrevAddRecCost);
Index: test/CodeGen/X86/rdrand.ll
===================================================================
--- test/CodeGen/X86/rdrand.ll
+++ test/CodeGen/X86/rdrand.ll
@@ -82,35 +82,41 @@
 define void @loop(i32* %p, i32 %n) nounwind {
 ; X86-LABEL: loop:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    je .LBB3_3
 ; X86-NEXT:  # %bb.1: # %while.body.preheader
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB3_2: # %while.body
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    rdrandl %edx
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    leal 4(%ecx), %ecx
-; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    rdrandl %esi
+; X86-NEXT:    movl %esi, (%ecx,%edx,4)
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    cmpl %edx, %eax
 ; X86-NEXT:    jne .LBB3_2
 ; X86-NEXT:  .LBB3_3: # %while.end
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: loop:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB3_2
+; X64-NEXT:    je .LBB3_3
+; X64-NEXT:  # %bb.1: # %while.body.preheader
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_1: # %while.body
+; X64-NEXT:  .LBB3_2: # %while.body
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    rdrandl %eax
-; X64-NEXT:    movl %eax, (%rdi)
-; X64-NEXT:    leaq 4(%rdi), %rdi
-; X64-NEXT:    addl $-1, %esi
-; X64-NEXT:    jne .LBB3_1
-; X64-NEXT:  .LBB3_2: # %while.end
+; X64-NEXT:    rdrandl %edx
+; X64-NEXT:    movl %edx, (%rdi,%rcx,4)
+; X64-NEXT:    addq $1, %rcx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    jne .LBB3_2
+; X64-NEXT:  .LBB3_3: # %while.end
 ; X64-NEXT:    retq
 entry:
   %tobool1 = icmp eq i32 %n, 0
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -347,30 +347,31 @@
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl $-400, %eax # imm = 0xFE70
+; X32-NEXT:    movl $3, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_1: # %for.body
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movzbl 400(%esi,%eax), %edi
-; X32-NEXT:    movzbl 400(%edx,%eax), %ebx
+; X32-NEXT:    movzbl -3(%esi,%eax), %edi
+; X32-NEXT:    movzbl -3(%edx,%eax), %ebx
 ; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    movb %bl, 400(%ecx,%eax)
-; X32-NEXT:    movzbl 401(%esi,%eax), %edi
-; X32-NEXT:    movzbl 401(%edx,%eax), %ebx
+; X32-NEXT:    movb %bl, -3(%ecx,%eax)
+; X32-NEXT:    movzbl -2(%esi,%eax), %edi
+; X32-NEXT:    movzbl -2(%edx,%eax), %ebx
 ; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    movb %bl, 401(%ecx,%eax)
-; X32-NEXT:    movzbl 402(%esi,%eax), %edi
-; X32-NEXT:    movzbl 402(%edx,%eax), %ebx
+; X32-NEXT:    movb %bl, -2(%ecx,%eax)
+; X32-NEXT:    movzbl -1(%esi,%eax), %edi
+; X32-NEXT:    movzbl -1(%edx,%eax), %ebx
 ; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    movb %bl, 402(%ecx,%eax)
-; X32-NEXT:    movzbl 403(%esi,%eax), %edi
-; X32-NEXT:    movzbl 403(%edx,%eax), %ebx
+; X32-NEXT:    movb %bl, -1(%ecx,%eax)
+; X32-NEXT:    movzbl (%esi,%eax), %edi
+; X32-NEXT:    movzbl (%edx,%eax), %ebx
 ; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    movb %bl, 403(%ecx,%eax)
+; X32-NEXT:    movb %bl, (%ecx,%eax)
 ; X32-NEXT:    addl $4, %eax
+; X32-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:  # %bb.2: # %for.end
 ; X32-NEXT:    popl %esi
Index: test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
+++ test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown                    | FileCheck %s --check-prefix=BASE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
+
+; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
+; If a CPU can macro-fuse a compare and branch, then we discount that
+; cost in LSR and avoid generating large offsets in each memory access.
+; This reduces code size and may improve decode throughput.
+
+define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) {
+; BASE-LABEL: maxArray:
+; BASE:       # %bb.0: # %entry
+; BASE-NEXT:    movq $-524288, %rax # imm = 0xFFF80000
+; BASE-NEXT:    .p2align 4, 0x90
+; BASE-NEXT:  .LBB0_1: # %vector.body
+; BASE-NEXT:    # =>This Inner Loop Header: Depth=1
+; BASE-NEXT:    movupd 524288(%rsi,%rax), %xmm0
+; BASE-NEXT:    movupd 524288(%rdi,%rax), %xmm1
+; BASE-NEXT:    maxpd %xmm1, %xmm0
+; BASE-NEXT:    movupd %xmm0, 524288(%rdi,%rax)
+; BASE-NEXT:    addq $16, %rax
+; BASE-NEXT:    jne .LBB0_1
+; BASE-NEXT:  # %bb.2: # %for.cond.cleanup
+; BASE-NEXT:    retq
+;
+; FUSE-LABEL: maxArray:
+; FUSE:       # %bb.0: # %entry
+; FUSE-NEXT:    xorl %eax, %eax
+; FUSE-NEXT:    .p2align 4, 0x90
+; FUSE-NEXT:  .LBB0_1: # %vector.body
+; FUSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; FUSE-NEXT:    movupd (%rsi,%rax,8), %xmm0
+; FUSE-NEXT:    movupd (%rdi,%rax,8), %xmm1
+; FUSE-NEXT:    maxpd %xmm1, %xmm0
+; FUSE-NEXT:    movupd %xmm0, (%rdi,%rax,8)
+; FUSE-NEXT:    addq $2, %rax
+; FUSE-NEXT:    cmpq $65536, %rax # imm = 0x10000
+; FUSE-NEXT:    jne .LBB0_1
+; FUSE-NEXT:  # %bb.2: # %for.cond.cleanup
+; FUSE-NEXT:    retq
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds double, double* %y, i64 %index
+  %1 = bitcast double* %0 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %1, align 8
+  %2 = getelementptr inbounds double, double* %x, i64 %index
+  %3 = bitcast double* %2 to <2 x double>*
+  %wide.load21 = load <2 x double>, <2 x double>* %3, align 8
+  %4 = fcmp ogt <2 x double> %wide.load, %wide.load21
+  %5 = select <2 x i1> %4, <2 x double> %wide.load, <2 x double> %wide.load21
+  %6 = bitcast double* %2 to <2 x double>*
+  store <2 x double> %5, <2 x double>* %6, align 8
+  %index.next = add i64 %index, 2
+  %7 = icmp eq i64 %index.next, 65536
+  br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+