Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -342,6 +342,7 @@
     unsigned ImmCost;
     unsigned SetupCost;
     unsigned ScaleCost;
+    unsigned FoldedStoreAddresses;
   };
 
   /// Parameters that control the generic loop unrolling transformation.
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2525,13 +2525,23 @@
 
 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                TargetTransformInfo::LSRCost &C2) {
-  // X86 specific here are "instruction number 1st priority".
-  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
-                  C1.NumIVMuls, C1.NumBaseAdds,
-                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
-         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
-                  C2.NumIVMuls, C2.NumBaseAdds,
-                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  // The X86-specific part: "stores with a folded address" and
+  // "number of instructions" get the highest priority.
+  // We pay attention to stores with a folded address because such
+  // stores (like "movq %rcx, (%rax, %rbx, 4)") can go only to ports
+  // 2 and 3, while stores with a simple address (like
+  // "movq %rcx, 8(%rax)") can go to ports 2, 3 and 7.
+  // When many such stores are mixed with loads (which always go to
+  // all three ports) there can be significant stalls.
+  // The cost function tries to avoid cases with too many stores
+  // with a folded address by treating every 2 such stores as 1
+  // additional instruction.
+  unsigned C1Insns = ((C1.FoldedStoreAddresses >> 1) + C1.Insns);
+  unsigned C2Insns = ((C2.FoldedStoreAddresses >> 1) + C2.Insns);
+  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
+                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
+                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
 
 bool X86TTIImpl::canMacroFuseCmp() {
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1006,6 +1006,7 @@
     C.ImmCost = 0;
     C.SetupCost = 0;
     C.ScaleCost = 0;
+    C.FoldedStoreAddresses = 0;
   }
 
   bool isLess(Cost &Other, const TargetTransformInfo &TTI);
@@ -1015,10 +1016,12 @@
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
-             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
-      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
-           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls
+             | C.NumBaseAdds | C.ImmCost | C.SetupCost | C.ScaleCost
+             | C.FoldedStoreAddresses) != ~0u)
+      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls
+           & C.NumBaseAdds & C.ImmCost & C.SetupCost & C.ScaleCost
+           & C.FoldedStoreAddresses) == ~0u);
   }
 #endif
 
@@ -1343,6 +1346,10 @@
         !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
      C.NumBaseAdds++;
+
+    if (NumBaseParts > 1 && isa<StoreInst>(Fixup.UserInst)
+        && isAMCompletelyFolded(TTI, LU, F))
+      C.FoldedStoreAddresses += 1;
  }
 
  // If we don't count instruction cost exit here.
@@ -1394,6 +1401,7 @@
   C.ImmCost = std::numeric_limits<unsigned>::max();
   C.SetupCost = std::numeric_limits<unsigned>::max();
   C.ScaleCost = std::numeric_limits<unsigned>::max();
+  C.FoldedStoreAddresses = std::numeric_limits<unsigned>::max();
 }
 
 /// Choose the lower cost.
@@ -1423,6 +1431,9 @@
     OS << ", plus " << C.ImmCost << " imm cost";
   if (C.SetupCost != 0)
     OS << ", plus " << C.SetupCost << " setup cost";
+  if (C.FoldedStoreAddresses != 0)
+    OS << ", plus " << C.FoldedStoreAddresses << " folded store address"
+       << (C.FoldedStoreAddresses == 1 ? "" : "es");
 }
 
 LLVM_DUMP_METHOD void Cost::dump() const {
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,19 +4,16 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.
 
-; STATIC: movl $-64, [[EAX:%e..]]
+; STATIC: movl $-64, [[ECX:%e..]]
 
-; STATIC: movl %{{.+}}, _state+76([[EAX]])
-; STATIC: addl $16, [[EAX]]
+; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
+; STATIC: addl $16, [[ECX]]
 ; STATIC: jne
 
-; The same for PIC mode.
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
 
-; PIC: movl $-64, [[EAX:%e..]]
-
-; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
-; PIC: addl $16, [[EAX]]
-; PIC: jne
+; PIC: dec
 
 @state = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/merge_store.ll
===================================================================
--- test/CodeGen/X86/merge_store.ll
+++ test/CodeGen/X86/merge_store.ll
@@ -4,14 +4,16 @@
 define void @merge_store(i32* nocapture %a) {
 ; CHECK-LABEL: merge_store:
 ; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $12, %rdi
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_1: # %for.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rcx, (%rdi,%rax,4)
-; CHECK-NEXT: movq %rcx, 8(%rdi,%rax,4)
-; CHECK-NEXT: addq $4, %rax
+; CHECK-NEXT: movq %rcx, -12(%rdi)
+; CHECK-NEXT: movq %rcx, -4(%rdi)
+; CHECK-NEXT: addl $4, %eax
+; CHECK-NEXT: addq $16, %rdi
 ; CHECK-NEXT: cmpl $1000, %eax # imm = 0x3E8
 ; CHECK-NEXT: jl .LBB0_1
 ; CHECK-NEXT: # %bb.2: # %for.end
Index: test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
+++ test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-reduce -mtriple=x86_64 -S %s | FileCheck %s
+
+; The test has 8 stores where folding the address (like "128(%rsi,%rdi,4)")
+; would save 1 add instruction.
+; However, folding the store addresses here leads to a regression on x86,
+; because stores with such addresses can go only to 2 ports.
+; The test checks that LSR does not use folded addresses in the stores.
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i32 %arg, float %arg1, float* noalias nocapture readonly %arg2, float* noalias nocapture %arg3) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = insertelement <8 x float> undef, float [[ARG1:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[ARG2:%.*]], i64 56 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[ARG3:%.*]], i64 56 +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb11: +; CHECK-NEXT: [[LSR_IV12:%.*]] = phi float* [ [[SCEVGEP13:%.*]], [[BB11]] ], [ [[SCEVGEP11]], [[BB:%.*]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi float* [ [[SCEVGEP2:%.*]], [[BB11]] ], [ [[SCEVGEP]], [[BB]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB11]] ], [ 4096, [[BB]] ] +; CHECK-NEXT: [[LSR_IV1214:%.*]] = bitcast float* [[LSR_IV12]] to <8 x float>* +; CHECK-NEXT: [[LSR_IV13:%.*]] = bitcast float* [[LSR_IV1]] to <8 x float>* +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP28]], align 4 +; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP27]], align 4 +; CHECK-NEXT: [[SCEVGEP25:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5 +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP25]], align 4 +; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4 +; CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP23]], align 4 +; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -7 +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP10]], align 4 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -6 +; CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP9]], align 4 +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -5 +; CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP8]], align 4 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -4 +; CHECK-NEXT: [[TMP36:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP7]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = fmul <8 x float> [[TMP27]], [[TMP4]] +; CHECK-NEXT: [[TMP38:%.*]] = fmul <8 x float> [[TMP30]], [[TMP6]] +; CHECK-NEXT: [[TMP39:%.*]] = fmul <8 x float> [[TMP33]], [[TMP8]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul <8 x float> [[TMP36]], [[TMP10]] +; CHECK-NEXT: [[TMP41:%.*]] = fadd <8 x float> [[TMP15]], [[TMP37]] +; CHECK-NEXT: 
[[TMP42:%.*]] = fadd <8 x float> [[TMP18]], [[TMP38]] +; CHECK-NEXT: [[TMP43:%.*]] = fadd <8 x float> [[TMP21]], [[TMP39]] +; CHECK-NEXT: [[TMP44:%.*]] = fadd <8 x float> [[TMP24]], [[TMP40]] +; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7 +; CHECK-NEXT: store <8 x float> [[TMP41]], <8 x float>* [[SCEVGEP21]], align 4 +; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6 +; CHECK-NEXT: store <8 x float> [[TMP42]], <8 x float>* [[SCEVGEP26]], align 4 +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5 +; CHECK-NEXT: store <8 x float> [[TMP43]], <8 x float>* [[SCEVGEP24]], align 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4 +; CHECK-NEXT: store <8 x float> [[TMP44]], <8 x float>* [[SCEVGEP22]], align 4 +; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3 +; CHECK-NEXT: [[TMP52:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP20]], align 4 +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2 +; CHECK-NEXT: [[TMP55:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP19]], align 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1 +; CHECK-NEXT: [[TMP58:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP17]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV1214]], align 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -3 +; CHECK-NEXT: [[TMP64:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP6]], align 4 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -2 +; CHECK-NEXT: [[TMP67:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP5]], align 4 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -1 +; CHECK-NEXT: [[TMP70:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP4]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV13]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = fmul <8 x float> [[TMP64]], [[TMP4]] +; CHECK-NEXT: [[TMP75:%.*]] = fmul <8 x float> [[TMP67]], [[TMP6]] +; CHECK-NEXT: [[TMP76:%.*]] = fmul <8 x float> [[TMP70]], [[TMP8]] +; CHECK-NEXT: [[TMP77:%.*]] = fmul <8 x float> [[TMP73]], [[TMP10]] +; CHECK-NEXT: [[TMP78:%.*]] = fadd <8 x float> [[TMP52]], [[TMP74]] +; CHECK-NEXT: [[TMP79:%.*]] = fadd <8 x float> [[TMP55]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = fadd <8 x float> [[TMP58]], [[TMP76]] +; CHECK-NEXT: [[TMP81:%.*]] = fadd <8 x float> [[TMP61]], [[TMP77]] +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3 +; CHECK-NEXT: store <8 x float> [[TMP78]], <8 x float>* [[SCEVGEP15]], align 4 +; CHECK-NEXT: [[SCEVGEP18:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2 +; CHECK-NEXT: store <8 x float> [[TMP79]], <8 x float>* [[SCEVGEP18]], align 4 +; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1 +; CHECK-NEXT: store <8 x float> [[TMP80]], <8 x float>* [[SCEVGEP16]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP81]], <8 x float>* [[LSR_IV1214]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -64 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr float, float* [[LSR_IV1]], i64 64 +; CHECK-NEXT: [[SCEVGEP13]] = getelementptr float, float* [[LSR_IV12]], i64 64 +; 
CHECK-NEXT: [[TMP87:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP87]], label [[BB88:%.*]], label [[BB11]] +; CHECK: bb88: +; CHECK-NEXT: ret void +; +bb: + %tmp = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp4 = shufflevector <8 x float> %tmp, <8 x float> undef, <8 x i32> zeroinitializer + %tmp5 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp6 = shufflevector <8 x float> %tmp5, <8 x float> undef, <8 x i32> zeroinitializer + %tmp7 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp8 = shufflevector <8 x float> %tmp7, <8 x float> undef, <8 x i32> zeroinitializer + %tmp9 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp10 = shufflevector <8 x float> %tmp9, <8 x float> undef, <8 x i32> zeroinitializer + br label %bb11 + +bb11: ; preds = %bb11, %bb + %tmp12 = phi i64 [ 0, %bb ], [ %tmp86, %bb11 ] + %tmp13 = getelementptr inbounds float, float* %arg3, i64 %tmp12 + %tmp14 = bitcast float* %tmp13 to <8 x float>* + %tmp15 = load <8 x float>, <8 x float>* %tmp14, align 4 + %tmp16 = getelementptr float, float* %tmp13, i64 8 + %tmp17 = bitcast float* %tmp16 to <8 x float>* + %tmp18 = load <8 x float>, <8 x float>* %tmp17, align 4 + %tmp19 = getelementptr float, float* %tmp13, i64 16 + %tmp20 = bitcast float* %tmp19 to <8 x float>* + %tmp21 = load <8 x float>, <8 x float>* %tmp20, align 4 + %tmp22 = getelementptr float, float* %tmp13, i64 24 + %tmp23 = bitcast float* %tmp22 to <8 x float>* + %tmp24 = load <8 x float>, <8 x float>* %tmp23, align 4 + %tmp25 = getelementptr inbounds float, float* %arg2, i64 %tmp12 + %tmp26 = bitcast float* %tmp25 to <8 x float>* + %tmp27 = load <8 x float>, <8 x float>* %tmp26, align 4 + %tmp28 = getelementptr float, float* %tmp25, i64 8 + %tmp29 = bitcast float* %tmp28 to <8 x float>* + %tmp30 = load <8 x float>, <8 x float>* %tmp29, align 4 + %tmp31 = getelementptr float, float* %tmp25, i64 16 + %tmp32 = bitcast float* %tmp31 to <8 x float>* + %tmp33 = load <8 x float>, <8 x float>* %tmp32, align 4 + %tmp34 = getelementptr float, float* %tmp25, i64 24 + %tmp35 = bitcast float* %tmp34 to <8 x float>* + %tmp36 = load <8 x float>, <8 x float>* %tmp35, align 4 + %tmp37 = fmul <8 x float> %tmp27, %tmp4 + %tmp38 = fmul <8 x float> %tmp30, %tmp6 + %tmp39 = fmul <8 x float> %tmp33, %tmp8 + %tmp40 = fmul <8 x float> %tmp36, %tmp10 + %tmp41 = fadd <8 x float> %tmp15, %tmp37 + %tmp42 = fadd <8 x float> %tmp18, %tmp38 + %tmp43 = fadd <8 x float> %tmp21, %tmp39 + %tmp44 = fadd <8 x float> %tmp24, %tmp40 + %tmp45 = bitcast float* %tmp13 to <8 x float>* + store <8 x float> %tmp41, <8 x float>* %tmp45, align 4 + %tmp46 = bitcast float* %tmp16 to <8 x float>* + store <8 x float> %tmp42, <8 x float>* %tmp46, align 4 + %tmp47 = bitcast float* %tmp19 to <8 x float>* + store <8 x float> %tmp43, <8 x float>* %tmp47, align 4 + %tmp48 = bitcast float* %tmp22 to <8 x float>* + store <8 x float> %tmp44, <8 x float>* %tmp48, align 4 + %tmp49 = or i64 %tmp12, 32 + %tmp50 = getelementptr inbounds float, float* %arg3, i64 %tmp49 + %tmp51 = bitcast float* %tmp50 to <8 x float>* + %tmp52 = load <8 x float>, <8 x float>* %tmp51, align 4 + %tmp53 = getelementptr float, float* %tmp50, i64 8 + %tmp54 = bitcast float* %tmp53 to <8 x float>* + %tmp55 = load <8 x float>, <8 x float>* %tmp54, align 4 + %tmp56 = getelementptr float, float* %tmp50, i64 16 + %tmp57 = bitcast float* %tmp56 to <8 x float>* + %tmp58 = load <8 x float>, <8 x float>* %tmp57, align 4 + %tmp59 = getelementptr float, float* %tmp50, i64 24 + %tmp60 = bitcast float* %tmp59 
to <8 x float>* + %tmp61 = load <8 x float>, <8 x float>* %tmp60, align 4 + %tmp62 = getelementptr inbounds float, float* %arg2, i64 %tmp49 + %tmp63 = bitcast float* %tmp62 to <8 x float>* + %tmp64 = load <8 x float>, <8 x float>* %tmp63, align 4 + %tmp65 = getelementptr float, float* %tmp62, i64 8 + %tmp66 = bitcast float* %tmp65 to <8 x float>* + %tmp67 = load <8 x float>, <8 x float>* %tmp66, align 4 + %tmp68 = getelementptr float, float* %tmp62, i64 16 + %tmp69 = bitcast float* %tmp68 to <8 x float>* + %tmp70 = load <8 x float>, <8 x float>* %tmp69, align 4 + %tmp71 = getelementptr float, float* %tmp62, i64 24 + %tmp72 = bitcast float* %tmp71 to <8 x float>* + %tmp73 = load <8 x float>, <8 x float>* %tmp72, align 4 + %tmp74 = fmul <8 x float> %tmp64, %tmp4 + %tmp75 = fmul <8 x float> %tmp67, %tmp6 + %tmp76 = fmul <8 x float> %tmp70, %tmp8 + %tmp77 = fmul <8 x float> %tmp73, %tmp10 + %tmp78 = fadd <8 x float> %tmp52, %tmp74 + %tmp79 = fadd <8 x float> %tmp55, %tmp75 + %tmp80 = fadd <8 x float> %tmp58, %tmp76 + %tmp81 = fadd <8 x float> %tmp61, %tmp77 + %tmp82 = bitcast float* %tmp50 to <8 x float>* + store <8 x float> %tmp78, <8 x float>* %tmp82, align 4 + %tmp83 = bitcast float* %tmp53 to <8 x float>* + store <8 x float> %tmp79, <8 x float>* %tmp83, align 4 + %tmp84 = bitcast float* %tmp56 to <8 x float>* + store <8 x float> %tmp80, <8 x float>* %tmp84, align 4 + %tmp85 = bitcast float* %tmp59 to <8 x float>* + store <8 x float> %tmp81, <8 x float>* %tmp85, align 4 + %tmp86 = add nsw i64 %tmp12, 64 + %tmp87 = icmp eq i64 %tmp86, 4096 + br i1 %tmp87, label %bb88, label %bb11 + +bb88: ; preds = %bb11 + ret void +}
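
Reviewer note, not part of the patch: below is a minimal standalone C++ sketch of the X86 tie-break added in X86TTIImpl::isLSRCostLess, to make the weighting concrete. It assumes only the field set from the patch; the struct name LSRCostSketch, the helper name isLSRCostLessX86, and the example counts are illustrative and not real LLVM code.

#include <tuple>

// Illustrative stand-in for TargetTransformInfo::LSRCost (not the real type).
struct LSRCostSketch {
  unsigned Insns = 0;
  unsigned NumRegs = 0;
  unsigned AddRecCost = 0;
  unsigned NumIVMuls = 0;
  unsigned NumBaseAdds = 0;
  unsigned ScaleCost = 0;
  unsigned ImmCost = 0;
  unsigned SetupCost = 0;
  unsigned FoldedStoreAddresses = 0;
};

// Mirrors the patched comparison: every 2 stores with a completely folded
// address are charged as 1 extra instruction, then the usual lexicographic
// comparison runs.
static bool isLSRCostLessX86(const LSRCostSketch &C1, const LSRCostSketch &C2) {
  unsigned C1Insns = (C1.FoldedStoreAddresses >> 1) + C1.Insns;
  unsigned C2Insns = (C2.FoldedStoreAddresses >> 1) + C2.Insns;
  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  // Hypothetical solutions: A folds 8 store addresses and needs 10
  // instructions (effective 10 + 8/2 = 14); B keeps simple store addresses
  // and needs 11 instructions (effective 11).
  LSRCostSketch A, B;
  A.Insns = 10;
  A.FoldedStoreAddresses = 8;
  B.Insns = 11;
  return isLSRCostLessX86(B, A) ? 0 : 1; // returns 0: B wins despite the extra add
}

With these hypothetical numbers the heuristic accepts one extra add in exchange for keeping the stores on simple addresses, which is the behavior the folded_addresses.ll test above checks for.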