Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -349,6 +349,7 @@
     unsigned ImmCost;
     unsigned SetupCost;
     unsigned ScaleCost;
+    unsigned FoldedAddress;
   };
 
   /// Parameters that control the generic loop unrolling transformation.
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2469,10 +2469,12 @@
 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                TargetTransformInfo::LSRCost &C2) {
   // X86 specific here are "instruction number 1st priority".
-  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+  unsigned C1Insns = C1.Insns + (C1.FoldedAddress >> 3);
+  unsigned C2Insns = C2.Insns + (C2.FoldedAddress >> 3);
+  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost,
                   C1.NumIVMuls, C1.NumBaseAdds,
                   C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
-         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost,
                   C2.NumIVMuls, C2.NumBaseAdds,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -976,6 +976,7 @@
     C.ImmCost = 0;
     C.SetupCost = 0;
     C.ScaleCost = 0;
+    C.FoldedAddress = 0;
   }
 
   bool isLess(Cost &Other, const TargetTransformInfo &TTI);
@@ -986,9 +987,9 @@
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
     return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
-             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+             | C.ImmCost | C.SetupCost | C.ScaleCost | C.FoldedAddress) != ~0u)
       || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
-           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+           & C.ImmCost & C.SetupCost & C.ScaleCost & C.FoldedAddress) == ~0u);
   }
 #endif
 
@@ -1298,6 +1299,9 @@
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
   }
+  if (NumBaseParts > 1 && LU.Kind == LSRUse::Address &&
+      isAMCompletelyFolded(TTI, LU, F))
+    C.FoldedAddress += LU.Fixups.size();
 
   // If we don't count instruction cost exit here.
   if (!InsnsCost) {
@@ -1347,6 +1351,7 @@
   C.ImmCost = ~0u;
   C.SetupCost = ~0u;
   C.ScaleCost = ~0u;
+  C.FoldedAddress = ~0u;
 }
 
 /// Choose the lower cost.
@@ -1376,6 +1381,9 @@
     OS << ", plus " << C.ImmCost << " imm cost";
   if (C.SetupCost != 0)
     OS << ", plus " << C.SetupCost << " setup cost";
+  if (C.FoldedAddress != 0)
+    OS << ", plus " << C.FoldedAddress << " folded address"
+       << (C.FoldedAddress == 1 ? "" : "es");
 }
 
 LLVM_DUMP_METHOD void Cost::dump() const {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, -12(
 ; TOPDOWN-NOT: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, -8(
 ; TOPDOWN-NOT: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, -4(
+; TOPDOWN: movl %{{.*}}, (
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, -12(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, -8(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, -4(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, (
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, -12(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, -8(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, -4(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, (
 ; ILPMAX-LABEL: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
+++ test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-reduce -mtriple=x86_64 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32, float, float* noalias nocapture readonly, float* noalias nocapture) local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[TMP1:%.*]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2:%.*]], i64 56
+; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[TMP3:%.*]], i64 56
+; CHECK-NEXT: br label [[TMP13:%.*]]
+; CHECK: [[LSR_IV12:%.*]] = phi float* [ [[SCEVGEP13:%.*]], [[TMP13]] ], [ [[SCEVGEP11]], [[TMP4:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi float* [ [[SCEVGEP2:%.*]], [[TMP13]] ], [ [[SCEVGEP]], [[TMP4]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[TMP13]] ], [ 4096, [[TMP4]] ]
+; CHECK-NEXT: [[LSR_IV1214:%.*]] = bitcast float* [[LSR_IV12]] to <8 x float>*
+; CHECK-NEXT: [[LSR_IV13:%.*]] = bitcast float* [[LSR_IV1]] to <8 x float>*
+; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP28]], align 4
+; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP27]], align 4
+; CHECK-NEXT: [[SCEVGEP25:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5
+; CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP25]], align 4
+; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP23]], align 4
+; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -7
+; CHECK-NEXT: [[TMP18:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP10]], align 4
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -6
+; CHECK-NEXT: [[TMP19:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP9]], align 4
+; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -5
+; CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP8]], align 4
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -4
+; CHECK-NEXT: [[TMP21:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP7]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP18]], [[TMP6]]
+; CHECK-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[TMP19]], [[TMP8]]
+; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP20]], [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = fmul <8 x float> [[TMP21]], [[TMP12]]
+; CHECK-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[TMP14]], [[TMP22]]
+; CHECK-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[TMP15]], [[TMP23]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[TMP16]], [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[TMP17]], [[TMP25]]
+; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7
+; CHECK-NEXT: store <8 x float> [[TMP26]], <8 x float>* [[SCEVGEP21]], align 4
+; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6
+; CHECK-NEXT: store <8 x float> [[TMP27]], <8 x float>* [[SCEVGEP26]], align 4
+; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5
+; CHECK-NEXT: store <8 x float> [[TMP28]], <8 x float>* [[SCEVGEP24]], align 4
+; CHECK-NEXT: [[SCEVGEP22:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4
+; CHECK-NEXT: store <8 x float> [[TMP29]], <8 x float>* [[SCEVGEP22]], align 4
+; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3
+; CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP20]], align 4
+; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2
+; CHECK-NEXT: [[TMP31:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP19]], align 4
+; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1
+; CHECK-NEXT: [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP17]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV1214]], align 4
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -3
+; CHECK-NEXT: [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP6]], align 4
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -2
+; CHECK-NEXT: [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP5]], align 4
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -1
+; CHECK-NEXT: [[TMP36:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP4]], align 4
+; CHECK-NEXT: [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV13]], align 4
+; CHECK-NEXT: [[TMP38:%.*]] = fmul <8 x float> [[TMP34]], [[TMP6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul <8 x float> [[TMP35]], [[TMP8]]
+; CHECK-NEXT: [[TMP40:%.*]] = fmul <8 x float> [[TMP36]], [[TMP10]]
+; CHECK-NEXT: [[TMP41:%.*]] = fmul <8 x float> [[TMP37]], [[TMP12]]
+; CHECK-NEXT: [[TMP42:%.*]] = fadd <8 x float> [[TMP30]], [[TMP38]]
+; CHECK-NEXT: [[TMP43:%.*]] = fadd <8 x float> [[TMP31]], [[TMP39]]
+; CHECK-NEXT: [[TMP44:%.*]] = fadd <8 x float> [[TMP32]], [[TMP40]]
+; CHECK-NEXT: [[TMP45:%.*]] = fadd <8 x float> [[TMP33]], [[TMP41]]
+; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3
+; CHECK-NEXT: store <8 x float> [[TMP42]], <8 x float>* [[SCEVGEP15]], align 4
+; CHECK-NEXT: [[SCEVGEP18:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2
+; CHECK-NEXT: store <8 x float> [[TMP43]], <8 x float>* [[SCEVGEP18]], align 4
+; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1
+; CHECK-NEXT: store <8 x float> [[TMP44]], <8 x float>* [[SCEVGEP16]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP45]], <8 x float>* [[LSR_IV1214]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -64
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr float, float* [[LSR_IV1]], i64 64
+; CHECK-NEXT: [[SCEVGEP13]] = getelementptr float, float* [[LSR_IV12]], i64 64
+; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP46]], label [[TMP47:%.*]], label [[TMP13]]
+; CHECK: ret void
+;
+  %5 = insertelement <8 x float> undef, float %1, i32 0
+  %6 = shufflevector <8 x float> %5, <8 x float> undef, <8 x i32> zeroinitializer
+  %7 = insertelement <8 x float> undef, float %1, i32 0
+  %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> zeroinitializer
+  %9 = insertelement <8 x float> undef, float %1, i32 0
+  %10 = shufflevector <8 x float> %9, <8 x float> undef, <8 x i32> zeroinitializer
+  %11 = insertelement <8 x float> undef, float %1, i32 0
+  %12 = shufflevector <8 x float> %11, <8 x float> undef, <8 x i32> zeroinitializer
+  br label %13

;