Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -342,6 +342,7 @@
     unsigned ImmCost;
     unsigned SetupCost;
     unsigned ScaleCost;
+    unsigned FoldedStoreAddresses;
   };
 
   /// Parameters that control the generic loop unrolling transformation.
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2525,13 +2525,23 @@
 
 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                TargetTransformInfo::LSRCost &C2) {
-  // X86 specific here are "instruction number 1st priority".
-  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
-                  C1.NumIVMuls, C1.NumBaseAdds,
-                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
-         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
-                  C2.NumIVMuls, C2.NumBaseAdds,
-                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  // The X86-specific part: "stores with a folded address" and
+  // "number of instructions" get the highest priority.
+  // We pay attention to stores with a folded address because such
+  // stores (like "movq %rcx, (%rax, %rbx, 4)") can go only to ports
+  // 2 and 3, while stores with a simple address (like
+  // "movq %rcx, 8(%rax)") can go to ports 2, 3 and 7.
+  // When many such stores are mixed with loads (which always go to
+  // all three ports) there can be significant stalls.
+  // The cost function tries to avoid cases with too many stores
+  // with a folded address by treating every 2 such stores as 1
+  // additional instruction.
+  unsigned C1Insns = ((C1.FoldedStoreAddresses >> 1) + C1.Insns);
+  unsigned C2Insns = ((C2.FoldedStoreAddresses >> 1) + C2.Insns);
+  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
+                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
+                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
 
 bool X86TTIImpl::canMacroFuseCmp() {
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1006,6 +1006,7 @@
     C.ImmCost = 0;
     C.SetupCost = 0;
     C.ScaleCost = 0;
+    C.FoldedStoreAddresses = 0;
   }
 
   bool isLess(Cost &Other, const TargetTransformInfo &TTI);
@@ -1015,10 +1016,12 @@
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
-             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
-      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
-           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls
+             | C.NumBaseAdds | C.ImmCost | C.SetupCost | C.ScaleCost
+             | C.FoldedStoreAddresses) != ~0u)
+      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls
+           & C.NumBaseAdds & C.ImmCost & C.SetupCost & C.ScaleCost
+           & C.FoldedStoreAddresses) == ~0u);
   }
 #endif
 
@@ -1343,6 +1346,10 @@
         !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
      C.NumBaseAdds++;
+
+    if (NumBaseParts > 1 && isa<StoreInst>(Fixup.UserInst)
+        && isAMCompletelyFolded(TTI, LU, F))
+      C.FoldedStoreAddresses += 1;
  }
 
  // If we don't count instruction cost exit here.
@@ -1394,6 +1401,7 @@
   C.ImmCost = std::numeric_limits<unsigned>::max();
   C.SetupCost = std::numeric_limits<unsigned>::max();
   C.ScaleCost = std::numeric_limits<unsigned>::max();
+  C.FoldedStoreAddresses = std::numeric_limits<unsigned>::max();
 }
 
 /// Choose the lower cost.
@@ -1423,6 +1431,9 @@
     OS << ", plus " << C.ImmCost << " imm cost";
   if (C.SetupCost != 0)
     OS << ", plus " << C.SetupCost << " setup cost";
+  if (C.FoldedStoreAddresses != 0)
+    OS << ", plus " << C.FoldedStoreAddresses << " folded store address"
+       << (C.FoldedStoreAddresses == 1 ? "" : "es");
 }
 
 LLVM_DUMP_METHOD void Cost::dump() const {
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,19 +4,16 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.
 
-; STATIC: movl $-64, [[EAX:%e..]]
+; STATIC: movl $-64, [[ECX:%e..]]
 
-; STATIC: movl %{{.+}}, _state+76([[EAX]])
-; STATIC: addl $16, [[EAX]]
+; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
+; STATIC: addl $16, [[ECX]]
 ; STATIC: jne
 
-; The same for PIC mode.
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
 
-; PIC: movl $-64, [[EAX:%e..]]
-
-; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
-; PIC: addl $16, [[EAX]]
-; PIC: jne
+; PIC: dec
 
 @state = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/merge_store.ll
===================================================================
--- test/CodeGen/X86/merge_store.ll
+++ test/CodeGen/X86/merge_store.ll
@@ -4,14 +4,16 @@
 define void @merge_store(i32* nocapture %a) {
 ; CHECK-LABEL: merge_store:
 ; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $12, %rdi
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_1: # %for.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rcx, (%rdi,%rax,4)
-; CHECK-NEXT: movq %rcx, 8(%rdi,%rax,4)
-; CHECK-NEXT: addq $4, %rax
+; CHECK-NEXT: movq %rcx, -12(%rdi)
+; CHECK-NEXT: movq %rcx, -4(%rdi)
+; CHECK-NEXT: addl $4, %eax
+; CHECK-NEXT: addq $16, %rdi
 ; CHECK-NEXT: cmpl $1000, %eax # imm = 0x3E8
 ; CHECK-NEXT: jl .LBB0_1
 ; CHECK-NEXT: # %bb.2: # %for.end
Index: test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
+++ test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-reduce -mtriple=x86_64 -S %s | FileCheck %s
+
+; The test has 8 stores where folding the address (like "128(%rsi,%rdi,4)")
+; would save 1 add instruction.
+; However, folding the store addresses here leads to a regression on x86,
+; because stores with such addresses can go only to 2 ports.
+; The test checks that LSR does not use folded addresses in the stores.
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i32 %arg, float %arg1, float* noalias nocapture readonly %arg2, float* noalias nocapture %arg3) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = insertelement <8 x float> undef, float [[ARG1:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[ARG2:%.*]], i64 56 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[ARG3:%.*]], i64 56 +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb11: +; CHECK-NEXT: [[LSR_IV12:%.*]] = phi float* [ [[SCEVGEP13:%.*]], [[BB11]] ], [ [[SCEVGEP11]], [[BB:%.*]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi float* [ [[SCEVGEP2:%.*]], [[BB11]] ], [ [[SCEVGEP]], [[BB]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB11]] ], [ 4096, [[BB]] ] +; CHECK-NEXT: [[LSR_IV1214:%.*]] = bitcast float* [[LSR_IV12]] to <8 x float>* +; CHECK-NEXT: [[LSR_IV13:%.*]] = bitcast float* [[LSR_IV1]] to <8 x float>* +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP28]], align 4 +; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP27]], align 4 +; CHECK-NEXT: [[SCEVGEP25:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5 +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP25]], align 4 +; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4 +; CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP23]], align 4 +; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -7 +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP10]], align 4 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -6 +; CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP9]], align 4 +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -5 +; CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP8]], align 4 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -4 +; CHECK-NEXT: [[TMP36:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP7]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = fmul <8 x float> [[TMP27]], [[TMP4]] +; CHECK-NEXT: [[TMP38:%.*]] = fmul <8 x float> [[TMP30]], [[TMP6]] +; CHECK-NEXT: [[TMP39:%.*]] = fmul <8 x float> [[TMP33]], [[TMP8]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul <8 x float> [[TMP36]], [[TMP10]] +; CHECK-NEXT: [[TMP41:%.*]] = fadd <8 x float> [[TMP15]], [[TMP37]] +; CHECK-NEXT: 
[[TMP42:%.*]] = fadd <8 x float> [[TMP18]], [[TMP38]] +; CHECK-NEXT: [[TMP43:%.*]] = fadd <8 x float> [[TMP21]], [[TMP39]] +; CHECK-NEXT: [[TMP44:%.*]] = fadd <8 x float> [[TMP24]], [[TMP40]] +; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7 +; CHECK-NEXT: store <8 x float> [[TMP41]], <8 x float>* [[SCEVGEP21]], align 4 +; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6 +; CHECK-NEXT: store <8 x float> [[TMP42]], <8 x float>* [[SCEVGEP26]], align 4 +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5 +; CHECK-NEXT: store <8 x float> [[TMP43]], <8 x float>* [[SCEVGEP24]], align 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4 +; CHECK-NEXT: store <8 x float> [[TMP44]], <8 x float>* [[SCEVGEP22]], align 4 +; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3 +; CHECK-NEXT: [[TMP52:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP20]], align 4 +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2 +; CHECK-NEXT: [[TMP55:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP19]], align 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1 +; CHECK-NEXT: [[TMP58:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP17]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV1214]], align 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -3 +; CHECK-NEXT: [[TMP64:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP6]], align 4 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -2 +; CHECK-NEXT: [[TMP67:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP5]], align 4 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -1 +; CHECK-NEXT: [[TMP70:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP4]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV13]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = fmul <8 x float> [[TMP64]], [[TMP4]] +; CHECK-NEXT: [[TMP75:%.*]] = fmul <8 x float> [[TMP67]], [[TMP6]] +; CHECK-NEXT: [[TMP76:%.*]] = fmul <8 x float> [[TMP70]], [[TMP8]] +; CHECK-NEXT: [[TMP77:%.*]] = fmul <8 x float> [[TMP73]], [[TMP10]] +; CHECK-NEXT: [[TMP78:%.*]] = fadd <8 x float> [[TMP52]], [[TMP74]] +; CHECK-NEXT: [[TMP79:%.*]] = fadd <8 x float> [[TMP55]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = fadd <8 x float> [[TMP58]], [[TMP76]] +; CHECK-NEXT: [[TMP81:%.*]] = fadd <8 x float> [[TMP61]], [[TMP77]] +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3 +; CHECK-NEXT: store <8 x float> [[TMP78]], <8 x float>* [[SCEVGEP15]], align 4 +; CHECK-NEXT: [[SCEVGEP18:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2 +; CHECK-NEXT: store <8 x float> [[TMP79]], <8 x float>* [[SCEVGEP18]], align 4 +; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1 +; CHECK-NEXT: store <8 x float> [[TMP80]], <8 x float>* [[SCEVGEP16]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP81]], <8 x float>* [[LSR_IV1214]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -64 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr float, float* [[LSR_IV1]], i64 64 +; CHECK-NEXT: [[SCEVGEP13]] = getelementptr float, float* [[LSR_IV12]], i64 64 +; 
CHECK-NEXT: [[TMP87:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP87]], label [[BB88:%.*]], label [[BB11]] +; CHECK: bb88: +; CHECK-NEXT: ret void +; +bb: + %tmp = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp4 = shufflevector <8 x float> %tmp, <8 x float> undef, <8 x i32> zeroinitializer + %tmp5 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp6 = shufflevector <8 x float> %tmp5, <8 x float> undef, <8 x i32> zeroinitializer + %tmp7 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp8 = shufflevector <8 x float> %tmp7, <8 x float> undef, <8 x i32> zeroinitializer + %tmp9 = insertelement <8 x float> undef, float %arg1, i32 0 + %tmp10 = shufflevector <8 x float> %tmp9, <8 x float> undef, <8 x i32> zeroinitializer + br label %bb11 + +bb11: ; preds = %bb11, %bb + %tmp12 = phi i64 [ 0, %bb ], [ %tmp86, %bb11 ] + %tmp13 = getelementptr inbounds float, float* %arg3, i64 %tmp12 + %tmp14 = bitcast float* %tmp13 to <8 x float>* + %tmp15 = load <8 x float>, <8 x float>* %tmp14, align 4 + %tmp16 = getelementptr float, float* %tmp13, i64 8 + %tmp17 = bitcast float* %tmp16 to <8 x float>* + %tmp18 = load <8 x float>, <8 x float>* %tmp17, align 4 + %tmp19 = getelementptr float, float* %tmp13, i64 16 + %tmp20 = bitcast float* %tmp19 to <8 x float>* + %tmp21 = load <8 x float>, <8 x float>* %tmp20, align 4 + %tmp22 = getelementptr float, float* %tmp13, i64 24 + %tmp23 = bitcast float* %tmp22 to <8 x float>* + %tmp24 = load <8 x float>, <8 x float>* %tmp23, align 4 + %tmp25 = getelementptr inbounds float, float* %arg2, i64 %tmp12 + %tmp26 = bitcast float* %tmp25 to <8 x float>* + %tmp27 = load <8 x float>, <8 x float>* %tmp26, align 4 + %tmp28 = getelementptr float, float* %tmp25, i64 8 + %tmp29 = bitcast float* %tmp28 to <8 x float>* + %tmp30 = load <8 x float>, <8 x float>* %tmp29, align 4 + %tmp31 = getelementptr float, float* %tmp25, i64 16 + %tmp32 = bitcast float* %tmp31 to <8 x float>* + %tmp33 = load <8 x float>, <8 x float>* %tmp32, align 4 + %tmp34 = getelementptr float, float* %tmp25, i64 24 + %tmp35 = bitcast float* %tmp34 to <8 x float>* + %tmp36 = load <8 x float>, <8 x float>* %tmp35, align 4 + %tmp37 = fmul <8 x float> %tmp27, %tmp4 + %tmp38 = fmul <8 x float> %tmp30, %tmp6 + %tmp39 = fmul <8 x float> %tmp33, %tmp8 + %tmp40 = fmul <8 x float> %tmp36, %tmp10 + %tmp41 = fadd <8 x float> %tmp15, %tmp37 + %tmp42 = fadd <8 x float> %tmp18, %tmp38 + %tmp43 = fadd <8 x float> %tmp21, %tmp39 + %tmp44 = fadd <8 x float> %tmp24, %tmp40 + %tmp45 = bitcast float* %tmp13 to <8 x float>* + store <8 x float> %tmp41, <8 x float>* %tmp45, align 4 + %tmp46 = bitcast float* %tmp16 to <8 x float>* + store <8 x float> %tmp42, <8 x float>* %tmp46, align 4 + %tmp47 = bitcast float* %tmp19 to <8 x float>* + store <8 x float> %tmp43, <8 x float>* %tmp47, align 4 + %tmp48 = bitcast float* %tmp22 to <8 x float>* + store <8 x float> %tmp44, <8 x float>* %tmp48, align 4 + %tmp49 = or i64 %tmp12, 32 + %tmp50 = getelementptr inbounds float, float* %arg3, i64 %tmp49 + %tmp51 = bitcast float* %tmp50 to <8 x float>* + %tmp52 = load <8 x float>, <8 x float>* %tmp51, align 4 + %tmp53 = getelementptr float, float* %tmp50, i64 8 + %tmp54 = bitcast float* %tmp53 to <8 x float>* + %tmp55 = load <8 x float>, <8 x float>* %tmp54, align 4 + %tmp56 = getelementptr float, float* %tmp50, i64 16 + %tmp57 = bitcast float* %tmp56 to <8 x float>* + %tmp58 = load <8 x float>, <8 x float>* %tmp57, align 4 + %tmp59 = getelementptr float, float* %tmp50, i64 24 + %tmp60 = bitcast float* %tmp59 
to <8 x float>* + %tmp61 = load <8 x float>, <8 x float>* %tmp60, align 4 + %tmp62 = getelementptr inbounds float, float* %arg2, i64 %tmp49 + %tmp63 = bitcast float* %tmp62 to <8 x float>* + %tmp64 = load <8 x float>, <8 x float>* %tmp63, align 4 + %tmp65 = getelementptr float, float* %tmp62, i64 8 + %tmp66 = bitcast float* %tmp65 to <8 x float>* + %tmp67 = load <8 x float>, <8 x float>* %tmp66, align 4 + %tmp68 = getelementptr float, float* %tmp62, i64 16 + %tmp69 = bitcast float* %tmp68 to <8 x float>* + %tmp70 = load <8 x float>, <8 x float>* %tmp69, align 4 + %tmp71 = getelementptr float, float* %tmp62, i64 24 + %tmp72 = bitcast float* %tmp71 to <8 x float>* + %tmp73 = load <8 x float>, <8 x float>* %tmp72, align 4 + %tmp74 = fmul <8 x float> %tmp64, %tmp4 + %tmp75 = fmul <8 x float> %tmp67, %tmp6 + %tmp76 = fmul <8 x float> %tmp70, %tmp8 + %tmp77 = fmul <8 x float> %tmp73, %tmp10 + %tmp78 = fadd <8 x float> %tmp52, %tmp74 + %tmp79 = fadd <8 x float> %tmp55, %tmp75 + %tmp80 = fadd <8 x float> %tmp58, %tmp76 + %tmp81 = fadd <8 x float> %tmp61, %tmp77 + %tmp82 = bitcast float* %tmp50 to <8 x float>* + store <8 x float> %tmp78, <8 x float>* %tmp82, align 4 + %tmp83 = bitcast float* %tmp53 to <8 x float>* + store <8 x float> %tmp79, <8 x float>* %tmp83, align 4 + %tmp84 = bitcast float* %tmp56 to <8 x float>* + store <8 x float> %tmp80, <8 x float>* %tmp84, align 4 + %tmp85 = bitcast float* %tmp59 to <8 x float>* + store <8 x float> %tmp81, <8 x float>* %tmp85, align 4 + %tmp86 = add nsw i64 %tmp12, 64 + %tmp87 = icmp eq i64 %tmp86, 4096 + br i1 %tmp87, label %bb88, label %bb11 + +bb88: ; preds = %bb11 + ret void +}
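
Reviewer note, not part of the patch: below is a minimal standalone C++ sketch of the X86 tie-break added in X86TTIImpl::isLSRCostLess, to make the weighting concrete. It assumes only the field set from the patch; the struct name LSRCostSketch, the helper name isLSRCostLessX86, and the example counts are illustrative and not real LLVM code.

#include <tuple>

// Illustrative stand-in for TargetTransformInfo::LSRCost (not the real type).
struct LSRCostSketch {
  unsigned Insns = 0;
  unsigned NumRegs = 0;
  unsigned AddRecCost = 0;
  unsigned NumIVMuls = 0;
  unsigned NumBaseAdds = 0;
  unsigned ScaleCost = 0;
  unsigned ImmCost = 0;
  unsigned SetupCost = 0;
  unsigned FoldedStoreAddresses = 0;
};

// Mirrors the patched comparison: every 2 stores with a completely folded
// address are charged as 1 extra instruction, then the usual lexicographic
// comparison runs.
static bool isLSRCostLessX86(const LSRCostSketch &C1, const LSRCostSketch &C2) {
  unsigned C1Insns = (C1.FoldedStoreAddresses >> 1) + C1.Insns;
  unsigned C2Insns = (C2.FoldedStoreAddresses >> 1) + C2.Insns;
  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  // Hypothetical solutions: A folds 8 store addresses and needs 10
  // instructions (effective 10 + 8/2 = 14); B keeps simple store addresses
  // and needs 11 instructions (effective 11).
  LSRCostSketch A, B;
  A.Insns = 10;
  A.FoldedStoreAddresses = 8;
  B.Insns = 11;
  return isLSRCostLessX86(B, A) ? 0 : 1; // returns 0: B wins despite the extra add
}

With these hypothetical numbers the heuristic accepts one extra add in exchange for keeping the stores on simple addresses, which is the behavior the folded_addresses.ll test above checks for.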