diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -81,10 +81,8 @@ using namespace llvm; -// By default, we limit this to creating 16 common bases out of loops per -// function. 16 is a little over half of the allocatable register set. static cl::opt MaxVarsPrep("ppc-formprep-max-vars", - cl::Hidden, cl::init(16), + cl::Hidden, cl::init(24), cl::desc("Potential common base number threshold per function for PPC loop " "prep")); @@ -94,8 +92,7 @@ // Sum of following 3 per loop thresholds for all loops can not be larger // than MaxVarsPrep. -// By default, we limit this to creating 9 PHIs for one loop. -// 9 and 3 for each kind prep are exterimental values on Power9. +// The thresholds for each kind of prep are experimental values on Power9. static cl::opt MaxVarsUpdateForm("ppc-preinc-prep-max-vars", cl::Hidden, cl::init(3), cl::desc("Potential PHI threshold per loop for PPC loop prep of update " @@ -106,7 +103,7 @@ cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form")); static cl::opt MaxVarsDQForm("ppc-dqprep-max-vars", - cl::Hidden, cl::init(3), + cl::Hidden, cl::init(8), cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form")); diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -13,28 +13,24 @@ ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: stdu 1, -448(1) ; CHECK-NEXT: .cfi_def_cfa_offset 448 -; CHECK-NEXT: .cfi_offset r14, -288 -; CHECK-NEXT: .cfi_offset r15, -280 -; CHECK-NEXT: .cfi_offset r16, -272 -; CHECK-NEXT: .cfi_offset r17, -264 -; CHECK-NEXT: .cfi_offset r18, -256 -; CHECK-NEXT: .cfi_offset r19, -248 -; CHECK-NEXT: .cfi_offset r20, -240 -; CHECK-NEXT: .cfi_offset r21, 
-232 -; CHECK-NEXT: .cfi_offset r22, -224 -; CHECK-NEXT: .cfi_offset r23, -216 -; CHECK-NEXT: .cfi_offset r24, -208 -; CHECK-NEXT: .cfi_offset r25, -200 -; CHECK-NEXT: .cfi_offset r26, -192 -; CHECK-NEXT: .cfi_offset r27, -184 -; CHECK-NEXT: .cfi_offset r28, -176 -; CHECK-NEXT: .cfi_offset r29, -168 -; CHECK-NEXT: .cfi_offset r30, -160 -; CHECK-NEXT: .cfi_offset r31, -152 -; CHECK-NEXT: .cfi_offset f14, -144 -; CHECK-NEXT: .cfi_offset f15, -136 -; CHECK-NEXT: .cfi_offset f16, -128 -; CHECK-NEXT: .cfi_offset f17, -120 +; CHECK-NEXT: .cfi_offset r14, -256 +; CHECK-NEXT: .cfi_offset r15, -248 +; CHECK-NEXT: .cfi_offset r16, -240 +; CHECK-NEXT: .cfi_offset r17, -232 +; CHECK-NEXT: .cfi_offset r18, -224 +; CHECK-NEXT: .cfi_offset r19, -216 +; CHECK-NEXT: .cfi_offset r20, -208 +; CHECK-NEXT: .cfi_offset r21, -200 +; CHECK-NEXT: .cfi_offset r22, -192 +; CHECK-NEXT: .cfi_offset r23, -184 +; CHECK-NEXT: .cfi_offset r24, -176 +; CHECK-NEXT: .cfi_offset r25, -168 +; CHECK-NEXT: .cfi_offset r26, -160 +; CHECK-NEXT: .cfi_offset r27, -152 +; CHECK-NEXT: .cfi_offset r28, -144 +; CHECK-NEXT: .cfi_offset r29, -136 +; CHECK-NEXT: .cfi_offset r30, -128 +; CHECK-NEXT: .cfi_offset r31, -120 ; CHECK-NEXT: .cfi_offset f18, -112 ; CHECK-NEXT: .cfi_offset f19, -104 ; CHECK-NEXT: .cfi_offset f20, -96 @@ -50,29 +46,25 @@ ; CHECK-NEXT: .cfi_offset f30, -16 ; CHECK-NEXT: .cfi_offset f31, -8 ; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: std 14, 160(1) # 8-byte Folded Spill -; CHECK-NEXT: std 15, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: std 14, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 200(1) # 8-byte Folded Spill ; CHECK-NEXT: cmpwi 4, 1 -; CHECK-NEXT: std 16, 176(1) # 8-byte Folded Spill -; CHECK-NEXT: std 17, 184(1) # 8-byte Folded Spill -; CHECK-NEXT: std 18, 192(1) # 8-byte Folded Spill -; CHECK-NEXT: std 19, 200(1) # 8-byte Folded Spill -; CHECK-NEXT: std 20, 208(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 216(1) # 8-byte Folded Spill -; CHECK-NEXT: std 22, 224(1) # 8-byte 
Folded Spill -; CHECK-NEXT: std 23, 232(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 240(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 248(1) # 8-byte Folded Spill -; CHECK-NEXT: std 26, 256(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 264(1) # 8-byte Folded Spill -; CHECK-NEXT: std 28, 272(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 280(1) # 8-byte Folded Spill -; CHECK-NEXT: std 30, 288(1) # 8-byte Folded Spill -; CHECK-NEXT: std 31, 296(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 14, 304(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 15, 312(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 16, 320(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 17, 328(1) # 8-byte Folded Spill +; CHECK-NEXT: std 16, 208(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 224(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 232(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 240(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 248(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 256(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 264(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 272(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 280(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 288(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 296(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 304(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 312(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 320(1) # 8-byte Folded Spill +; CHECK-NEXT: std 31, 328(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 18, 336(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 19, 344(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 20, 352(1) # 8-byte Folded Spill @@ -94,235 +86,259 @@ ; CHECK-NEXT: blt 0, .LBB0_7 ; CHECK-NEXT: # %bb.2: # %_loop_1_do_.preheader ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: mr 28, 5 +; CHECK-NEXT: mr 24, 5 ; CHECK-NEXT: li 5, 9 -; CHECK-NEXT: lwa 19, 0(7) -; CHECK-NEXT: ld 29, 648(1) -; CHECK-NEXT: ld 12, 672(1) -; CHECK-NEXT: mr 23, 6 +; 
CHECK-NEXT: mr 11, 7 +; CHECK-NEXT: ld 12, 640(1) +; CHECK-NEXT: std 9, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 10, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 7, 6 ; CHECK-NEXT: ld 6, 544(1) -; CHECK-NEXT: ld 27, 640(1) -; CHECK-NEXT: ld 11, 680(1) -; CHECK-NEXT: ld 26, 632(1) -; CHECK-NEXT: ld 7, 688(1) -; CHECK-NEXT: ld 2, 664(1) -; CHECK-NEXT: ld 30, 656(1) -; CHECK-NEXT: ld 25, 624(1) +; CHECK-NEXT: lxv 1, 0(9) +; CHECK-NEXT: ld 9, 648(1) +; CHECK-NEXT: ld 29, 688(1) +; CHECK-NEXT: ld 28, 680(1) +; CHECK-NEXT: ld 2, 632(1) +; CHECK-NEXT: ld 26, 624(1) +; CHECK-NEXT: lxv 0, 0(10) ; CHECK-NEXT: cmpldi 3, 9 -; CHECK-NEXT: ld 24, 616(1) -; CHECK-NEXT: ld 18, 608(1) +; CHECK-NEXT: lxv 4, 0(8) +; CHECK-NEXT: ld 30, 664(1) +; CHECK-NEXT: ld 10, 704(1) +; CHECK-NEXT: ld 27, 672(1) +; CHECK-NEXT: ld 25, 616(1) +; CHECK-NEXT: ld 23, 608(1) +; CHECK-NEXT: ld 22, 600(1) +; CHECK-NEXT: ld 21, 592(1) +; CHECK-NEXT: ld 19, 584(1) +; CHECK-NEXT: ld 17, 576(1) ; CHECK-NEXT: iselgt 3, 3, 5 -; CHECK-NEXT: ld 5, 552(1) +; CHECK-NEXT: ld 5, 656(1) ; CHECK-NEXT: addi 3, 3, -2 -; CHECK-NEXT: std 6, 112(1) # 8-byte Folded Spill -; CHECK-NEXT: std 12, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: lwa 20, 0(11) +; CHECK-NEXT: lxv 13, 0(12) +; CHECK-NEXT: std 6, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 136(1) # 8-byte Folded Spill ; CHECK-NEXT: lxv 2, 0(6) ; CHECK-NEXT: ld 6, 696(1) -; CHECK-NEXT: ld 17, 600(1) -; CHECK-NEXT: ld 16, 592(1) -; CHECK-NEXT: ld 14, 584(1) -; CHECK-NEXT: std 11, 128(1) # 8-byte Folded Spill -; CHECK-NEXT: std 7, 136(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 10, 0(11) -; CHECK-NEXT: lxv 11, 0(12) -; CHECK-NEXT: lxv 1, 0(9) -; CHECK-NEXT: lxv 0, 0(10) -; CHECK-NEXT: lxv 9, 0(8) -; CHECK-NEXT: lxv 40, 0(14) -; CHECK-NEXT: lxv 39, 0(16) -; CHECK-NEXT: lxv 38, 0(17) -; CHECK-NEXT: lxv 33, 0(18) -; CHECK-NEXT: lxv 32, 0(24) -; CHECK-NEXT: lxv 37, 0(25) -; CHECK-NEXT: lxv 36, 0(26) -; CHECK-NEXT: lxv 35, 0(27) -; CHECK-NEXT: lxv 34, 0(29) +; 
CHECK-NEXT: lxv 34, 0(2) +; CHECK-NEXT: lxv 7, 0(29) +; CHECK-NEXT: lxv 39, 0(17) +; CHECK-NEXT: lxv 38, 0(19) +; CHECK-NEXT: lxv 33, 0(21) +; CHECK-NEXT: lxv 32, 0(22) +; CHECK-NEXT: lxv 37, 0(23) +; CHECK-NEXT: lxv 36, 0(25) +; CHECK-NEXT: lxv 35, 0(26) +; CHECK-NEXT: lxv 11, 0(9) +; CHECK-NEXT: lxv 12, 0(30) ; CHECK-NEXT: rldicl 3, 3, 61, 3 ; CHECK-NEXT: addi 0, 3, 1 ; CHECK-NEXT: ld 3, 560(1) -; CHECK-NEXT: std 5, 64(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 72(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 4, 0(5) -; CHECK-NEXT: ld 5, 704(1) -; CHECK-NEXT: lxv 13, 0(30) -; CHECK-NEXT: lxv 12, 0(2) -; CHECK-NEXT: lxv 8, 0(7) -; CHECK-NEXT: lxv 7, 0(6) -; CHECK-NEXT: lxv 6, 0(5) -; CHECK-NEXT: std 6, 144(1) # 8-byte Folded Spill -; CHECK-NEXT: std 5, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 11, 20, 2 +; CHECK-NEXT: lxv 9, 0(5) +; CHECK-NEXT: lxv 10, 0(27) +; CHECK-NEXT: lxv 8, 0(28) +; CHECK-NEXT: lxv 6, 0(6) +; CHECK-NEXT: lxv 5, 0(10) ; CHECK-NEXT: lxv 3, 0(3) -; CHECK-NEXT: std 3, 80(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 88(1) # 8-byte Folded Spill -; CHECK-NEXT: sldi 3, 19, 4 -; CHECK-NEXT: mulli 5, 19, 40 -; CHECK-NEXT: li 29, 0 -; CHECK-NEXT: add 3, 3, 28 -; CHECK-NEXT: addi 15, 3, 32 -; CHECK-NEXT: ld 3, 568(1) -; CHECK-NEXT: add 11, 28, 5 -; CHECK-NEXT: sldi 5, 19, 5 -; CHECK-NEXT: mulli 7, 19, 48 -; CHECK-NEXT: mulli 27, 19, 6 -; CHECK-NEXT: std 30, 96(1) # 8-byte Folded Spill -; CHECK-NEXT: std 2, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: li 30, 1 -; CHECK-NEXT: std 24, 32(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: add 12, 28, 5 -; CHECK-NEXT: lxv 5, 0(3) -; CHECK-NEXT: std 3, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: std 26, 56(1) # 8-byte Folded Spill -; CHECK-NEXT: sldi 3, 19, 3 -; CHECK-NEXT: mulli 5, 19, 24 -; CHECK-NEXT: add 3, 3, 28 +; CHECK-NEXT: std 3, 96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 12, 104(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 3, 20, 4 +; 
CHECK-NEXT: add 12, 20, 11 +; CHECK-NEXT: std 8, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: std 6, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 8, 552(1) +; CHECK-NEXT: sldi 18, 20, 1 +; CHECK-NEXT: lxv 41, 0(8) +; CHECK-NEXT: add 3, 3, 24 +; CHECK-NEXT: addi 16, 3, 32 +; CHECK-NEXT: sldi 3, 20, 3 +; CHECK-NEXT: std 9, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 5, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 5, 12, 3 +; CHECK-NEXT: std 26, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 2, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: add 2, 24, 5 +; CHECK-NEXT: mr 9, 30 +; CHECK-NEXT: li 26, 1 +; CHECK-NEXT: add 3, 3, 24 ; CHECK-NEXT: addi 31, 3, 32 -; CHECK-NEXT: ld 3, 576(1) -; CHECK-NEXT: add 2, 28, 5 -; CHECK-NEXT: mr 5, 28 -; CHECK-NEXT: lxv 41, 0(3) +; CHECK-NEXT: ld 3, 568(1) +; CHECK-NEXT: std 28, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 5, 20, 5 +; CHECK-NEXT: add 29, 20, 18 +; CHECK-NEXT: std 23, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 27, 20, 48 +; CHECK-NEXT: add 30, 24, 5 +; CHECK-NEXT: li 25, 0 +; CHECK-NEXT: lxv 40, 0(3) +; CHECK-NEXT: mulli 23, 20, 6 +; CHECK-NEXT: sldi 5, 29, 3 +; CHECK-NEXT: add 28, 24, 5 +; CHECK-NEXT: mr 5, 24 +; CHECK-NEXT: std 17, 32(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 56(1) # 8-byte Folded Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_4 Depth 2 -; CHECK-NEXT: maddld 6, 27, 29, 19 +; CHECK-NEXT: maddld 6, 23, 25, 12 +; CHECK-NEXT: maddld 21, 23, 25, 11 ; CHECK-NEXT: mtctr 0 -; CHECK-NEXT: li 24, 0 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 26, 28, 6 -; CHECK-NEXT: mulld 6, 27, 29 +; CHECK-NEXT: add 22, 24, 6 +; CHECK-NEXT: sldi 6, 21, 3 +; CHECK-NEXT: add 21, 24, 6 +; CHECK-NEXT: 
maddld 6, 23, 25, 29 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 19, 24, 6 +; CHECK-NEXT: maddld 6, 23, 25, 18 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 25, 28, 6 -; CHECK-NEXT: mr 6, 23 +; CHECK-NEXT: add 17, 24, 6 +; CHECK-NEXT: maddld 6, 23, 25, 20 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 15, 24, 6 +; CHECK-NEXT: mulld 6, 23, 25 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 14, 24, 6 +; CHECK-NEXT: mr 6, 7 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add 22, 15, 24 ; CHECK-NEXT: lxvp 42, 0(6) -; CHECK-NEXT: lxvp 44, 0(25) -; CHECK-NEXT: lxvp 46, 0(26) -; CHECK-NEXT: add 21, 2, 24 -; CHECK-NEXT: add 20, 11, 24 -; CHECK-NEXT: lxvp 48, -32(22) -; CHECK-NEXT: add 22, 12, 24 -; CHECK-NEXT: lxvpx 50, 2, 24 -; CHECK-NEXT: lxvpx 30, 12, 24 -; CHECK-NEXT: lxvpx 28, 11, 24 +; CHECK-NEXT: lxvp 44, 0(14) +; CHECK-NEXT: lxvp 46, 0(15) +; CHECK-NEXT: lxvp 48, 0(17) +; CHECK-NEXT: lxvp 50, 0(19) +; CHECK-NEXT: lxvp 30, 0(21) +; CHECK-NEXT: lxvp 28, 0(22) ; CHECK-NEXT: lxvp 26, 32(6) -; CHECK-NEXT: lxvp 24, 32(25) -; CHECK-NEXT: lxvp 22, 32(26) -; CHECK-NEXT: lxvpx 20, 15, 24 -; CHECK-NEXT: addi 24, 24, 64 +; CHECK-NEXT: lxvp 24, 32(14) +; CHECK-NEXT: lxvp 22, 32(15) +; CHECK-NEXT: lxvp 20, 32(17) +; CHECK-NEXT: lxvp 18, 32(19) ; CHECK-NEXT: addi 6, 6, 64 -; CHECK-NEXT: addi 25, 25, 64 -; CHECK-NEXT: addi 26, 26, 64 -; CHECK-NEXT: lxvp 18, 32(21) -; CHECK-NEXT: lxvp 16, 32(22) -; CHECK-NEXT: lxvp 14, 32(20) -; CHECK-NEXT: xvmaddadp 9, 45, 43 +; CHECK-NEXT: addi 14, 14, 64 +; CHECK-NEXT: addi 15, 15, 64 +; CHECK-NEXT: addi 17, 17, 64 +; CHECK-NEXT: addi 19, 19, 64 +; CHECK-NEXT: xvmaddadp 4, 45, 43 ; CHECK-NEXT: xvmaddadp 1, 47, 43 -; CHECK-NEXT: xvmaddadp 5, 44, 42 -; CHECK-NEXT: xvmaddadp 41, 46, 42 ; CHECK-NEXT: xvmaddadp 0, 49, 43 ; CHECK-NEXT: xvmaddadp 2, 51, 43 -; CHECK-NEXT: xvmaddadp 4, 31, 43 +; CHECK-NEXT: xvmaddadp 41, 31, 43 ; 
CHECK-NEXT: xvmaddadp 3, 29, 43 -; CHECK-NEXT: xvmaddadp 40, 48, 42 -; CHECK-NEXT: xvmaddadp 39, 50, 42 -; CHECK-NEXT: xvmaddadp 38, 30, 42 -; CHECK-NEXT: xvmaddadp 33, 28, 42 -; CHECK-NEXT: xvmaddadp 32, 25, 27 -; CHECK-NEXT: xvmaddadp 37, 23, 27 -; CHECK-NEXT: xvmaddadp 36, 21, 27 -; CHECK-NEXT: xvmaddadp 35, 19, 27 +; CHECK-NEXT: xvmaddadp 40, 44, 42 +; CHECK-NEXT: xvmaddadp 39, 46, 42 +; CHECK-NEXT: xvmaddadp 38, 48, 42 +; CHECK-NEXT: xvmaddadp 33, 50, 42 +; CHECK-NEXT: xvmaddadp 32, 30, 42 +; CHECK-NEXT: xvmaddadp 37, 28, 42 +; CHECK-NEXT: lxvp 42, 32(21) +; CHECK-NEXT: lxvp 44, 32(22) +; CHECK-NEXT: addi 21, 21, 64 +; CHECK-NEXT: addi 22, 22, 64 +; CHECK-NEXT: xvmaddadp 36, 25, 27 +; CHECK-NEXT: xvmaddadp 35, 23, 27 +; CHECK-NEXT: xvmaddadp 34, 21, 27 +; CHECK-NEXT: xvmaddadp 13, 19, 27 ; CHECK-NEXT: xvmaddadp 12, 24, 26 -; CHECK-NEXT: xvmaddadp 11, 22, 26 -; CHECK-NEXT: xvmaddadp 10, 20, 26 -; CHECK-NEXT: xvmaddadp 8, 18, 26 -; CHECK-NEXT: xvmaddadp 34, 17, 27 -; CHECK-NEXT: xvmaddadp 13, 15, 27 -; CHECK-NEXT: xvmaddadp 7, 16, 26 -; CHECK-NEXT: xvmaddadp 6, 14, 26 +; CHECK-NEXT: xvmaddadp 10, 22, 26 +; CHECK-NEXT: xvmaddadp 8, 20, 26 +; CHECK-NEXT: xvmaddadp 7, 18, 26 +; CHECK-NEXT: xvmaddadp 11, 43, 27 +; CHECK-NEXT: xvmaddadp 9, 45, 27 +; CHECK-NEXT: xvmaddadp 6, 42, 26 +; CHECK-NEXT: xvmaddadp 5, 44, 26 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ ; CHECK-NEXT: # -; CHECK-NEXT: addi 30, 30, 6 -; CHECK-NEXT: add 5, 5, 7 -; CHECK-NEXT: add 31, 31, 7 -; CHECK-NEXT: add 11, 11, 7 -; CHECK-NEXT: add 15, 15, 7 -; CHECK-NEXT: add 12, 12, 7 -; CHECK-NEXT: add 2, 2, 7 -; CHECK-NEXT: addi 29, 29, 1 -; CHECK-NEXT: cmpld 30, 4 +; CHECK-NEXT: addi 26, 26, 6 +; CHECK-NEXT: add 5, 5, 27 +; CHECK-NEXT: add 31, 31, 27 +; CHECK-NEXT: add 2, 2, 27 +; CHECK-NEXT: add 16, 16, 27 +; CHECK-NEXT: add 30, 30, 27 +; CHECK-NEXT: add 28, 28, 27 +; CHECK-NEXT: addi 25, 25, 1 +; CHECK-NEXT: cmpld 26, 4 ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # 
%_loop_1_loopHeader_._return_bb_crit_edge.loopexit -; CHECK-NEXT: ld 4, 112(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 9, 0(8) -; CHECK-NEXT: stxv 1, 0(9) -; CHECK-NEXT: stxv 0, 0(10) -; CHECK-NEXT: stxv 2, 0(4) -; CHECK-NEXT: ld 4, 64(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 4, 168(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 4, 0(4) -; CHECK-NEXT: ld 4, 80(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 4, 176(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 1, 0(4) +; CHECK-NEXT: ld 4, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 0, 0(4) +; CHECK-NEXT: ld 4, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 2, 0(4) +; CHECK-NEXT: ld 4, 96(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 41, 0(8) ; CHECK-NEXT: stxv 3, 0(4) -; CHECK-NEXT: ld 4, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 5, 0(4) -; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(14) -; CHECK-NEXT: stxv 39, 0(16) -; CHECK-NEXT: stxv 38, 0(17) -; CHECK-NEXT: stxv 33, 0(18) -; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: stxv 39, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 35, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 34, 0(3) -; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 13, 0(3) ; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 12, 0(3) -; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 13, 0(3) +; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload ; 
CHECK-NEXT: stxv 11, 0(3) -; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 9, 0(3) ; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: stxv 12, 0(9) +; CHECK-NEXT: stxv 10, 0(3) ; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: stxv 8, 0(3) ; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: stxv 5, 0(10) ; CHECK-NEXT: .LBB0_7: # %_return_bb ; CHECK-NEXT: lfd 31, 440(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 30, 432(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 31, 296(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 30, 288(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 29, 280(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 28, 272(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 27, 264(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 26, 256(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 25, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 31, 328(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 30, 320(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 29, 312(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 28, 304(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 27, 296(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 26, 288(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 25, 280(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 29, 424(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 24, 240(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 23, 232(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 22, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 24, 272(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 23, 264(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, 256(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 28, 416(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 21, 216(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 20, 208(1) # 8-byte Folded 
Reload -; CHECK-NEXT: ld 19, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, 240(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, 232(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 27, 408(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 18, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 17, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 16, 176(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 18, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 17, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 16, 208(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 26, 400(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 15, 168(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 14, 160(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 14, 192(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 25, 392(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 24, 384(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 23, 376(1) # 8-byte Folded Reload @@ -331,10 +347,6 @@ ; CHECK-NEXT: lfd 20, 352(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 19, 344(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 18, 336(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 17, 328(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 16, 320(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 15, 312(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 14, 304(1) # 8-byte Folded Reload ; CHECK-NEXT: addi 1, 1, 448 ; CHECK-NEXT: blr entry: