diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1064,7 +1064,7 @@
   }
 
 protected:
-  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
+  virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
 
   void pickNodeFromQueue(SchedCandidate &Cand);
 };
diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
--- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
@@ -42,6 +42,9 @@
   SUnit *pickNode(bool &IsTopNode) override;
   void enterMBB(MachineBasicBlock *MBB) override;
   void leaveMBB() override;
+
+  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+  bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
--- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -15,6 +15,16 @@
 DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
                          cl::desc("Disable scheduling addi instruction before"
                                   "load for ppc"), cl::Hidden);
+static cl::opt<bool>
+    EnableAddiHeuristic("ppc-postra-bias-addi",
+                        cl::desc("Enable scheduling addi instruction as early "
+                                 "as possible post ra"),
+                        cl::Hidden, cl::init(true));
+
+static bool isADDIInstr(const GenericScheduler::SchedCandidate &Cand) {
+  return Cand.SU->getInstr()->getOpcode() == PPC::ADDI ||
+         Cand.SU->getInstr()->getOpcode() == PPC::ADDI8;
+}
 
 bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
@@ -22,19 +32,13 @@
   if (DisableAddiLoadHeuristic)
     return false;
 
-  auto isADDIInstr = [&] (const MachineInstr &Inst) {
-    return Inst.getOpcode() == PPC::ADDI || Inst.getOpcode() == PPC::ADDI8;
-  };
-
   SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand;
   SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand;
-  if (isADDIInstr(*FirstCand.SU->getInstr()) &&
-      SecondCand.SU->getInstr()->mayLoad()) {
+  if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) {
     TryCand.Reason = Stall;
     return true;
   }
-  if (FirstCand.SU->getInstr()->mayLoad() &&
-      isADDIInstr(*SecondCand.SU->getInstr())) {
+  if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) {
     TryCand.Reason = NoCand;
     return true;
   }
@@ -61,6 +65,38 @@
   return;
 }
 
+bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
+                                               SchedCandidate &TryCand) const {
+  if (!EnableAddiHeuristic)
+    return false;
+
+  if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) {
+    TryCand.Reason = Stall;
+    return true;
+  }
+  return false;
+}
+
+void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                          SchedCandidate &TryCand) {
+  PostGenericScheduler::tryCandidate(Cand, TryCand);
+
+  if (!Cand.isValid())
+    return;
+
+  // Apply the PowerPC post-RA specific heuristic only when TryCand isn't
+  // selected, or is selected merely by node order.
+  if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
+    return;
+
+  // Scheduling the ADDI as early as possible post-RA is beneficial: it avoids
+  // the ADDI being stalled behind vector instructions that occupy all the
+  // hardware units. ADDI is also commonly used to post-increment the loop
+  // induction variable, which matters for performance.
+ if (biasAddiCandidate(Cand, TryCand)) + return; +} + void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { // Custom PPC PostRA specific behavior here. PostGenericScheduler::enterMBB(MBB); diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -500,12 +500,12 @@ ; CHECK-NEXT: ld r0, -8(r6) ; CHECK-NEXT: add r29, r0, r29 ; CHECK-NEXT: .LBB6_3: # +; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r29, r28 ; CHECK-NEXT: mulld r0, r0, r30 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 ; CHECK-NEXT: maddld r3, r0, r7, r3 -; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: bdz .LBB6_9 ; CHECK-NEXT: .LBB6_4: # ; CHECK-NEXT: lbzu r0, 1(r5) diff --git a/llvm/test/CodeGen/PowerPC/pr42492.ll b/llvm/test/CodeGen/PowerPC/pr42492.ll --- a/llvm/test/CodeGen/PowerPC/pr42492.ll +++ b/llvm/test/CodeGen/PowerPC/pr42492.ll @@ -13,12 +13,12 @@ ; CHECK-NEXT: add 3, 3, 4 ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: .p2align 5 -; CHECK-NEXT: .LBB0_2: # +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: addi 7, 4, 1 ; CHECK-NEXT: sldi 6, 6, 4 ; CHECK-NEXT: cmplwi 4, 14 -; CHECK-NEXT: addi 7, 4, 1 ; CHECK-NEXT: bc 12, 1, .LBB0_4 -; CHECK-NEXT: # %bb.3: # +; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: cmpd 3, 4 ; CHECK-NEXT: mr 4, 7 ; CHECK-NEXT: bc 4, 2, .LBB0_2 diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck -check-prefix=CHECK-P9 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-postra-bias-addi=false < %s |\ +; RUN: FileCheck -check-prefix=CHECK-P9-NO-HEURISTIC %s + +%_type_of_scalars = type <{ [16 x i8], double, [152 x i8] }> +%_elem_type_of_x = type <{ double }> +%_elem_type_of_a = type <{ double }> + +@scalars = common local_unnamed_addr global %_type_of_scalars zeroinitializer, align 16 + +define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %.a, i64* noalias %.n) { +; CHECK-P9-LABEL: test: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: ld 5, 0(5) +; CHECK-P9-NEXT: addis 6, 2, scalars@toc@ha +; CHECK-P9-NEXT: addi 6, 6, scalars@toc@l +; CHECK-P9-NEXT: addi 6, 6, 16 +; CHECK-P9-NEXT: rldicr 5, 5, 0, 58 +; CHECK-P9-NEXT: addi 5, 5, -32 +; CHECK-P9-NEXT: rldicl 5, 5, 59, 5 +; CHECK-P9-NEXT: addi 5, 5, 1 +; CHECK-P9-NEXT: lxvdsx 0, 0, 6 +; CHECK-P9-NEXT: mtctr 5 +; CHECK-P9-NEXT: .p2align 4 +; CHECK-P9-NEXT: .LBB0_1: # %vector.body +; CHECK-P9-NEXT: # +; CHECK-P9-NEXT: lxv 1, 16(4) +; CHECK-P9-NEXT: lxv 2, 0(4) +; CHECK-P9-NEXT: lxv 3, 48(4) +; CHECK-P9-NEXT: lxv 4, 32(4) +; CHECK-P9-NEXT: xvmuldp 2, 2, 0 +; CHECK-P9-NEXT: lxv 5, 240(4) +; CHECK-P9-NEXT: lxv 6, 224(4) +; CHECK-P9-NEXT: xvmuldp 1, 1, 0 +; CHECK-P9-NEXT: xvmuldp 4, 4, 0 +; CHECK-P9-NEXT: xvmuldp 3, 3, 0 +; CHECK-P9-NEXT: xvmuldp 5, 5, 0 +; CHECK-P9-NEXT: stxv 1, 16(3) +; CHECK-P9-NEXT: stxv 3, 48(3) +; CHECK-P9-NEXT: stxv 4, 32(3) +; CHECK-P9-NEXT: stxv 5, 240(3) +; CHECK-P9-NEXT: addi 4, 4, 256 +; CHECK-P9-NEXT: xvmuldp 6, 6, 0 +; CHECK-P9-NEXT: stxv 2, 0(3) +; CHECK-P9-NEXT: stxv 6, 224(3) +; CHECK-P9-NEXT: addi 3, 3, 256 +; CHECK-P9-NEXT: bdnz .LBB0_1 +; CHECK-P9-NEXT: # %bb.2: 
# %return.block +; CHECK-P9-NEXT: blr +; +; CHECK-P9-NO-HEURISTIC-LABEL: test: +; CHECK-P9-NO-HEURISTIC: # %bb.0: # %entry +; CHECK-P9-NO-HEURISTIC-NEXT: ld 5, 0(5) +; CHECK-P9-NO-HEURISTIC-NEXT: addis 6, 2, scalars@toc@ha +; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, scalars@toc@l +; CHECK-P9-NO-HEURISTIC-NEXT: rldicr 5, 5, 0, 58 +; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, 16 +; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, -32 +; CHECK-P9-NO-HEURISTIC-NEXT: rldicl 5, 5, 59, 5 +; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, 1 +; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6 +; CHECK-P9-NO-HEURISTIC-NEXT: mtctr 5 +; CHECK-P9-NO-HEURISTIC-NEXT: .p2align 4 +; CHECK-P9-NO-HEURISTIC-NEXT: .LBB0_1: # %vector.body +; CHECK-P9-NO-HEURISTIC-NEXT: # +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 1, 16(4) +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 2, 0(4) +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 3, 48(4) +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 4, 32(4) +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 2, 2, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 5, 240(4) +; CHECK-P9-NO-HEURISTIC-NEXT: lxv 6, 224(4) +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 1, 1, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 4, 4, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 3, 3, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 6, 6, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 5, 5, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 1, 16(3) +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 2, 0(3) +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 3, 48(3) +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 4, 32(3) +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 5, 240(3) +; CHECK-P9-NO-HEURISTIC-NEXT: stxv 6, 224(3) +; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256 +; CHECK-P9-NO-HEURISTIC-NEXT: addi 3, 3, 256 +; CHECK-P9-NO-HEURISTIC-NEXT: bdnz .LBB0_1 +; CHECK-P9-NO-HEURISTIC-NEXT: # %bb.2: # %return.block +; CHECK-P9-NO-HEURISTIC-NEXT: blr +entry: + %x_rvo_based_addr_3 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %a_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_a], [0 x %_elem_type_of_a]* %.a, i64 0, i64 -1 + %_val_n_ = load i64, i64* %.n, align 8 + %_val_c1_ = load double, double* getelementptr inbounds (%_type_of_scalars, %_type_of_scalars* @scalars, i64 0, i32 1), align 16 + %n.vec = and i64 %_val_n_, -32 + %broadcast.splatinsert26 = insertelement <4 x double> undef, double %_val_c1_, i32 0 + %broadcast.splat27 = shufflevector <4 x double> %broadcast.splatinsert26, <4 x double> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %offset.idx = or i64 %index, 1 + %0 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_3, i64 %offset.idx, i32 0 + %1 = getelementptr %_elem_type_of_a, %_elem_type_of_a* %a_rvo_based_addr_5, i64 %offset.idx, i32 0 + %2 = bitcast double* %1 to <4 x double>* + %wide.load = load <4 x double>, <4 x double>* %2, align 8 + %3 = getelementptr double, double* %1, i64 4 + %4 = bitcast double* %3 to <4 x double>* + %wide.load19 = load <4 x double>, <4 x double>* %4, align 8 + %5 = getelementptr double, double* %1, i64 8 + %6 = bitcast double* %5 to <4 x double>* + %wide.load20 = load <4 x double>, <4 x double>* %6, align 8 + %7 = getelementptr double, double* %1, i64 12 + %8 = bitcast double* %7 to <4 x double>* + %wide.load21 = load <4 x double>, <4 x double>* %8, align 8 + %9 = getelementptr double, double* %1, i64 16 + %10 = bitcast double* %9 to <4 x double>* + %wide.load22 = load <4 x double>, <4 x double>* %10, align 8 + %11 = getelementptr double, double* %1, i64 20 + %12 = bitcast double* %11 
to <4 x double>* + %wide.load23 = load <4 x double>, <4 x double>* %12, align 8 + %13 = getelementptr double, double* %1, i64 24 + %14 = bitcast double* %13 to <4 x double>* + %wide.load24 = load <4 x double>, <4 x double>* %14, align 8 + %15 = getelementptr double, double* %1, i64 28 + %16 = bitcast double* %15 to <4 x double>* + %wide.load25 = load <4 x double>, <4 x double>* %16, align 8 + %17 = fmul fast <4 x double> %wide.load, %broadcast.splat27 + %18 = fmul fast <4 x double> %wide.load19, %broadcast.splat27 + %19 = fmul fast <4 x double> %wide.load20, %broadcast.splat27 + %20 = fmul fast <4 x double> %wide.load21, %broadcast.splat27 + %21 = fmul fast <4 x double> %wide.load22, %broadcast.splat27 + %22 = fmul fast <4 x double> %wide.load23, %broadcast.splat27 + %23 = fmul fast <4 x double> %wide.load24, %broadcast.splat27 + %24 = fmul fast <4 x double> %wide.load25, %broadcast.splat27 + %25 = bitcast double* %0 to <4 x double>* + store <4 x double> %17, <4 x double>* %25, align 8 + %26 = getelementptr double, double* %0, i64 4 + %27 = bitcast double* %26 to <4 x double>* + store <4 x double> %18, <4 x double>* %27, align 8 + %28 = getelementptr double, double* %0, i64 8 + %29 = bitcast double* %28 to <4 x double>* + %30 = getelementptr double, double* %0, i64 12 + %31 = bitcast double* %30 to <4 x double>* + %32 = getelementptr double, double* %0, i64 16 + %33 = bitcast double* %32 to <4 x double>* + %34 = getelementptr double, double* %0, i64 20 + %35 = bitcast double* %34 to <4 x double>* + %36 = getelementptr double, double* %0, i64 24 + %37 = bitcast double* %36 to <4 x double>* + %38 = getelementptr double, double* %0, i64 28 + %39 = bitcast double* %38 to <4 x double>* + store <4 x double> %24, <4 x double>* %39, align 8 + %index.next = add i64 %index, 32 + %cm = icmp eq i64 %index.next, %n.vec + br i1 %cm, label %return.block, label %vector.body + +return.block: + ret void +} + diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -22,35 +22,35 @@ ; CHECK-NEXT: isel 3, 3, 4, 0 ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: li 7, -1 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_6 ; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 7, 32 ; CHECK-NEXT: lbz 5, 0(5) -; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: cntlzw 6, 6 ; CHECK-NEXT: srwi 7, 6, 5 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 8, 32 ; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: lbz 5, 0(5) -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: bdz .LBB0_4 -; CHECK-NEXT: .LBB0_3: # +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: clrldi 10, 8, 32 +; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: cntlzw 9, 6 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: lbz 5, 0(10) -; CHECK-NEXT: addi 8, 8, -1 -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: add 4, 4, 7 ; CHECK-NEXT: srwi 7, 9, 5 ; CHECK-NEXT: bdnz .LBB0_3 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll @@ -14,20 +14,20 @@ ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: bl calloc ; 
CHECK-NEXT: nop -; CHECK-NEXT: clrldi 4, 30, 32 ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: addi 3, 3, -4 +; CHECK-NEXT: li 6, 1 +; CHECK-NEXT: clrldi 4, 30, 32 ; CHECK-NEXT: mtctr 4 ; CHECK-NEXT: mullw 4, 5, 5 -; CHECK-NEXT: li 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: stwu 4, 4(3) ; CHECK-NEXT: mullw 4, 6, 6 -; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: # +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: stwu 4, 4(3) ; CHECK-NEXT: mullw 4, 5, 5 ; CHECK-NEXT: addi 5, 5, 1 diff --git a/llvm/test/CodeGen/PowerPC/sms-simple.ll b/llvm/test/CodeGen/PowerPC/sms-simple.ll --- a/llvm/test/CodeGen/PowerPC/sms-simple.ll +++ b/llvm/test/CodeGen/PowerPC/sms-simple.ll @@ -11,12 +11,12 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, x@toc@ha ; CHECK-NEXT: addi r5, r5, x@toc@l +; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: addis r6, r2, y@toc@ha ; CHECK-NEXT: li r7, 340 ; CHECK-NEXT: addi r3, r6, y@toc@l ; CHECK-NEXT: lwz r6, y@toc@l(r6) ; CHECK-NEXT: mtctr r7 -; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: lwzu r7, 12(r5) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 4(r5) diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -46,11 +46,11 @@ ; CHECK-P9-NEXT: mtfprd f1, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 ; CHECK-P9-NEXT: mtfprd f0, r3 +; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: xxswapd v2, vs1 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8 -; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: stxsihx v2, 0, r3 ; CHECK-P9-NEXT: lhz r3, -2(r1) ; CHECK-P9-NEXT: blr @@ -764,11 +764,11 @@ ; CHECK-P9-NEXT: mtfprd f1, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 ; CHECK-P9-NEXT: mtfprd f0, r3 +; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: xxswapd v2, vs1 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8 -; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: stxsihx v2, 0, r3 ; CHECK-P9-NEXT: lhz r3, -2(r1) ; CHECK-P9-NEXT: blr
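Note (not part of the patch): since this change makes PostGenericScheduler::tryCandidate virtual, any target's post-RA strategy can layer a heuristic on top of the generic tie-breakers the same way the PPC override above does. The following is a minimal sketch of that pattern; the class name DemoPostRAStrategy and the predicate isCheapUpdateInstr are hypothetical, while the guard on TryCand.Reason mirrors PPCPostRASchedStrategy::tryCandidate from this diff.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

namespace {
class DemoPostRAStrategy : public PostGenericScheduler {
public:
  using PostGenericScheduler::PostGenericScheduler;

  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override {
    // Let the generic post-RA heuristics decide first.
    PostGenericScheduler::tryCandidate(Cand, TryCand);

    // The first valid candidate is taken unconditionally; nothing to bias.
    if (!Cand.isValid())
      return;

    // Only override weak generic outcomes, exactly as the PPC override does:
    // NoCand (TryCand rejected) or NodeOrder (picked only by source order).
    if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
      return;

    // Prefer TryCand by giving it a reason stronger than NodeOrder; the PPC
    // override uses Stall for the same purpose.
    if (isCheapUpdateInstr(*TryCand.SU->getInstr()) &&
        !isCheapUpdateInstr(*Cand.SU->getInstr()))
      TryCand.Reason = Stall;
  }

private:
  // Hypothetical predicate standing in for a target-specific opcode check;
  // the PPC patch tests for PPC::ADDI/PPC::ADDI8 in isADDIInstr instead.
  static bool isCheapUpdateInstr(const MachineInstr &MI) {
    return MI.getNumOperands() == 3 && MI.getOperand(2).isImm();
  }
};
} // end anonymous namespace

The new heuristic itself is gated by the flag added in this patch, so the unbiased schedule stays reproducible, e.g. llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-postra-bias-addi=false, which is how the CHECK-P9-NO-HEURISTIC run lines in sched-addi.ll are generated.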