diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -871,6 +871,10 @@ /// (3) Bottom-up allocation is no longer guaranteed to optimally color. virtual bool reverseLocalAssignment() const { return false; } + /// Add the allocation priority to global and split ranges as well as the + /// local ranges when registers are added to the queue. + virtual bool addAllocPriorityToGlobalRanges() const { return false; } + /// Allow the target to override the cost of using a callee-saved register for /// the first time. Default value of 0 means we will use a callee-saved /// register if it is available. diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -760,6 +760,7 @@ // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. bool ReverseLocal = TRI->reverseLocalAssignment(); + bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges(); const TargetRegisterClass &RC = *MRI->getRegClass(Reg); bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs()); @@ -783,6 +784,9 @@ // don't fit should be spilled (or split) ASAP so they don't create // interference. Mark a bit to prioritize global above local ranges. Prio = (1u << 29) + Size; + + if (AddPriorityToGlobal) + Prio |= RC.AllocationPriority << 24; } // Mark a higher bit to prioritize global and local above RS_Split. 
Prio |= (1u << 31); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -94,6 +94,16 @@ bool isCallerPreservedPhysReg(MCRegister PhysReg, const MachineFunction &MF) const override; + // Provide hints to the register allocator for allocating subregisters + // of primed and unprimed accumulators. For example, if accumulator + // ACC5 is assigned, we also want to assign UACC5 to the input. + // Similarly if UACC5 is assigned, we want to assign VSRp10, VSRp11 + // to its inputs. + bool getRegAllocationHints(Register VirtReg, ArrayRef Order, + SmallVectorImpl &Hints, + const MachineFunction &MF, const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; + /// We require the register scavenger. bool requiresRegisterScavenging(const MachineFunction &MF) const override { return true; @@ -137,6 +147,8 @@ unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + bool addAllocPriorityToGlobalRanges() const override { return true; } + // Support for virtual base registers. bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -472,6 +472,62 @@ return false; } +bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, + ArrayRef Order, + SmallVectorImpl &Hints, + const MachineFunction &MF, + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + + // Call the base implementation first to set any hints based on the usual + // heuristics and decide what the return value should be. We want to return + // the same value returned by the base implementation. 
If the base + // implementation decides to return true and force the allocation then we + // will leave it as such. On the other hand if the base implementation + // decides to return false the following code will not force the allocation + // as we are just looking to provide a hint. + bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM, Matrix); + // We are interested in instructions that copy values to ACC/UACC. + // The copy into UACC will be simply a COPY to a subreg so we + // want to allocate the corresponding physical subreg for the source. + // The copy into ACC will be a BUILD_UACC so we want to allocate + // the same number UACC for the source. + for (MachineInstr &Use : MRI->reg_nodbg_instructions(VirtReg)) { + const MachineOperand *ResultOp = nullptr; + Register ResultReg; + switch (Use.getOpcode()) { + case TargetOpcode::COPY: { + ResultOp = &Use.getOperand(0); + ResultReg = ResultOp->getReg(); + if (Register::isVirtualRegister(ResultReg) && + MRI->getRegClass(ResultReg)->contains(PPC::UACC0) && + VRM->hasPhys(ResultReg)) { + Register UACCPhys = VRM->getPhys(ResultReg); + Register HintReg = getSubReg(UACCPhys, ResultOp->getSubReg()); + Hints.push_back(HintReg); + } + break; + } + case PPC::BUILD_UACC: { + ResultOp = &Use.getOperand(0); + ResultReg = ResultOp->getReg(); + if (MRI->getRegClass(ResultReg)->contains(PPC::ACC0) && + VRM->hasPhys(ResultReg)) { + Register ACCPhys = VRM->getPhys(ResultReg); + assert((ACCPhys >= PPC::ACC0 && ACCPhys <= PPC::ACC7) && + "Expecting an ACC register for BUILD_UACC."); + Register HintReg = PPC::UACC0 + (ACCPhys - PPC::ACC0); + Hints.push_back(HintReg); + } + break; + } + } + } + return BaseImplRetVal; +} + unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td --- 
a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -460,6 +460,13 @@ } def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3, ACC4, ACC5, ACC6, ACC7)> { + // The AllocationPriority is in the range [0, 63]. Assign the ACC registers + // the highest possible priority in this range to force the register allocator + // to assign these registers first. This is done because the ACC registers + // must represent 4 adjacent vector registers. For example ACC1 must be + // VS4 - VS7. The value here must be at least 32 as we want to allocate + // these registers even before we allocate global ranges. + let AllocationPriority = 63; let Size = 512; } @@ -476,6 +483,11 @@ def UACCRC : RegisterClass<"PPC", [v512i1], 128, (add UACC0, UACC1, UACC2, UACC3, UACC4, UACC5, UACC6, UACC7)> { + // The AllocationPriority for the UACC registers is still high and must be at + // least 32 as we want to allocate these registers before we allocate other + // global ranges. The value must be less than the AllocationPriority of the + // ACC registers. + let AllocationPriority = 36; let Size = 512; } @@ -493,6 +505,12 @@ VSRp29, VSRp28, VSRp27, VSRp26, (sequence "VSRp%u", 0, 6), (sequence "VSRp%u", 15, 7))> { + // Give the VSRp registers a non-zero AllocationPriority. The value is less + // than 32 as these registers should not always be allocated before global + // ranges and the value should be less than the AllocationPriority - 32 for + // the UACC registers. Even global VSRp registers should be allocated after + // the UACC registers have been chosen. 
+ let AllocationPriority = 2; let Size = 256; } diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -mcpu=pwr10 -ppc-track-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -mcpu=pwr10 -ppc-track-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -11,20 +11,19 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { ; CHECK-LABEL: intrinsics1: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 ; CHECK-NEXT: vmr v1, v4 ; CHECK-NEXT: vmr v4, v3 ; CHECK-NEXT: ld r3, 96(r1) ; CHECK-NEXT: vmr v0, v2 +; CHECK-NEXT: xxlor vs3, v5, v5 ; CHECK-NEXT: xxlor vs0, v0, v0 ; CHECK-NEXT: xxlor vs1, v1, v1 ; CHECK-NEXT: xxlor vs2, v4, v4 -; CHECK-NEXT: xxlor vs3, v5, v5 ; CHECK-NEXT: xxmtacc acc0 ; CHECK-NEXT: xvi4ger8pp acc0, v2, v3 ; CHECK-NEXT: xvf16ger2pp acc0, v2, v1 ; CHECK-NEXT: pmxvf32gerpn acc0, v3, v5, 0, 0 -; CHECK-NEXT: vmr v3, v0 +; CHECK-NEXT: vmr v3, v2 ; CHECK-NEXT: vmr v2, v5 ; CHECK-NEXT: pmxvf64gernp acc0, vsp34, v0, 0, 0 ; CHECK-NEXT: xxmfacc acc0 @@ -36,20 +35,19 @@ ; ; CHECK-BE-LABEL: intrinsics1: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 ; CHECK-BE-NEXT: vmr v1, v4 ; CHECK-BE-NEXT: vmr v4, v3 ; CHECK-BE-NEXT: ld r3, 112(r1) ; 
CHECK-BE-NEXT: vmr v0, v2 +; CHECK-BE-NEXT: xxlor vs3, v5, v5 ; CHECK-BE-NEXT: xxlor vs0, v0, v0 ; CHECK-BE-NEXT: xxlor vs1, v1, v1 ; CHECK-BE-NEXT: xxlor vs2, v4, v4 -; CHECK-BE-NEXT: xxlor vs3, v5, v5 ; CHECK-BE-NEXT: xxmtacc acc0 ; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v3 ; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v1 ; CHECK-BE-NEXT: pmxvf32gerpn acc0, v3, v5, 0, 0 -; CHECK-BE-NEXT: vmr v3, v0 +; CHECK-BE-NEXT: vmr v3, v2 ; CHECK-BE-NEXT: vmr v2, v5 ; CHECK-BE-NEXT: pmxvf64gernp acc0, vsp34, v0, 0, 0 ; CHECK-BE-NEXT: xxmfacc acc0 @@ -78,10 +76,10 @@ ; CHECK-NEXT: lxv v4, 0(r5) ; CHECK-NEXT: lxv v5, 0(r6) ; CHECK-NEXT: xxlor vs0, v2, v2 -; CHECK-NEXT: vmr v1, v2 ; CHECK-NEXT: xxlor vs1, v3, v3 ; CHECK-NEXT: xxlor vs2, v4, v4 ; CHECK-NEXT: xxlor vs3, v5, v5 +; CHECK-NEXT: vmr v1, v2 ; CHECK-NEXT: vmr v0, v5 ; CHECK-NEXT: xxmtacc acc0 ; CHECK-NEXT: xvi8ger4pp acc0, v2, v3 @@ -102,10 +100,10 @@ ; CHECK-BE-NEXT: lxv v4, 0(r5) ; CHECK-BE-NEXT: lxv v5, 0(r6) ; CHECK-BE-NEXT: xxlor vs0, v2, v2 -; CHECK-BE-NEXT: vmr v1, v2 ; CHECK-BE-NEXT: xxlor vs1, v3, v3 ; CHECK-BE-NEXT: xxlor vs2, v4, v4 ; CHECK-BE-NEXT: xxlor vs3, v5, v5 +; CHECK-BE-NEXT: vmr v1, v2 ; CHECK-BE-NEXT: vmr v0, v5 ; CHECK-BE-NEXT: xxmtacc acc0 ; CHECK-BE-NEXT: xvi8ger4pp acc0, v2, v3 diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -123,7 +123,7 @@ ; CHECK-NEXT: sldi 31, 3, 1 ; CHECK-NEXT: std 8, 32(1) # 8-byte Folded Spill ; CHECK-NEXT: std 9, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 41, 0(8) +; CHECK-NEXT: lxv 43, 0(8) ; CHECK-NEXT: mr 8, 6 ; CHECK-NEXT: sldi 6, 3, 3 ; CHECK-NEXT: std 2, 144(1) # 8-byte Folded Spill @@ -134,21 +134,21 @@ ; CHECK-NEXT: add 6, 6, 23 ; CHECK-NEXT: lxv 7, 0(28) ; CHECK-NEXT: add 28, 3, 31 -; CHECK-NEXT: lxv 40, 0(9) -; CHECK-NEXT: lxv 39, 0(10) -; CHECK-NEXT: lxv 38, 0(15) -; CHECK-NEXT: lxv 
33, 0(14) -; CHECK-NEXT: lxv 32, 0(16) -; CHECK-NEXT: lxv 37, 0(17) -; CHECK-NEXT: lxv 35, 0(18) +; CHECK-NEXT: lxv 42, 0(9) +; CHECK-NEXT: lxv 41, 0(10) +; CHECK-NEXT: lxv 40, 0(15) +; CHECK-NEXT: lxv 39, 0(14) +; CHECK-NEXT: lxv 38, 0(16) +; CHECK-NEXT: lxv 33, 0(17) +; CHECK-NEXT: lxv 37, 0(18) ; CHECK-NEXT: lxv 13, 0(19) ; CHECK-NEXT: lxv 10, 0(20) ; CHECK-NEXT: lxv 8, 0(21) ; CHECK-NEXT: lxv 6, 0(22) ; CHECK-NEXT: lxv 4, 0(30) ; CHECK-NEXT: lxv 1, 0(12) -; CHECK-NEXT: lxv 36, 0(24) -; CHECK-NEXT: lxv 34, 0(25) +; CHECK-NEXT: lxv 32, 0(24) +; CHECK-NEXT: lxv 36, 0(25) ; CHECK-NEXT: lxv 12, 0(26) ; CHECK-NEXT: lxv 9, 0(27) ; CHECK-NEXT: lxv 5, 0(29) @@ -216,7 +216,7 @@ ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lxvp 42, 0(6) +; CHECK-NEXT: lxvp 34, 0(6) ; CHECK-NEXT: lxvp 44, 0(16) ; CHECK-NEXT: lxvp 46, 0(17) ; CHECK-NEXT: lxvp 48, 0(18) @@ -233,19 +233,19 @@ ; CHECK-NEXT: addi 17, 17, 64 ; CHECK-NEXT: addi 18, 18, 64 ; CHECK-NEXT: addi 19, 19, 64 -; CHECK-NEXT: xvmaddadp 41, 45, 43 -; CHECK-NEXT: xvmaddadp 40, 47, 43 -; CHECK-NEXT: xvmaddadp 39, 49, 43 -; CHECK-NEXT: xvmaddadp 38, 51, 43 -; CHECK-NEXT: xvmaddadp 33, 63, 43 -; CHECK-NEXT: xvmaddadp 32, 61, 43 -; CHECK-NEXT: xvmaddadp 37, 44, 42 -; CHECK-NEXT: xvmaddadp 35, 46, 42 -; CHECK-NEXT: xvmaddadp 13, 48, 42 -; CHECK-NEXT: xvmaddadp 11, 50, 42 -; CHECK-NEXT: xvmaddadp 10, 62, 42 -; CHECK-NEXT: xvmaddadp 8, 60, 42 -; CHECK-NEXT: lxvp 42, 32(20) +; CHECK-NEXT: xvmaddadp 43, 45, 35 +; CHECK-NEXT: xvmaddadp 42, 47, 35 +; CHECK-NEXT: xvmaddadp 41, 49, 35 +; CHECK-NEXT: xvmaddadp 40, 51, 35 +; CHECK-NEXT: xvmaddadp 39, 63, 35 +; CHECK-NEXT: xvmaddadp 38, 61, 35 +; CHECK-NEXT: xvmaddadp 33, 44, 34 +; CHECK-NEXT: xvmaddadp 37, 46, 34 +; CHECK-NEXT: xvmaddadp 13, 48, 34 +; CHECK-NEXT: xvmaddadp 11, 50, 34 +; CHECK-NEXT: xvmaddadp 10, 62, 34 +; CHECK-NEXT: xvmaddadp 8, 60, 34 +; CHECK-NEXT: lxvp 34, 32(20) ; 
CHECK-NEXT: lxvp 44, 32(21) ; CHECK-NEXT: addi 20, 20, 64 ; CHECK-NEXT: addi 21, 21, 64 @@ -253,13 +253,13 @@ ; CHECK-NEXT: xvmaddadp 4, 55, 59 ; CHECK-NEXT: xvmaddadp 3, 53, 59 ; CHECK-NEXT: xvmaddadp 2, 31, 59 -; CHECK-NEXT: xvmaddadp 36, 56, 58 -; CHECK-NEXT: xvmaddadp 34, 54, 58 +; CHECK-NEXT: xvmaddadp 32, 56, 58 +; CHECK-NEXT: xvmaddadp 36, 54, 58 ; CHECK-NEXT: xvmaddadp 12, 52, 58 ; CHECK-NEXT: xvmaddadp 9, 30, 58 -; CHECK-NEXT: xvmaddadp 1, 43, 59 +; CHECK-NEXT: xvmaddadp 1, 35, 59 ; CHECK-NEXT: xvmaddadp 0, 45, 59 -; CHECK-NEXT: xvmaddadp 7, 42, 58 +; CHECK-NEXT: xvmaddadp 7, 34, 58 ; CHECK-NEXT: xvmaddadp 5, 44, 58 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ @@ -276,21 +276,21 @@ ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit ; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: stxv 43, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: stxv 42, 0(3) ; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: stxv 39, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: stxv 38, 0(3) ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 13, 0(3) ; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload @@ -312,9 +312,9 @@ ; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 36, 0(3) +; 
CHECK-NEXT: stxv 32, 0(3) ; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 34, 0(3) +; CHECK-NEXT: stxv 36, 0(3) ; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 12, 0(3) ; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll b/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll --- a/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll @@ -13,213 +13,194 @@ ; CHECK-LABEL: acc_regalloc: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: lwz r3, 0(r3) -; CHECK-NEXT: lxv vs0, 0(0) -; CHECK-NEXT: xxlxor vs2, vs2, vs2 -; CHECK-NEXT: xxlxor vs3, vs3, vs3 +; CHECK-NEXT: lxv v4, 0(0) +; CHECK-NEXT: xxlxor v0, v0, v0 +; CHECK-NEXT: xxlxor v1, v1, v1 ; CHECK-NEXT: stfd f14, -144(r1) # 8-byte Folded Spill ; CHECK-NEXT: stfd f15, -136(r1) # 8-byte Folded Spill ; CHECK-NEXT: xxlxor v2, v2, v2 ; CHECK-NEXT: li r6, 1 ; CHECK-NEXT: li r4, 16 -; CHECK-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill -; CHECK-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill ; CHECK-NEXT: extswsli r3, r3, 3 -; CHECK-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill -; CHECK-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill -; CHECK-NEXT: xvmaddadp vs3, vs0, vs3 -; CHECK-NEXT: lxvdsx vs1, 0, r3 -; CHECK-NEXT: xvmaddadp vs2, vs1, vs2 +; CHECK-NEXT: xvmaddadp v1, v4, v1 +; CHECK-NEXT: lxvdsx v5, 0, r3 +; CHECK-NEXT: xvmaddadp v0, v5, v0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %bb9 ; CHECK-NEXT: # ; CHECK-NEXT: addi r6, r6, 2 -; CHECK-NEXT: lxv vs5, -64(r5) -; CHECK-NEXT: lxv vs6, -16(r5) -; CHECK-NEXT: lxv vs4, 16(0) -; CHECK-NEXT: xxlor v7, vs2, vs2 -; CHECK-NEXT: xxlxor v8, v8, v8 -; CHECK-NEXT: xxlxor v1, v1, v1 +; CHECK-NEXT: lxv vs1, -64(r5) +; CHECK-NEXT: lxv vs2, -16(r5) +; CHECK-NEXT: lxv vs0, 16(0) +; CHECK-NEXT: vmr v9, v0 +; CHECK-NEXT: xxlxor v10, v10, v10 +; CHECK-NEXT: xxlxor v7, v7, v7 ; CHECK-NEXT: mulld r6, r6, r3 -; CHECK-NEXT: xvmaddadp v7, vs5, v2 -; 
CHECK-NEXT: xxlxor v6, v6, v6 -; CHECK-NEXT: xvmaddadp v8, vs6, v8 -; CHECK-NEXT: xvmaddadp v1, vs4, vs1 -; CHECK-NEXT: xvmuldp v0, vs4, v2 -; CHECK-NEXT: xvmaddadp v1, v2, v2 -; CHECK-NEXT: xvmaddadp v0, v2, v2 -; CHECK-NEXT: lxvdsx v4, r6, r4 -; CHECK-NEXT: xvmaddadp v6, vs5, v6 +; CHECK-NEXT: xvmaddadp v9, vs1, v2 +; CHECK-NEXT: xxlxor v8, v8, v8 +; CHECK-NEXT: xvmaddadp v10, vs2, v10 +; CHECK-NEXT: xvmaddadp v7, vs0, v5 +; CHECK-NEXT: xvmuldp v6, vs0, v2 +; CHECK-NEXT: xvmaddadp v7, v2, v2 +; CHECK-NEXT: xvmaddadp v6, v2, v2 +; CHECK-NEXT: lxvdsx v14, r6, r4 +; CHECK-NEXT: xvmaddadp v8, vs1, v8 ; CHECK-NEXT: li r6, 0 -; CHECK-NEXT: xvmuldp v9, vs6, v4 -; CHECK-NEXT: xvmuldp v3, vs5, v4 -; CHECK-NEXT: xvmuldp v11, vs0, v4 -; CHECK-NEXT: vmr v10, v2 -; CHECK-NEXT: xvmuldp v5, v4, v2 -; CHECK-NEXT: vmr v4, v2 -; CHECK-NEXT: xxlor vs18, v8, v8 +; CHECK-NEXT: xvmuldp v11, vs2, v14 +; CHECK-NEXT: xvmuldp v3, vs1, v14 +; CHECK-NEXT: xvmuldp vs5, v14, v2 +; CHECK-NEXT: xvmuldp v13, v4, v14 +; CHECK-NEXT: vmr v12, v2 +; CHECK-NEXT: xxlor vs14, v10, v10 +; CHECK-NEXT: xxlor vs0, v2, v2 ; CHECK-NEXT: xxlor vs4, v2, v2 -; CHECK-NEXT: xxlor vs12, v10, v10 -; CHECK-NEXT: xxlor vs13, v11, v11 -; CHECK-NEXT: xxlor v10, vs3, vs3 -; CHECK-NEXT: xxlor vs8, v4, v4 -; CHECK-NEXT: xxlor vs9, v5, v5 -; CHECK-NEXT: xxlor vs10, v0, v0 -; CHECK-NEXT: xxlor vs11, v1, v1 -; CHECK-NEXT: xxmtacc acc2 -; CHECK-NEXT: xxlor vs19, v9, v9 -; CHECK-NEXT: vmr v8, v2 -; CHECK-NEXT: xxlor vs5, v3, v3 +; CHECK-NEXT: # kill: def $vsrp2 killed $vsrp2 def $uacc1 ; CHECK-NEXT: xxlor vs6, v6, v6 ; CHECK-NEXT: xxlor vs7, v7, v7 -; CHECK-NEXT: xxlor vs14, v10, v10 +; CHECK-NEXT: xxlor vs8, v12, v12 +; CHECK-NEXT: xxlor vs9, v13, v13 +; CHECK-NEXT: vmr v12, v1 ; CHECK-NEXT: xxlor vs15, v11, v11 -; CHECK-NEXT: xxlor vs16, v8, v8 -; CHECK-NEXT: xxlor vs17, v9, v9 +; CHECK-NEXT: vmr v10, v2 +; CHECK-NEXT: xxlor vs1, v3, v3 +; CHECK-NEXT: xxlor vs2, v8, v8 +; CHECK-NEXT: xxlor vs3, v9, v9 +; CHECK-NEXT: xxlor 
vs10, v12, v12 +; CHECK-NEXT: xxlor vs11, v13, v13 ; CHECK-NEXT: xxmtacc acc1 +; CHECK-NEXT: xxlor vs12, v10, v10 +; CHECK-NEXT: xxlor vs13, v11, v11 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xxmtacc acc2 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xxmtacc acc3 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xxmtacc acc4 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 ; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 -; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: xxmfacc acc1 ; CHECK-NEXT: xxmfacc acc2 ; CHECK-NEXT: xxmfacc acc3 -; CHECK-NEXT: xxmfacc acc4 -; CHECK-NEXT: stxv vs5, 0(r3) -; CHECK-NEXT: stxv vs13, 32(r3) -; CHECK-NEXT: stxv vs8, 16(0) -; CHECK-NEXT: stxv 
vs16, 48(0) +; CHECK-NEXT: stxv vs1, 0(r3) +; CHECK-NEXT: stxv vs9, 32(r3) +; CHECK-NEXT: stxv vs4, 16(0) +; CHECK-NEXT: stxv vs12, 48(0) ; CHECK-NEXT: b .LBB0_1 ; ; TRACKLIVE-LABEL: acc_regalloc: ; TRACKLIVE: # %bb.0: # %bb ; TRACKLIVE-NEXT: lwz r3, 0(r3) -; TRACKLIVE-NEXT: lxv vs0, 0(0) -; TRACKLIVE-NEXT: xxlxor vs2, vs2, vs2 -; TRACKLIVE-NEXT: xxlxor vs3, vs3, vs3 +; TRACKLIVE-NEXT: lxv v4, 0(0) +; TRACKLIVE-NEXT: xxlxor v0, v0, v0 +; TRACKLIVE-NEXT: xxlxor v1, v1, v1 ; TRACKLIVE-NEXT: stfd f14, -144(r1) # 8-byte Folded Spill ; TRACKLIVE-NEXT: stfd f15, -136(r1) # 8-byte Folded Spill ; TRACKLIVE-NEXT: xxlxor v2, v2, v2 ; TRACKLIVE-NEXT: li r6, 1 ; TRACKLIVE-NEXT: li r4, 16 -; TRACKLIVE-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill -; TRACKLIVE-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill ; TRACKLIVE-NEXT: extswsli r3, r3, 3 -; TRACKLIVE-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill -; TRACKLIVE-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill -; TRACKLIVE-NEXT: xvmaddadp vs3, vs0, vs3 -; TRACKLIVE-NEXT: lxvdsx vs1, 0, r3 -; TRACKLIVE-NEXT: xvmaddadp vs2, vs1, vs2 +; TRACKLIVE-NEXT: xvmaddadp v1, v4, v1 +; TRACKLIVE-NEXT: lxvdsx v5, 0, r3 +; TRACKLIVE-NEXT: xvmaddadp v0, v5, v0 ; TRACKLIVE-NEXT: .p2align 4 ; TRACKLIVE-NEXT: .LBB0_1: # %bb9 ; TRACKLIVE-NEXT: # ; TRACKLIVE-NEXT: addi r6, r6, 2 -; TRACKLIVE-NEXT: lxv vs4, 16(0) -; TRACKLIVE-NEXT: xxlxor v1, v1, v1 -; TRACKLIVE-NEXT: lxv vs6, -16(r5) -; TRACKLIVE-NEXT: lxv vs5, -64(r5) -; TRACKLIVE-NEXT: xxlxor v8, v8, v8 -; TRACKLIVE-NEXT: xxlor v7, vs2, vs2 -; TRACKLIVE-NEXT: xxlxor v6, v6, v6 +; TRACKLIVE-NEXT: lxv vs0, 16(0) +; TRACKLIVE-NEXT: xxlxor vs7, vs7, vs7 +; TRACKLIVE-NEXT: lxv vs1, -64(r5) +; TRACKLIVE-NEXT: lxv vs4, -16(r5) +; TRACKLIVE-NEXT: xxlxor vs12, vs12, vs12 +; TRACKLIVE-NEXT: xxlor vs3, v0, v0 +; TRACKLIVE-NEXT: xxlxor vs2, vs2, vs2 ; TRACKLIVE-NEXT: mulld r6, r6, r3 -; TRACKLIVE-NEXT: vmr v10, v2 -; TRACKLIVE-NEXT: xxlor vs8, v10, v10 -; TRACKLIVE-NEXT: xvmaddadp v1, vs4, vs1 -; 
TRACKLIVE-NEXT: xvmuldp v0, vs4, v2 -; TRACKLIVE-NEXT: xvmaddadp v8, vs6, v8 -; TRACKLIVE-NEXT: xvmaddadp v7, vs5, v2 -; TRACKLIVE-NEXT: xvmaddadp v6, vs5, v6 -; TRACKLIVE-NEXT: xxlor vs4, v2, v2 -; TRACKLIVE-NEXT: lxvdsx v4, r6, r4 +; TRACKLIVE-NEXT: xxlor vs10, v2, v2 +; TRACKLIVE-NEXT: xxlor vs8, vs10, vs10 +; TRACKLIVE-NEXT: xxlor vs10, v1, v1 +; TRACKLIVE-NEXT: xvmaddadp vs7, vs0, v5 +; TRACKLIVE-NEXT: xvmuldp vs6, vs0, v2 +; TRACKLIVE-NEXT: xvmaddadp vs12, vs4, vs12 +; TRACKLIVE-NEXT: xvmaddadp vs3, vs1, v2 +; TRACKLIVE-NEXT: xvmaddadp vs2, vs1, vs2 +; TRACKLIVE-NEXT: xxlor vs0, v2, v2 +; TRACKLIVE-NEXT: lxvdsx v6, r6, r4 ; TRACKLIVE-NEXT: li r6, 0 -; TRACKLIVE-NEXT: xvmaddadp v1, v2, v2 -; TRACKLIVE-NEXT: xvmaddadp v0, v2, v2 -; TRACKLIVE-NEXT: xxlor vs18, v8, v8 -; TRACKLIVE-NEXT: vmr v8, v2 -; TRACKLIVE-NEXT: xxlor vs7, v7, v7 -; TRACKLIVE-NEXT: xxlor vs16, v8, v8 -; TRACKLIVE-NEXT: xvmuldp v3, vs5, v4 -; TRACKLIVE-NEXT: xvmuldp v5, vs0, v4 -; TRACKLIVE-NEXT: xvmuldp v9, vs6, v4 -; TRACKLIVE-NEXT: xvmuldp v11, v4, v2 -; TRACKLIVE-NEXT: vmr v4, v2 -; TRACKLIVE-NEXT: xxlor vs6, v6, v6 -; TRACKLIVE-NEXT: xxlor vs12, v4, v4 -; TRACKLIVE-NEXT: xxlor v4, vs3, vs3 -; TRACKLIVE-NEXT: xxlor vs10, v0, v0 -; TRACKLIVE-NEXT: xxlor vs11, v1, v1 -; TRACKLIVE-NEXT: xxlor vs14, v4, v4 -; TRACKLIVE-NEXT: xxlor vs5, v3, v3 -; TRACKLIVE-NEXT: xxlor vs9, v11, v11 -; TRACKLIVE-NEXT: xxlor vs13, v5, v5 -; TRACKLIVE-NEXT: xxlor vs15, v5, v5 -; TRACKLIVE-NEXT: xxlor vs19, v9, v9 -; TRACKLIVE-NEXT: xxlor vs17, v9, v9 +; TRACKLIVE-NEXT: xvmaddadp vs7, v2, v2 +; TRACKLIVE-NEXT: xvmaddadp vs6, v2, v2 +; TRACKLIVE-NEXT: xxlor vs14, vs12, vs12 +; TRACKLIVE-NEXT: xxlor vs12, v2, v2 +; TRACKLIVE-NEXT: xvmuldp v3, vs1, v6 +; TRACKLIVE-NEXT: xvmuldp vs11, v4, v6 +; TRACKLIVE-NEXT: xvmuldp vs13, vs4, v6 +; TRACKLIVE-NEXT: xvmuldp vs5, v6, v2 +; TRACKLIVE-NEXT: xxlor vs4, v2, v2 +; TRACKLIVE-NEXT: xxlor vs1, v3, v3 +; TRACKLIVE-NEXT: xxlor vs9, vs11, vs11 +; TRACKLIVE-NEXT: xxlor vs15, vs13, 
vs13 ; TRACKLIVE-NEXT: xxmtacc acc1 +; TRACKLIVE-NEXT: xxmtacc acc0 ; TRACKLIVE-NEXT: xxmtacc acc2 ; TRACKLIVE-NEXT: xxmtacc acc3 -; TRACKLIVE-NEXT: xxmtacc acc4 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc0, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 ; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 -; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xxmfacc acc0 ; TRACKLIVE-NEXT: xxmfacc acc1 ; TRACKLIVE-NEXT: xxmfacc acc2 ; TRACKLIVE-NEXT: xxmfacc acc3 -; TRACKLIVE-NEXT: xxmfacc acc4 -; TRACKLIVE-NEXT: stxv vs5, 0(r3) -; TRACKLIVE-NEXT: 
stxv vs13, 32(r3) -; TRACKLIVE-NEXT: stxv vs8, 16(0) -; TRACKLIVE-NEXT: stxv vs16, 48(0) +; TRACKLIVE-NEXT: stxv vs1, 0(r3) +; TRACKLIVE-NEXT: stxv vs9, 32(r3) +; TRACKLIVE-NEXT: stxv vs4, 16(0) +; TRACKLIVE-NEXT: stxv vs12, 48(0) ; TRACKLIVE-NEXT: b .LBB0_1 bb: %i = load i32, i32* %arg, align 4