diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -871,6 +871,10 @@ /// (3) Bottom-up allocation is no longer guaranteed to optimally color. virtual bool reverseLocalAssignment() const { return false; } + /// Add the allocation priority to global and split ranges as well as the + /// local ranges when registers are added to the queue. + virtual bool addAllocPriorityToGlobalRanges() const { return false; } + /// Allow the target to override the cost of using a callee-saved register for /// the first time. Default value of 0 means we will use a callee-saved /// register if it is available. diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -745,6 +745,7 @@ // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. bool ReverseLocal = TRI->reverseLocalAssignment(); + bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges(); const TargetRegisterClass &RC = *MRI->getRegClass(Reg); bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs()); @@ -768,6 +769,9 @@ // don't fit should be spilled (or split) ASAP so they don't create // interference. Mark a bit to prioritize global above local ranges. Prio = (1u << 29) + Size; + + if (AddPriorityToGlobal) + Prio |= RC.AllocationPriority << 24; } // Mark a higher bit to prioritize global and local above RS_Split. 
Prio |= (1u << 31); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -94,6 +94,11 @@ bool isCallerPreservedPhysReg(MCRegister PhysReg, const MachineFunction &MF) const override; + bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; + /// We require the register scavenger. bool requiresRegisterScavenging(const MachineFunction &MF) const override { return true; } @@ -137,6 +142,8 @@ unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + bool addAllocPriorityToGlobalRanges() const override { return true; } + // Support for virtual base registers. bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -472,6 +472,54 @@ return false; } +bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, + ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + + bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM, Matrix); + for (MachineInstr &Use : MRI->reg_nodbg_instructions(VirtReg)) { + const MachineOperand *ResultOp = nullptr; + Register ResultReg; + switch (Use.getOpcode()) { + case TargetOpcode::COPY: { + ResultOp = &Use.getOperand(0); + ResultReg = ResultOp->getReg(); + if (Register::isVirtualRegister(ResultReg) && + MRI->getRegClass(ResultReg)->contains(PPC::UACC0) && + VRM->hasPhys(ResultReg)) { + Register 
UACCPhys = VRM->getPhys(ResultReg); + Register HintReg = getSubReg(UACCPhys, ResultOp->getSubReg()); + Hints.push_back(HintReg); + // We don't set BaseImplRetVal here. We only want to consider this + // register first, not force the register allocator to use it. + } + break; + } + case PPC::BUILD_UACC: { + ResultOp = &Use.getOperand(0); + ResultReg = ResultOp->getReg(); + if (MRI->getRegClass(ResultReg)->contains(PPC::ACC0) && + VRM->hasPhys(ResultReg)) { + Register ACCPhys = VRM->getPhys(ResultReg); + // If we are working with an ACC type register give the corresponding + // UACC register as a hint. + if (ACCPhys >= PPC::ACC0 && ACCPhys <= PPC::ACC7) { + Register HintReg = PPC::UACC0 + (ACCPhys - PPC::ACC0); + Hints.push_back(HintReg); + } + } + break; + } + } + } + return BaseImplRetVal; +} + unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -449,6 +449,7 @@ } def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3, ACC4, ACC5, ACC6, ACC7)> { + let AllocationPriority = 63; let Size = 512; } @@ -465,6 +466,7 @@ def UACCRC : RegisterClass<"PPC", [v512i1], 128, (add UACC0, UACC1, UACC2, UACC3, UACC4, UACC5, UACC6, UACC7)> { + let AllocationPriority = 36; let Size = 512; } @@ -476,6 +478,7 @@ VSRp16, VSRp19, VSRp20, VSRp21, VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30, VSRp29, VSRp28, VSRp27, VSRp26)> { + let AllocationPriority = 2; let Size = 256; } diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll @@ -28,12 +28,12 @@ ; CHECK-NEXT: .LBB0_2: # %_loop_1_do_ ; 
CHECK-NEXT: # ; CHECK-NEXT: lxvp vsp2, 0(r3) -; CHECK-NEXT: lxvp vsp4, 32(r3) +; CHECK-NEXT: xvadddp vs4, vs0, vs3 +; CHECK-NEXT: lxvp vsp0, 32(r3) ; CHECK-NEXT: addi r3, r3, 128 -; CHECK-NEXT: xvadddp vs0, vs0, vs3 -; CHECK-NEXT: xvadddp vs0, vs0, vs2 -; CHECK-NEXT: xvadddp vs0, vs0, vs5 -; CHECK-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-NEXT: xvadddp vs2, vs4, vs2 +; CHECK-NEXT: xvadddp vs2, vs2, vs1 +; CHECK-NEXT: xvadddp vs0, vs2, vs0 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge ; CHECK-NEXT: stxv vs0, 0(r6) @@ -55,12 +55,12 @@ ; CHECK-BE-NEXT: .LBB0_2: # %_loop_1_do_ ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: lxvp vsp2, 0(r3) -; CHECK-BE-NEXT: lxvp vsp4, 32(r3) +; CHECK-BE-NEXT: xvadddp vs4, vs0, vs2 +; CHECK-BE-NEXT: lxvp vsp0, 32(r3) ; CHECK-BE-NEXT: addi r3, r3, 128 -; CHECK-BE-NEXT: xvadddp vs0, vs0, vs2 -; CHECK-BE-NEXT: xvadddp vs0, vs0, vs3 -; CHECK-BE-NEXT: xvadddp vs0, vs0, vs4 -; CHECK-BE-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-BE-NEXT: xvadddp vs2, vs4, vs3 +; CHECK-BE-NEXT: xvadddp vs2, vs2, vs0 +; CHECK-BE-NEXT: xvadddp vs0, vs2, vs1 ; CHECK-BE-NEXT: bdnz .LBB0_2 ; CHECK-BE-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge ; CHECK-BE-NEXT: stxv vs0, 0(r6) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -11,16 +11,12 @@ define void @ass_acc(<512 x i1>* %ptr, <16 x i8> %vc) { ; CHECK-LABEL: ass_acc: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxlor vs1, v2, v2 -; CHECK-NEXT: xxlor vs0, vs1, vs1 -; CHECK-NEXT: xxlor vs4, vs0, vs0 -; CHECK-NEXT: xxlor vs5, vs1, vs1 -; CHECK-NEXT: xxlor vs6, vs0, vs0 -; CHECK-NEXT: xxlor vs7, vs1, vs1 +; CHECK-NEXT: xxlor vs5, v2, v2 +; CHECK-NEXT: xxlor vs4, vs5, vs5 ; CHECK-NEXT: xxlor vs0, vs4, vs4 ; CHECK-NEXT: xxlor vs1, vs5, vs5 -; CHECK-NEXT: xxlor vs2, vs6, vs6 -; CHECK-NEXT: xxlor vs3, vs7, vs7 +; CHECK-NEXT: xxlor 
vs2, vs4, vs4 +; CHECK-NEXT: xxlor vs3, vs5, vs5 ; CHECK-NEXT: stxv vs0, 48(r3) ; CHECK-NEXT: stxv vs1, 32(r3) ; CHECK-NEXT: stxv vs2, 16(r3) @@ -29,16 +25,12 @@ ; ; CHECK-BE-LABEL: ass_acc: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxlor vs1, v2, v2 -; CHECK-BE-NEXT: xxlor vs0, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs4, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs5, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs6, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs7, vs1, vs1 +; CHECK-BE-NEXT: xxlor vs5, v2, v2 +; CHECK-BE-NEXT: xxlor vs4, vs5, vs5 ; CHECK-BE-NEXT: xxlor vs0, vs4, vs4 ; CHECK-BE-NEXT: xxlor vs1, vs5, vs5 -; CHECK-BE-NEXT: xxlor vs2, vs6, vs6 -; CHECK-BE-NEXT: xxlor vs3, vs7, vs7 +; CHECK-BE-NEXT: xxlor vs2, vs4, vs4 +; CHECK-BE-NEXT: xxlor vs3, vs5, vs5 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) @@ -55,16 +47,12 @@ define void @int_xxmtacc(<512 x i1>* %ptr, <16 x i8> %vc) { ; CHECK-LABEL: int_xxmtacc: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxlor vs1, v2, v2 -; CHECK-NEXT: xxlor vs0, vs1, vs1 -; CHECK-NEXT: xxlor vs4, vs0, vs0 -; CHECK-NEXT: xxlor vs5, vs1, vs1 -; CHECK-NEXT: xxlor vs6, vs0, vs0 -; CHECK-NEXT: xxlor vs7, vs1, vs1 +; CHECK-NEXT: xxlor vs5, v2, v2 +; CHECK-NEXT: xxlor vs4, vs5, vs5 ; CHECK-NEXT: xxlor vs0, vs4, vs4 ; CHECK-NEXT: xxlor vs1, vs5, vs5 -; CHECK-NEXT: xxlor vs2, vs6, vs6 -; CHECK-NEXT: xxlor vs3, vs7, vs7 +; CHECK-NEXT: xxlor vs2, vs4, vs4 +; CHECK-NEXT: xxlor vs3, vs5, vs5 ; CHECK-NEXT: xxmtacc acc0 ; CHECK-NEXT: stxv vs0, 48(r3) ; CHECK-NEXT: stxv vs1, 32(r3) @@ -74,16 +62,12 @@ ; ; CHECK-BE-LABEL: int_xxmtacc: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxlor vs1, v2, v2 -; CHECK-BE-NEXT: xxlor vs0, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs4, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs5, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs6, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs7, vs1, vs1 +; CHECK-BE-NEXT: xxlor vs5, v2, v2 +; CHECK-BE-NEXT: xxlor vs4, vs5, vs5 ; CHECK-BE-NEXT: xxlor vs0, vs4, vs4 ; CHECK-BE-NEXT: xxlor 
vs1, vs5, vs5 -; CHECK-BE-NEXT: xxlor vs2, vs6, vs6 -; CHECK-BE-NEXT: xxlor vs3, vs7, vs7 +; CHECK-BE-NEXT: xxlor vs2, vs4, vs4 +; CHECK-BE-NEXT: xxlor vs3, vs5, vs5 ; CHECK-BE-NEXT: xxmtacc acc0 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs0, 0(r3) @@ -104,16 +88,12 @@ define void @int_xxmfacc(<512 x i1>* %ptr, <16 x i8> %vc) { ; CHECK-LABEL: int_xxmfacc: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxlor vs1, v2, v2 -; CHECK-NEXT: xxlor vs0, vs1, vs1 -; CHECK-NEXT: xxlor vs4, vs0, vs0 -; CHECK-NEXT: xxlor vs5, vs1, vs1 -; CHECK-NEXT: xxlor vs6, vs0, vs0 -; CHECK-NEXT: xxlor vs7, vs1, vs1 +; CHECK-NEXT: xxlor vs5, v2, v2 +; CHECK-NEXT: xxlor vs4, vs5, vs5 ; CHECK-NEXT: xxlor vs0, vs4, vs4 ; CHECK-NEXT: xxlor vs1, vs5, vs5 -; CHECK-NEXT: xxlor vs2, vs6, vs6 -; CHECK-NEXT: xxlor vs3, vs7, vs7 +; CHECK-NEXT: xxlor vs2, vs4, vs4 +; CHECK-NEXT: xxlor vs3, vs5, vs5 ; CHECK-NEXT: stxv vs0, 48(r3) ; CHECK-NEXT: stxv vs1, 32(r3) ; CHECK-NEXT: stxv vs2, 16(r3) @@ -122,16 +102,12 @@ ; ; CHECK-BE-LABEL: int_xxmfacc: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxlor vs1, v2, v2 -; CHECK-BE-NEXT: xxlor vs0, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs4, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs5, vs1, vs1 -; CHECK-BE-NEXT: xxlor vs6, vs0, vs0 -; CHECK-BE-NEXT: xxlor vs7, vs1, vs1 +; CHECK-BE-NEXT: xxlor vs5, v2, v2 +; CHECK-BE-NEXT: xxlor vs4, vs5, vs5 ; CHECK-BE-NEXT: xxlor vs0, vs4, vs4 ; CHECK-BE-NEXT: xxlor vs1, vs5, vs5 -; CHECK-BE-NEXT: xxlor vs2, vs6, vs6 -; CHECK-BE-NEXT: xxlor vs3, vs7, vs7 +; CHECK-BE-NEXT: xxlor vs2, vs4, vs4 +; CHECK-BE-NEXT: xxlor vs3, vs5, vs5 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; 
RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -mcpu=pwr10 -ppc-track-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -mcpu=pwr10 -ppc-track-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -11,22 +11,21 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { ; CHECK-LABEL: intrinsics1: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 ; CHECK-NEXT: xxlor vs1, v4, v4 ; CHECK-NEXT: vmr v4, v3 ; CHECK-NEXT: ld r3, 96(r1) ; CHECK-NEXT: xxlor vs0, v2, v2 +; CHECK-NEXT: xxlor vs7, v5, v5 ; CHECK-NEXT: xxlor vs3, v2, v2 ; CHECK-NEXT: xxlor vs2, v5, v5 ; CHECK-NEXT: xxlor vs4, vs0, vs0 ; CHECK-NEXT: xxlor vs5, vs1, vs1 ; CHECK-NEXT: xxlor vs6, v4, v4 -; CHECK-NEXT: xxlor vs7, v5, v5 ; CHECK-NEXT: xxmtacc acc1 ; CHECK-NEXT: xvi4ger8pp acc1, v2, v3 ; CHECK-NEXT: xvf16ger2pp acc1, v2, vs1 ; CHECK-NEXT: pmxvf32gerpn acc1, v3, v5, 0, 0 -; CHECK-NEXT: pmxvf64gernp acc1, vsp2, vs0, 0, 0 +; CHECK-NEXT: pmxvf64gernp acc1, vsp2, v2, 0, 0 ; CHECK-NEXT: xxmfacc acc1 ; CHECK-NEXT: stxv vs4, 48(r3) ; CHECK-NEXT: stxv vs5, 32(r3) @@ -36,22 +35,21 @@ ; ; CHECK-BE-LABEL: intrinsics1: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 ; CHECK-BE-NEXT: xxlor vs1, v4, v4 ; CHECK-BE-NEXT: vmr v4, v3 ; CHECK-BE-NEXT: ld r3, 112(r1) ; CHECK-BE-NEXT: xxlor vs0, v2, v2 +; CHECK-BE-NEXT: xxlor vs7, v5, v5 ; CHECK-BE-NEXT: xxlor vs3, v2, v2 ; CHECK-BE-NEXT: xxlor vs2, v5, v5 ; CHECK-BE-NEXT: xxlor vs4, vs0, vs0 ; CHECK-BE-NEXT: xxlor vs5, vs1, vs1 ; CHECK-BE-NEXT: xxlor vs6, v4, 
v4 -; CHECK-BE-NEXT: xxlor vs7, v5, v5 ; CHECK-BE-NEXT: xxmtacc acc1 ; CHECK-BE-NEXT: xvi4ger8pp acc1, v2, v3 ; CHECK-BE-NEXT: xvf16ger2pp acc1, v2, vs1 ; CHECK-BE-NEXT: pmxvf32gerpn acc1, v3, v5, 0, 0 -; CHECK-BE-NEXT: pmxvf64gernp acc1, vsp2, vs0, 0, 0 +; CHECK-BE-NEXT: pmxvf64gernp acc1, vsp2, v2, 0, 0 ; CHECK-BE-NEXT: xxmfacc acc1 ; CHECK-BE-NEXT: stxv vs5, 16(r3) ; CHECK-BE-NEXT: stxvx vs4, 0, r3 @@ -78,10 +76,10 @@ ; CHECK-NEXT: lxv vs6, 0(r5) ; CHECK-NEXT: lxv vs7, 0(r6) ; CHECK-NEXT: xxlor vs0, vs4, vs4 -; CHECK-NEXT: xxlor vs9, vs4, vs4 ; CHECK-NEXT: xxlor vs1, vs5, vs5 ; CHECK-NEXT: xxlor vs2, vs6, vs6 ; CHECK-NEXT: xxlor vs3, vs7, vs7 +; CHECK-NEXT: xxlor vs9, vs4, vs4 ; CHECK-NEXT: xxlor vs8, vs7, vs7 ; CHECK-NEXT: xxmtacc acc0 ; CHECK-NEXT: xvi8ger4pp acc0, vs4, vs5 @@ -102,10 +100,10 @@ ; CHECK-BE-NEXT: lxv vs6, 0(r5) ; CHECK-BE-NEXT: lxv vs7, 0(r6) ; CHECK-BE-NEXT: xxlor vs0, vs4, vs4 -; CHECK-BE-NEXT: xxlor vs9, vs4, vs4 ; CHECK-BE-NEXT: xxlor vs1, vs5, vs5 ; CHECK-BE-NEXT: xxlor vs2, vs6, vs6 ; CHECK-BE-NEXT: xxlor vs3, vs7, vs7 +; CHECK-BE-NEXT: xxlor vs9, vs4, vs4 ; CHECK-BE-NEXT: xxlor vs8, vs7, vs7 ; CHECK-BE-NEXT: xxmtacc acc0 ; CHECK-BE-NEXT: xvi8ger4pp acc0, vs4, vs5 @@ -1368,26 +1366,26 @@ define void @test33(i8* %vqp, i8* %vpp, <16 x i8> %vc, i8* %resp) { ; CHECK-LABEL: test33: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxv vs1, 0(r4) -; CHECK-NEXT: lxv vs0, 16(r4) -; CHECK-NEXT: xvf64ger acc1, vsp0, v2 -; CHECK-NEXT: xxmfacc acc1 -; CHECK-NEXT: stxv vs4, 48(r7) -; CHECK-NEXT: stxv vs5, 32(r7) -; CHECK-NEXT: stxv vs6, 16(r7) -; CHECK-NEXT: stxv vs7, 0(r7) +; CHECK-NEXT: lxv vs5, 0(r4) +; CHECK-NEXT: lxv vs4, 16(r4) +; CHECK-NEXT: xvf64ger acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test33: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs1, 16(r4) -; CHECK-BE-NEXT: lxv vs0, 
0(r4) -; CHECK-BE-NEXT: xvf64ger acc1, vsp0, v2 -; CHECK-BE-NEXT: xxmfacc acc1 -; CHECK-BE-NEXT: stxv vs5, 16(r7) -; CHECK-BE-NEXT: stxv vs4, 0(r7) -; CHECK-BE-NEXT: stxv vs7, 48(r7) -; CHECK-BE-NEXT: stxv vs6, 32(r7) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: xvf64ger acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr entry: %0 = bitcast i8* %vpp to <256 x i1>* @@ -1596,26 +1594,26 @@ define void @test38(i8* %vqp, i8* %vpp, <16 x i8> %vc, i8* %resp) { ; CHECK-LABEL: test38: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxv vs1, 0(r4) -; CHECK-NEXT: lxv vs0, 16(r4) -; CHECK-NEXT: pmxvf64ger acc1, vsp0, v2, 0, 0 -; CHECK-NEXT: xxmfacc acc1 -; CHECK-NEXT: stxv vs4, 48(r7) -; CHECK-NEXT: stxv vs5, 32(r7) -; CHECK-NEXT: stxv vs6, 16(r7) -; CHECK-NEXT: stxv vs7, 0(r7) +; CHECK-NEXT: lxv vs5, 0(r4) +; CHECK-NEXT: lxv vs4, 16(r4) +; CHECK-NEXT: pmxvf64ger acc0, vsp4, v2, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test38: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs1, 16(r4) -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: pmxvf64ger acc1, vsp0, v2, 0, 0 -; CHECK-BE-NEXT: xxmfacc acc1 -; CHECK-BE-NEXT: stxv vs5, 16(r7) -; CHECK-BE-NEXT: stxv vs4, 0(r7) -; CHECK-BE-NEXT: stxv vs7, 48(r7) -; CHECK-BE-NEXT: stxv vs6, 32(r7) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: pmxvf64ger acc0, vsp4, v2, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr entry: %0 = bitcast i8* %vpp to <256 x i1>* diff --git 
a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -116,42 +116,42 @@ ; CHECK-NEXT: ld 4, 624(1) ; CHECK-NEXT: std 19, 96(1) # 8-byte Folded Spill ; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 11, 0(4) +; CHECK-NEXT: lxv 13, 0(4) ; CHECK-NEXT: mr 4, 5 ; CHECK-NEXT: ld 5, 216(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 15, 576(1) ; CHECK-NEXT: sldi 31, 3, 1 ; CHECK-NEXT: std 8, 32(1) # 8-byte Folded Spill ; CHECK-NEXT: std 9, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 41, 0(8) +; CHECK-NEXT: lxv 43, 0(8) ; CHECK-NEXT: mr 8, 6 ; CHECK-NEXT: sldi 6, 3, 3 ; CHECK-NEXT: std 2, 144(1) # 8-byte Folded Spill ; CHECK-NEXT: std 11, 152(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 3, 0(2) -; CHECK-NEXT: lxv 2, 0(11) -; CHECK-NEXT: lxv 0, 0(7) +; CHECK-NEXT: lxv 5, 0(2) +; CHECK-NEXT: lxv 4, 0(11) +; CHECK-NEXT: lxv 2, 0(7) ; CHECK-NEXT: add 6, 6, 23 -; CHECK-NEXT: lxv 7, 0(28) +; CHECK-NEXT: lxv 9, 0(28) ; CHECK-NEXT: add 28, 3, 31 -; CHECK-NEXT: lxv 40, 0(9) -; CHECK-NEXT: lxv 39, 0(10) -; CHECK-NEXT: lxv 38, 0(15) -; CHECK-NEXT: lxv 33, 0(14) -; CHECK-NEXT: lxv 32, 0(16) -; CHECK-NEXT: lxv 37, 0(17) -; CHECK-NEXT: lxv 35, 0(18) -; CHECK-NEXT: lxv 13, 0(19) -; CHECK-NEXT: lxv 10, 0(20) -; CHECK-NEXT: lxv 8, 0(21) -; CHECK-NEXT: lxv 6, 0(22) -; CHECK-NEXT: lxv 4, 0(30) -; CHECK-NEXT: lxv 1, 0(12) -; CHECK-NEXT: lxv 36, 0(24) -; CHECK-NEXT: lxv 34, 0(25) -; CHECK-NEXT: lxv 12, 0(26) -; CHECK-NEXT: lxv 9, 0(27) -; CHECK-NEXT: lxv 5, 0(29) +; CHECK-NEXT: lxv 42, 0(9) +; CHECK-NEXT: lxv 41, 0(10) +; CHECK-NEXT: lxv 40, 0(15) +; CHECK-NEXT: lxv 39, 0(14) +; CHECK-NEXT: lxv 38, 0(16) +; CHECK-NEXT: lxv 33, 0(17) +; CHECK-NEXT: lxv 37, 0(18) +; CHECK-NEXT: lxv 35, 0(19) +; CHECK-NEXT: lxv 12, 0(20) +; CHECK-NEXT: lxv 10, 0(21) +; CHECK-NEXT: lxv 8, 0(22) +; CHECK-NEXT: lxv 6, 0(30) +; 
CHECK-NEXT: lxv 3, 0(12) +; CHECK-NEXT: lxv 32, 0(24) +; CHECK-NEXT: lxv 36, 0(25) +; CHECK-NEXT: lxv 34, 0(26) +; CHECK-NEXT: lxv 11, 0(27) +; CHECK-NEXT: lxv 7, 0(29) ; CHECK-NEXT: addi 5, 5, -2 ; CHECK-NEXT: sldi 11, 3, 4 ; CHECK-NEXT: std 12, 160(1) # 8-byte Folded Spill @@ -216,7 +216,7 @@ ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lxvp 42, 0(6) +; CHECK-NEXT: lxvp 0, 0(6) ; CHECK-NEXT: lxvp 44, 0(16) ; CHECK-NEXT: lxvp 46, 0(17) ; CHECK-NEXT: lxvp 48, 0(18) @@ -233,34 +233,34 @@ ; CHECK-NEXT: addi 17, 17, 64 ; CHECK-NEXT: addi 18, 18, 64 ; CHECK-NEXT: addi 19, 19, 64 -; CHECK-NEXT: xvmaddadp 41, 45, 43 -; CHECK-NEXT: xvmaddadp 40, 47, 43 -; CHECK-NEXT: xvmaddadp 39, 49, 43 -; CHECK-NEXT: xvmaddadp 38, 51, 43 -; CHECK-NEXT: xvmaddadp 33, 31, 43 -; CHECK-NEXT: xvmaddadp 32, 29, 43 -; CHECK-NEXT: xvmaddadp 37, 44, 42 -; CHECK-NEXT: xvmaddadp 35, 46, 42 -; CHECK-NEXT: xvmaddadp 13, 48, 42 -; CHECK-NEXT: xvmaddadp 11, 50, 42 -; CHECK-NEXT: xvmaddadp 10, 30, 42 -; CHECK-NEXT: xvmaddadp 8, 28, 42 -; CHECK-NEXT: lxvp 42, 32(20) +; CHECK-NEXT: xvmaddadp 43, 45, 1 +; CHECK-NEXT: xvmaddadp 42, 47, 1 +; CHECK-NEXT: xvmaddadp 41, 49, 1 +; CHECK-NEXT: xvmaddadp 40, 51, 1 +; CHECK-NEXT: xvmaddadp 39, 31, 1 +; CHECK-NEXT: xvmaddadp 38, 29, 1 +; CHECK-NEXT: xvmaddadp 33, 44, 0 +; CHECK-NEXT: xvmaddadp 37, 46, 0 +; CHECK-NEXT: xvmaddadp 35, 48, 0 +; CHECK-NEXT: xvmaddadp 13, 50, 0 +; CHECK-NEXT: xvmaddadp 12, 30, 0 +; CHECK-NEXT: xvmaddadp 10, 28, 0 +; CHECK-NEXT: lxvp 0, 32(20) ; CHECK-NEXT: lxvp 44, 32(21) ; CHECK-NEXT: addi 20, 20, 64 ; CHECK-NEXT: addi 21, 21, 64 -; CHECK-NEXT: xvmaddadp 6, 25, 27 -; CHECK-NEXT: xvmaddadp 4, 23, 27 -; CHECK-NEXT: xvmaddadp 3, 21, 27 -; CHECK-NEXT: xvmaddadp 2, 19, 27 -; CHECK-NEXT: xvmaddadp 36, 24, 26 -; CHECK-NEXT: xvmaddadp 34, 22, 26 -; CHECK-NEXT: xvmaddadp 12, 20, 26 -; CHECK-NEXT: xvmaddadp 9, 18, 26 -; CHECK-NEXT: xvmaddadp 1, 43, 
27 -; CHECK-NEXT: xvmaddadp 0, 45, 27 -; CHECK-NEXT: xvmaddadp 7, 42, 26 -; CHECK-NEXT: xvmaddadp 5, 44, 26 +; CHECK-NEXT: xvmaddadp 8, 25, 27 +; CHECK-NEXT: xvmaddadp 6, 23, 27 +; CHECK-NEXT: xvmaddadp 5, 21, 27 +; CHECK-NEXT: xvmaddadp 4, 19, 27 +; CHECK-NEXT: xvmaddadp 32, 24, 26 +; CHECK-NEXT: xvmaddadp 36, 22, 26 +; CHECK-NEXT: xvmaddadp 34, 20, 26 +; CHECK-NEXT: xvmaddadp 11, 18, 26 +; CHECK-NEXT: xvmaddadp 3, 1, 27 +; CHECK-NEXT: xvmaddadp 2, 45, 27 +; CHECK-NEXT: xvmaddadp 9, 0, 26 +; CHECK-NEXT: xvmaddadp 7, 44, 26 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ ; CHECK-NEXT: # @@ -276,53 +276,53 @@ ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit ; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: stxv 43, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: stxv 42, 0(3) ; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: stxv 39, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: stxv 38, 0(3) ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 13, 0(3) +; CHECK-NEXT: stxv 35, 0(3) ; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 11, 0(3) +; CHECK-NEXT: stxv 13, 0(3) ; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: stxv 12, 0(3) ; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 8, 0(3) +; 
CHECK-NEXT: stxv 10, 0(3) ; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: stxv 8, 0(3) ; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 4, 0(3) +; CHECK-NEXT: stxv 6, 0(3) ; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 3, 0(3) +; CHECK-NEXT: stxv 5, 0(3) ; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 2, 0(3) +; CHECK-NEXT: stxv 4, 0(3) ; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: stxv 3, 0(3) ; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 0, 0(3) +; CHECK-NEXT: stxv 2, 0(3) ; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: stxv 32, 0(3) ; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 34, 0(3) +; CHECK-NEXT: stxv 36, 0(3) ; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 12, 0(3) +; CHECK-NEXT: stxv 34, 0(3) ; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: stxv 11, 0(3) ; CHECK-NEXT: ld 3, 208(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: stxv 9, 0(3) ; CHECK-NEXT: ld 3, 216(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 5, 0(3) +; CHECK-NEXT: stxv 7, 0(3) ; CHECK-NEXT: .LBB0_7: # %_return_bb ; CHECK-NEXT: lfd 31, 472(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 30, 464(1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/ppc64-acc-schedule.ll b/llvm/test/CodeGen/PowerPC/ppc64-acc-schedule.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ppc64-acc-schedule.ll @@ -0,0 +1,328 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +target triple = "powerpc64le-unknown-linux-gnu" +; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux-gnu \ +; RUN: 
-mcpu=pwr10 -ppc-track-subreg-liveness < %s | FileCheck %s --check-prefix=TRACKLIVE + +%0 = type <{ double }> +%1 = type <{ double }> + +define void @accschedule(i32* %arg, [0 x %0]* %arg1, [0 x %1]* %arg2) local_unnamed_addr { +; CHECK-LABEL: accschedule: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: lxv 32, 0(0) +; CHECK-NEXT: xxlxor 38, 38, 38 +; CHECK-NEXT: xxlxor 36, 36, 36 +; CHECK-NEXT: stfd 14, -144(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 15, -136(1) # 8-byte Folded Spill +; CHECK-NEXT: xxlxor 34, 34, 34 +; CHECK-NEXT: li 6, 1 +; CHECK-NEXT: li 4, 16 +; CHECK-NEXT: extswsli 3, 3, 3 +; CHECK-NEXT: xvmaddadp 36, 32, 36 +; CHECK-NEXT: lxvdsx 33, 0, 3 +; CHECK-NEXT: xvmaddadp 38, 33, 38 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %bb9 +; CHECK-NEXT: # +; CHECK-NEXT: addi 6, 6, 2 +; CHECK-NEXT: lxv 0, 16(0) +; CHECK-NEXT: xxlxor 9, 9, 9 +; CHECK-NEXT: xxlor 5, 38, 38 +; CHECK-NEXT: lxv 1, -16(5) +; CHECK-NEXT: mulld 6, 6, 3 +; CHECK-NEXT: xxlxor 40, 40, 40 +; CHECK-NEXT: xvmaddadp 9, 0, 33 +; CHECK-NEXT: xvmuldp 8, 0, 34 +; CHECK-NEXT: lxv 0, -64(5) +; CHECK-NEXT: xvmaddadp 9, 34, 34 +; CHECK-NEXT: xvmaddadp 8, 34, 34 +; CHECK-NEXT: xvmaddadp 40, 1, 40 +; CHECK-NEXT: lxvdsx 39, 6, 4 +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: xvmaddadp 5, 0, 34 +; CHECK-NEXT: xxlxor 4, 4, 4 +; CHECK-NEXT: xvmaddadp 4, 0, 4 +; CHECK-NEXT: xvmuldp 35, 0, 39 +; CHECK-NEXT: xvmuldp 11, 32, 39 +; CHECK-NEXT: xvmuldp 13, 1, 39 +; CHECK-NEXT: xvmuldp 7, 39, 34 +; CHECK-NEXT: xxlor 6, 34, 34 +; CHECK-NEXT: xxlor 10, 34, 34 +; CHECK-NEXT: xxlor 0, 34, 34 +; CHECK-NEXT: xxlor 12, 34, 34 +; CHECK-NEXT: # kill: def $vsrp6 killed $vsrp6 def $uacc3 +; CHECK-NEXT: xxlor 14, 40, 40 +; CHECK-NEXT: xxlor 15, 41, 41 +; CHECK-NEXT: xxlor 1, 35, 35 +; CHECK-NEXT: xxlor 2, 4, 4 +; CHECK-NEXT: xxlor 3, 5, 5 +; CHECK-NEXT: xxlor 4, 6, 6 +; CHECK-NEXT: xxlor 5, 7, 7 +; CHECK-NEXT: xxlor 6, 8, 8 +; CHECK-NEXT: xxlor 7, 9, 9 +; CHECK-NEXT: xxlor 8, 10, 10 +; CHECK-NEXT: xxlor 
9, 11, 11 +; CHECK-NEXT: xxlor 10, 36, 36 +; CHECK-NEXT: xxlor 11, 37, 37 +; CHECK-NEXT: xxmtacc 3 +; CHECK-NEXT: xxmtacc 0 +; CHECK-NEXT: xxmtacc 1 +; CHECK-NEXT: xxmtacc 2 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xvf64gerpp 0, 0, 0 +; CHECK-NEXT: xvf64gerpp 1, 0, 0 +; CHECK-NEXT: xvf64gerpp 2, 0, 0 +; CHECK-NEXT: xvf64gerpp 3, 0, 0 +; CHECK-NEXT: xxmfacc 0 +; CHECK-NEXT: xxmfacc 1 +; CHECK-NEXT: xxmfacc 2 +; CHECK-NEXT: xxmfacc 3 +; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: stxv 9, 32(3) +; CHECK-NEXT: stxv 4, 16(0) +; CHECK-NEXT: stxv 12, 48(0) +; CHECK-NEXT: b .LBB0_1 +; +; TRACKLIVE-LABEL: accschedule: +; TRACKLIVE: # %bb.0: # %bb +; TRACKLIVE-NEXT: lwz 3, 0(3) +; TRACKLIVE-NEXT: lxv 37, 0(0) +; TRACKLIVE-NEXT: xxlxor 33, 33, 33 +; TRACKLIVE-NEXT: xxlxor 36, 36, 36 +; TRACKLIVE-NEXT: stfd 14, -144(1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: stfd 15, -136(1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: xxlxor 34, 34, 34 +; TRACKLIVE-NEXT: li 6, 1 +; TRACKLIVE-NEXT: li 4, 16 +; TRACKLIVE-NEXT: extswsli 3, 3, 3 +; TRACKLIVE-NEXT: xvmaddadp 36, 37, 36 +; TRACKLIVE-NEXT: lxvdsx 32, 0, 3 +; TRACKLIVE-NEXT: xvmaddadp 33, 32, 33 +; TRACKLIVE-NEXT: .p2align 4 +; TRACKLIVE-NEXT: .LBB0_1: # %bb9 +; 
TRACKLIVE-NEXT: # +; TRACKLIVE-NEXT: addi 6, 6, 2 +; TRACKLIVE-NEXT: lxv 0, 16(0) +; TRACKLIVE-NEXT: xxlxor 7, 7, 7 +; TRACKLIVE-NEXT: lxv 1, -16(5) +; TRACKLIVE-NEXT: xxlor 3, 33, 33 +; TRACKLIVE-NEXT: mulld 6, 6, 3 +; TRACKLIVE-NEXT: xxlxor 2, 2, 2 +; TRACKLIVE-NEXT: xxlxor 14, 14, 14 +; TRACKLIVE-NEXT: xxlor 4, 34, 34 +; TRACKLIVE-NEXT: xxlor 8, 34, 34 +; TRACKLIVE-NEXT: xxlor 10, 36, 36 +; TRACKLIVE-NEXT: xxlor 11, 37, 37 +; TRACKLIVE-NEXT: xxlor 12, 34, 34 +; TRACKLIVE-NEXT: xvmaddadp 7, 0, 32 +; TRACKLIVE-NEXT: xvmuldp 6, 0, 34 +; TRACKLIVE-NEXT: lxv 0, -64(5) +; TRACKLIVE-NEXT: xvmaddadp 14, 1, 14 +; TRACKLIVE-NEXT: lxvdsx 38, 6, 4 +; TRACKLIVE-NEXT: li 6, 0 +; TRACKLIVE-NEXT: xvmaddadp 3, 0, 34 +; TRACKLIVE-NEXT: xvmaddadp 2, 0, 2 +; TRACKLIVE-NEXT: xvmaddadp 7, 34, 34 +; TRACKLIVE-NEXT: xvmaddadp 6, 34, 34 +; TRACKLIVE-NEXT: xvmuldp 35, 0, 38 +; TRACKLIVE-NEXT: xvmuldp 9, 37, 38 +; TRACKLIVE-NEXT: xvmuldp 13, 1, 38 +; TRACKLIVE-NEXT: xvmuldp 5, 38, 34 +; TRACKLIVE-NEXT: xxlor 0, 34, 34 +; TRACKLIVE-NEXT: xxlor 1, 35, 35 +; TRACKLIVE-NEXT: xxmtacc 1 +; TRACKLIVE-NEXT: xxmtacc 2 +; TRACKLIVE-NEXT: xxmtacc 3 +; TRACKLIVE-NEXT: xxmtacc 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; 
TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 0, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 1, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 2, 0, 0 +; TRACKLIVE-NEXT: xvf64gerpp 3, 0, 0 +; TRACKLIVE-NEXT: xxmfacc 0 +; TRACKLIVE-NEXT: xxmfacc 1 +; TRACKLIVE-NEXT: xxmfacc 2 +; TRACKLIVE-NEXT: xxmfacc 3 +; TRACKLIVE-NEXT: stxv 1, 0(3) +; TRACKLIVE-NEXT: stxv 9, 32(3) +; TRACKLIVE-NEXT: stxv 4, 16(0) +; TRACKLIVE-NEXT: stxv 12, 48(0) +; TRACKLIVE-NEXT: b .LBB0_1 +bb: + %i = load i32, i32* %arg, align 4 + %i3 = sext i32 %i to i64 + %i4 = shl nsw i64 %i3, 3 + %i5 = bitcast [0 x %0]* %arg1 to i8* + %i6 = getelementptr i8, i8* %i5, i64 undef + %i7 = getelementptr [0 x %1], [0 x %1]* %arg2, i64 0, i64 -8 + %i8 = getelementptr i8, i8* %i6, i64 undef + br label %bb9 + +bb9: ; preds = %bb95, %bb + %i10 = phi i64 [ 1, %bb ], [ 0, %bb95 ] + %i11 = getelementptr %1, %1* null, i64 2 + %i12 = bitcast %1* %i11 to <2 x double>* + %i13 = load <2 x double>, <2 x double>* %i12, align 1 + %i14 = add nuw nsw i64 %i10, 2 + %i15 = getelementptr inbounds %1, %1* %i7, i64 undef + %i16 = bitcast %1* %i15 to <2 x double>* + %i17 = load <2 x double>, <2 x double>* %i16, align 1 + %i18 = load <2 x double>, <2 x double>* null, align 1 + %i19 = getelementptr %1, %1* %i15, i64 6 + %i20 = bitcast %1* %i19 to <2 x double>* + %i21 = load <2 x double>, <2 x double>* %i20, align 1 + %i22 = load i64, i64* undef, align 8 + %i23 = insertelement <2 x i64> poison, i64 %i22, i32 0 + %i24 = bitcast <2 x i64> %i23 to <2 x double> + %i25 = shufflevector <2 x double> %i24, <2 x double> undef, <2 x i32> zeroinitializer + %i26 = mul i64 %i14, %i4 + %i27 = getelementptr i8, i8* null, i64 %i26 + %i28 = getelementptr inbounds i8, i8* %i27, i64 0 + %i29 = getelementptr i8, i8* %i28, i64 16 + %i30 = bitcast i8* %i29 to i64* + %i31 = load i64, i64* %i30, align 8 + %i32 = insertelement <2 x i64> poison, i64 %i31, i32 0 + %i33 = bitcast <2 x i64> %i32 to <2 x double> + %i34 = 
shufflevector <2 x double> %i33, <2 x double> undef, <2 x i32> zeroinitializer + %i35 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> %i25, <2 x double> zeroinitializer) + %i36 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i13, <2 x double> %i25, <2 x double> zeroinitializer) + %i37 = fmul contract <2 x double> %i13, zeroinitializer + %i38 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i17, <2 x double> zeroinitializer, <2 x double> %i35) + %i39 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> %i36) + %i40 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i17, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i41 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> %i37) + %i42 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i18, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i43 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i21, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i44 = fmul contract <2 x double> %i17, %i34 + %i45 = fmul contract <2 x double> zeroinitializer, %i34 + %i46 = fmul contract <2 x double> %i18, %i34 + %i47 = fmul contract <2 x double> %i21, %i34 + %i48 = bitcast <2 x double> %i44 to <16 x i8> + %i49 = bitcast <2 x double> %i40 to <16 x i8> + %i50 = bitcast <2 x double> %i38 to <16 x i8> + %i51 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i48, <16 x i8> %i49, <16 x i8> %i50) + %i52 = bitcast <2 x double> %i45 to <16 x i8> + %i53 = bitcast <2 x double> %i41 to <16 x i8> + %i54 = bitcast <2 x double> %i39 to <16 x i8> + %i55 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i52, <16 x i8> %i53, <16 x i8> %i54) + %i56 = bitcast <2 x double> %i46 to <16 x 
i8> + %i57 = bitcast <2 x double> %i42 to <16 x i8> + %i58 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i56, <16 x i8> %i57, <16 x i8> undef) + %i59 = bitcast <2 x double> %i47 to <16 x i8> + %i60 = bitcast <2 x double> %i43 to <16 x i8> + %i61 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i59, <16 x i8> %i60, <16 x i8> undef) + %i62 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i51, <256 x i1> undef, <16 x i8> undef) + %i63 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i55, <256 x i1> undef, <16 x i8> undef) + %i64 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i58, <256 x i1> undef, <16 x i8> undef) + %i65 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i61, <256 x i1> undef, <16 x i8> undef) + %i66 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i62, <256 x i1> undef, <16 x i8> undef) + %i67 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i63, <256 x i1> undef, <16 x i8> undef) + %i68 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i64, <256 x i1> undef, <16 x i8> undef) + %i69 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i65, <256 x i1> undef, <16 x i8> undef) + %i70 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i66, <256 x i1> undef, <16 x i8> undef) + %i71 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i67, <256 x i1> undef, <16 x i8> undef) + %i72 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i68, <256 x i1> undef, <16 x i8> undef) + %i73 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i69, <256 x i1> undef, <16 x i8> undef) + %i74 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i70, <256 x i1> undef, <16 x i8> undef) + %i75 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i71, <256 x i1> undef, <16 x i8> undef) + %i76 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i72, <256 x i1> 
undef, <16 x i8> undef) + %i77 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i73, <256 x i1> undef, <16 x i8> undef) + %i78 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i74, <256 x i1> undef, <16 x i8> undef) + %i79 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i75, <256 x i1> undef, <16 x i8> undef) + %i80 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i76, <256 x i1> undef, <16 x i8> undef) + %i81 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i77, <256 x i1> undef, <16 x i8> undef) + br label %bb82 + +bb82: ; preds = %bb82, %bb9 + %i83 = phi <512 x i1> [ %i94, %bb82 ], [ %i81, %bb9 ] + %i84 = phi <512 x i1> [ %i93, %bb82 ], [ %i80, %bb9 ] + %i85 = phi <512 x i1> [ %i92, %bb82 ], [ %i79, %bb9 ] + %i86 = phi <512 x i1> [ %i91, %bb82 ], [ %i78, %bb9 ] + %i87 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i86, <256 x i1> undef, <16 x i8> undef) + %i88 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i85, <256 x i1> undef, <16 x i8> undef) + %i89 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i84, <256 x i1> undef, <16 x i8> undef) + %i90 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i83, <256 x i1> undef, <16 x i8> undef) + %i91 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i87, <256 x i1> undef, <16 x i8> undef) + %i92 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i88, <256 x i1> undef, <16 x i8> undef) + %i93 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i89, <256 x i1> undef, <16 x i8> undef) + %i94 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i90, <256 x i1> undef, <16 x i8> undef) + br i1 undef, label %bb95, label %bb82 + +bb95: ; preds = %bb82 + %i96 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i91) + %i97 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i96, 2 + %i98 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 
x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i92) + %i99 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i98, 3 + %i100 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i93) + %i101 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i100, 2 + %i102 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i94) + %i103 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i102, 3 + %i104 = getelementptr inbounds i8, i8* %i8, i64 undef + %i105 = bitcast i8* %i104 to <16 x i8>* + store <16 x i8> %i97, <16 x i8>* %i105, align 1 + %i106 = getelementptr i8, i8* %i104, i64 32 + %i107 = bitcast i8* %i106 to <16 x i8>* + store <16 x i8> %i101, <16 x i8>* %i107, align 1 + %i108 = getelementptr i8, i8* null, i64 16 + %i109 = bitcast i8* %i108 to <16 x i8>* + store <16 x i8> %i99, <16 x i8>* %i109, align 1 + %i110 = getelementptr i8, i8* null, i64 48 + %i111 = bitcast i8* %i110 to <16 x i8>* + store <16 x i8> %i103, <16 x i8>* %i111, align 1 + br label %bb9 +} + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) +