diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -123,52 +123,72 @@ SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, SOK_SpillToVSR, + SOK_PairedVecSpill, + SOK_AccumulatorSpill, + SOK_UAccumulatorSpill, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. }; // Define list of load and store spill opcodes. +#define NoInstr PPC::INSTRUCTION_LIST_END #define Pwr8LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr \ + } + +#define Pwr10LoadOpcodes \ + { \ + PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ + PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ + PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \ + PPC::RESTORE_UACC, NoInstr \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \ - PPC::SPILLTOVSR_ST, PPC::EVSTDD \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILLTOVSR_ST \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr \ + } + +#define Pwr10StoreOpcodes \ + { \ + PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ + PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ + PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \ + NoInstr \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
 #define StoreOpcodesForSpill                                                  \
-  { Pwr8StoreOpcodes, Pwr9StoreOpcodes }
+  { Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes }
 #define LoadOpcodesForSpill                                                   \
-  { Pwr8LoadOpcodes, Pwr9LoadOpcodes }
+  { Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes }
 
 class PPCSubtarget;
 class PPCInstrInfo : public PPCGenInstrInfo {
   PPCSubtarget &Subtarget;
   const PPCRegisterInfo RI;
-  const unsigned StoreSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+  const unsigned StoreSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
       StoreOpcodesForSpill;
-  const unsigned LoadSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+  const unsigned LoadSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
       LoadOpcodesForSpill;
 
   void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
@@ -226,6 +246,7 @@
   unsigned getSpillTarget() const;
   const unsigned *getStoreOpcodesForSpillArray() const;
   const unsigned *getLoadOpcodesForSpillArray() const;
+  unsigned getSpillIndex(const TargetRegisterClass *RC) const;
   int16_t getFMAOpIdxInfo(unsigned Opcode) const;
   void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1361,7 +1361,33 @@
     Opc = PPC::CROR;
   else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
     Opc = PPC::EVOR;
-  else
+  else if ((PPC::ACCRCRegClass.contains(DestReg) ||
+            PPC::UACCRCRegClass.contains(DestReg)) &&
+           (PPC::ACCRCRegClass.contains(SrcReg) ||
+            PPC::UACCRCRegClass.contains(SrcReg))) {
+    // If primed, de-prime the source register, copy the individual registers
+    // and prime the destination if needed. The vector subregisters are
+    // vs[(u)acc * 4] - vs[(u)acc * 4 + 3]. If the copy is not a kill and the
+    // source is primed, we need to re-prime it after the copy as well.
+    PPCRegisterInfo::emitAccCopyInfo(MBB, DestReg, SrcReg);
+    bool DestPrimed = PPC::ACCRCRegClass.contains(DestReg);
+    bool SrcPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+    MCRegister VSLSrcReg =
+        PPC::VSL0 + (SrcReg - (SrcPrimed ? PPC::ACC0 : PPC::UACC0)) * 4;
+    MCRegister VSLDestReg =
+        PPC::VSL0 + (DestReg - (DestPrimed ?
PPC::ACC0 : PPC::UACC0)) * 4; + if (SrcPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMFACC), SrcReg).addReg(SrcReg); + for (unsigned Idx = 0; Idx < 4; Idx++) + BuildMI(MBB, I, DL, get(PPC::XXLOR), VSLDestReg + Idx) + .addReg(VSLSrcReg + Idx) + .addReg(VSLSrcReg + Idx, getKillRegState(KillSrc)); + if (DestPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), DestReg).addReg(DestReg); + if (SrcPrimed && !KillSrc) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg); + return; + } else llvm_unreachable("Impossible reg-to-reg copy"); const MCInstrDesc &MCID = get(Opc); @@ -1372,7 +1398,7 @@ BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); } -static unsigned getSpillIndex(const TargetRegisterClass *RC) { +unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const { int OpcodeIndex = 0; if (PPC::GPRCRegClass.hasSubClassEq(RC) || @@ -1401,6 +1427,18 @@ OpcodeIndex = SOK_VectorFloat4Spill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; + } else if (PPC::ACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_AccumulatorSpill; + } else if (PPC::UACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_UAccumulatorSpill; + } else if (PPC::VSRpRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_PairedVecSpill; } else { llvm_unreachable("Unknown regclass!"); } @@ -2799,7 +2837,10 @@ } unsigned PPCInstrInfo::getSpillTarget() const { - return Subtarget.hasP9Vector() ? 1 : 0; + // With P10, we may need to spill paired vector registers or accumulator + // registers. MMA implies paired vectors, so we can just check that. + bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops(); + return IsP10Variant ? 2 : Subtarget.hasP9Vector() ? 
1 : 0;
 }
 
 const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1294,6 +1294,18 @@
   XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
                    "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
                    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  let mayStore = 1 in {
+    def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
+                                     "#SPILL_ACC", []>;
+    def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst),
+                                      "#SPILL_UACC", []>;
+  }
+  let mayLoad = 1, hasSideEffects = 0 in {
+    def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src),
+                                       "#RESTORE_ACC", []>;
+    def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src),
+                                        "#RESTORE_UACC", []>;
+  }
 }
 
 let Predicates = [MMA, PrefixInstrs] in {
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -120,6 +120,14 @@
   void lowerCRBitRestore(MachineBasicBlock::iterator II,
                          unsigned FrameIndex) const;
 
+  void lowerACCSpilling(MachineBasicBlock::iterator II,
+                        unsigned FrameIndex) const;
+  void lowerACCRestore(MachineBasicBlock::iterator II,
+                       unsigned FrameIndex) const;
+
+  static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg,
+                              MCRegister SrcReg);
+
   bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
                             int &FrameIdx) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -75,6 +75,21 @@
                          "spill on ppc"),
                 cl::Hidden, cl::init(100));
 
+// Copies/moves of physical accumulators are expensive operations
+// that should be avoided whenever possible. MMA instructions are
+// meant to be used in performance-sensitive computational kernels.
+// This option is provided, at least for the time being, to give the
+// user a tool to detect this expensive operation and either rework
+// their code or report a compiler bug if that turns out to be the
+// cause.
+#ifndef NDEBUG
+static cl::opt<bool>
+ReportAccMoves("ppc-report-acc-moves",
+               cl::desc("Emit information about accumulator register spills "
+                        "and copies"),
+               cl::Hidden, cl::init(false));
+#endif
+
 static unsigned offsetMinAlignForOpcode(unsigned OpC);
 
 PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
@@ -936,6 +951,109 @@
   MBB.erase(II);
 }
 
+void PPCRegisterInfo::emitAccCopyInfo(MachineBasicBlock &MBB,
+                                      MCRegister DestReg, MCRegister SrcReg) {
+#ifdef NDEBUG
+  return;
+#else
+  if (ReportAccMoves) {
+    std::string Dest = PPC::ACCRCRegClass.contains(DestReg) ? "acc" : "uacc";
+    std::string Src = PPC::ACCRCRegClass.contains(SrcReg) ? "acc" : "uacc";
+    dbgs() << "Emitting copy from " << Src << " to " << Dest << ":\n";
+    MBB.dump();
+  }
+#endif
+}
+
+static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
+                                    bool IsRestore) {
+#ifdef NDEBUG
+  return;
+#else
+  if (ReportAccMoves) {
+    dbgs() << "Emitting " << (IsPrimed ? "acc" : "uacc") << " register "
+           << (IsRestore ? "restore" : "spill") << ":\n";
+    MBB.dump();
+  }
+#endif
+}
+
+/// lowerACCSpilling - Generate the code for spilling the accumulator register.
+/// Similarly to other spills/reloads that use pseudo-ops, we do not actually
+/// eliminate the FrameIndex here nor compute the stack offset. We simply
+/// create a real instruction with an FI and rely on eliminateFrameIndex to
+/// handle the FI elimination.
+void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
+                                       unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // SPILL_ACC <SrcReg>, <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  Register SrcReg = MI.getOperand(0).getReg();
+  bool IsKilled = MI.getOperand(0).isKill();
+
+  bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+  Register Reg =
+      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitAccSpillRestoreInfo(MBB, IsPrimed, false);
+
+  // De-prime the register being spilled, create two stores for the pair
+  // subregisters accounting for endianness and then re-prime the register if
+  // it isn't killed. This uses the Offset parameter to addFrameReference() to
+  // adjust the offset of the store that is within the 64-byte stack slot.
+  if (IsPrimed)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(Reg, getKillRegState(IsKilled)),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(Reg + 1, getKillRegState(IsKilled)),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+  if (IsPrimed && !IsKilled)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+/// lowerACCRestore - Generate the code to restore the accumulator register.
+void PPCRegisterInfo::lowerACCRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // <DestReg> = RESTORE_ACC <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+         "RESTORE_ACC does not define its destination");
+
+  bool IsPrimed = PPC::ACCRCRegClass.contains(DestReg);
+  Register Reg =
+      PPC::VSRp0 + (DestReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitAccSpillRestoreInfo(MBB, IsPrimed, true);
+
+  // Create two loads for the pair subregisters accounting for endianness and
+  // then prime the accumulator register being restored.
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg + 1),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+  if (IsPrimed)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), DestReg).addReg(DestReg);
+
+  // Discard the pseudo instruction.
+ MBB.erase(II); +} + bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, Register Reg, int &FrameIdx) const { // For the nonvolatile condition registers (CR2, CR3, CR4) return true to @@ -1067,6 +1185,12 @@ } else if (OpC == PPC::RESTORE_CRBIT) { lowerCRBitRestore(II, FrameIndex); return; + } else if (OpC == PPC::SPILL_ACC || OpC == PPC::SPILL_UACC) { + lowerACCSpilling(II, FrameIndex); + return; + } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) { + lowerACCRestore(II, FrameIndex); + return; } // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP). diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE + +declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare void @foo() +define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { +; CHECK-LABEL: intrinsics1: +; CHECK: .localentry intrinsics1, 1 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: .cfi_def_cfa_offset 176 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -176(r1) +; CHECK-NEXT: li r3, 128 +; CHECK-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 +; CHECK-NEXT: # kill: def $v4 killed $v4 killed $vsrp18 def $vsrp18 +; CHECK-NEXT: # kill: def $v3 killed $v3 killed $vsrp17 def $vsrp17 +; CHECK-NEXT: # kill: def $v2 killed $v2 killed $vsrp17 def $vsrp17 +; CHECK-NEXT: xxlor vs0, v2, v2 +; CHECK-NEXT: xxlor vs1, v3, v3 +; CHECK-NEXT: ld r30, 272(r1) +; CHECK-NEXT: stxvp vsp34, r1(r3) # 32-byte Folded Spill +; CHECK-NEXT: li r3, 96 +; CHECK-NEXT: xxlor vs2, v4, v4 +; CHECK-NEXT: xxlor vs3, v5, v5 +; CHECK-NEXT: stxvp vsp36, r1(r3) # 32-byte Folded Spill +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: li r3, 64 +; CHECK-NEXT: xvf16ger2pp acc0, v2, v4 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxvp vsp0, r1(r3) +; CHECK-NEXT: li r3, 32 +; CHECK-NEXT: stxvp vsp2, r1(r3) +; CHECK-NEXT: bl foo@notoc +; CHECK-NEXT: li r3, 64 +; CHECK-NEXT: lxvp vsp0, r1(r3) +; CHECK-NEXT: li r3, 32 +; CHECK-NEXT: lxvp vsp2, r1(r3) +; CHECK-NEXT: li r3, 128 +; CHECK-NEXT: lxvp vsp34, r1(r3) # 32-byte Folded Reload +; CHECK-NEXT: li r3, 96 +; CHECK-NEXT: lxvp vsp36, r1(r3) # 32-byte Folded Reload +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf16ger2pp acc0, v2, v4 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r30) +; CHECK-NEXT: stxv vs1, 32(r30) +; CHECK-NEXT: stxv vs2, 16(r30) +; CHECK-NEXT: stxvx vs3, 0, r30 +; CHECK-NEXT: addi r1, r1, 176 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: intrinsics1: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: mflr r0 +; CHECK-BE-NEXT: std r0, 16(r1) +; CHECK-BE-NEXT: stdu r1, -256(r1) 
+; CHECK-BE-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-NEXT: .cfi_offset lr, 16 +; CHECK-BE-NEXT: .cfi_offset r30, -16 +; CHECK-BE-NEXT: li r3, 208 +; CHECK-BE-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 +; CHECK-BE-NEXT: # kill: def $v4 killed $v4 killed $vsrp18 def $vsrp18 +; CHECK-BE-NEXT: # kill: def $v3 killed $v3 killed $vsrp17 def $vsrp17 +; CHECK-BE-NEXT: # kill: def $v2 killed $v2 killed $vsrp17 def $vsrp17 +; CHECK-BE-NEXT: xxlor vs0, v2, v2 +; CHECK-BE-NEXT: ld r30, 368(r1) +; CHECK-BE-NEXT: stxvp vsp34, r1(r3) # 32-byte Folded Spill +; CHECK-BE-NEXT: xxlor vs1, v3, v3 +; CHECK-BE-NEXT: li r3, 176 +; CHECK-BE-NEXT: xxlor vs2, v4, v4 +; CHECK-BE-NEXT: xxlor vs3, v5, v5 +; CHECK-BE-NEXT: stxvp vsp36, r1(r3) # 32-byte Folded Spill +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: li r3, 112 +; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v4 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxvp vsp0, r1(r3) +; CHECK-BE-NEXT: li r3, 144 +; CHECK-BE-NEXT: stxvp vsp2, r1(r3) +; CHECK-BE-NEXT: bl foo +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: li r3, 112 +; CHECK-BE-NEXT: lxvp vsp0, r1(r3) +; CHECK-BE-NEXT: li r3, 144 +; CHECK-BE-NEXT: lxvp vsp2, r1(r3) +; CHECK-BE-NEXT: li r3, 208 +; CHECK-BE-NEXT: lxvp vsp34, r1(r3) # 32-byte Folded Reload +; CHECK-BE-NEXT: li r3, 176 +; CHECK-BE-NEXT: lxvp vsp36, r1(r3) # 32-byte Folded Reload +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v4 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r30) +; CHECK-BE-NEXT: stxvx vs0, 0, r30 +; CHECK-BE-NEXT: stxv vs3, 48(r30) +; CHECK-BE-NEXT: stxv vs2, 32(r30) +; CHECK-BE-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-NEXT: addi r1, r1, 256 +; CHECK-BE-NEXT: ld r0, 16(r1) +; CHECK-BE-NEXT: mtlr r0 +; CHECK-BE-NEXT: blr + %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) + %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) + tail call void @foo() + %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) + %4 = bitcast i8* %ptr to <512 x i1>* + store <512 x i1> %3, <512 x i1>* %4, align 64 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -38,10 +38,6 @@ ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr -; CHECK-O0-LABEL: ass_acc: -; CHECK-O0: # %bb.0: # %entry -; CHECK-BE-O0-LABEL: ass_acc: -; CHECK-BE-O0: # %bb.0: # %entry entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, <512 x i1>* %ptr, align 64 @@ -248,3 +244,457 @@ ret void } +declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>) +define void @testBranch(<512 x i1>* %ptr, <16 x i8> %vc, i32 %val) { +; CHECK-LABEL: testBranch: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi r7, 0 +; CHECK-NEXT: beq cr0, .LBB7_2 +; CHECK-NEXT: # %bb.1: # %if.then +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: b .LBB7_3 +; CHECK-NEXT: .LBB7_2: # %if.else +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvi4ger8pp acc0, v2, v2 +; CHECK-NEXT: .LBB7_3: # %if.end +; CHECK-NEXT: xxmfacc 
acc0 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testBranch: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmplwi r7, 0 +; CHECK-BE-NEXT: beq cr0, .LBB7_2 +; CHECK-BE-NEXT: # %bb.1: # %if.then +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: b .LBB7_3 +; CHECK-BE-NEXT: .LBB7_2: # %if.else +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v2 +; CHECK-BE-NEXT: .LBB7_3: # %if.end +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: blr +entry: + %tobool = icmp eq i32 %val, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + br label %if.end + +if.else: + %1 = load <512 x i1>, <512 x i1>* %ptr, align 64 + %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc) + br label %if.end + +if.end: + %vq1.0 = phi <512 x i1> [ %0, %if.then ], [ %2, %if.else ] + store <512 x i1> %vq1.0, <512 x i1>* %ptr, align 64 + ret void +} + +; The following test cases check that the xxsetaccz instruction is correctly rematerialized +declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>) + +define void @testcse(<512 x i1>* %res, <16 x i8> %vc) { +; CHECK-LABEL: testcse: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: xvf32gerpp acc0, v2, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs0, 112(r3) +; CHECK-NEXT: stxv vs1, 96(r3) +; CHECK-NEXT: stxv vs2, 80(r3) +; CHECK-NEXT: stxv vs3, 64(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testcse: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: xvf32gerpp acc0, v2, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 80(r3) +; CHECK-BE-NEXT: stxv vs0, 64(r3) +; CHECK-BE-NEXT: stxv vs3, 112(r3) +; CHECK-BE-NEXT: stxv vs2, 96(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc) + %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0 + %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1 + store <512 x i1> %2, <512 x i1>* %4, align 64 + store <512 x i1> %3, <512 x i1>* %5, align 64 + ret void +} + +define void @testcse2(<512 x i1>* %res, <16 x i8> %vc) { +; CHECK-LABEL: testcse2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: xxsetaccz acc1 +; CHECK-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-NEXT: xvf32gerpn acc0, v2, v2 +; CHECK-NEXT: 
xxmfacc acc1 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs4, 48(r3) +; CHECK-NEXT: stxv vs5, 32(r3) +; CHECK-NEXT: stxv vs6, 16(r3) +; CHECK-NEXT: stxv vs7, 0(r3) +; CHECK-NEXT: stxv vs0, 112(r3) +; CHECK-NEXT: stxv vs1, 96(r3) +; CHECK-NEXT: stxv vs2, 80(r3) +; CHECK-NEXT: stxv vs3, 64(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testcse2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: xxsetaccz acc1 +; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2 +; CHECK-BE-NEXT: xxmfacc acc1 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs5, 16(r3) +; CHECK-BE-NEXT: stxv vs4, 0(r3) +; CHECK-BE-NEXT: stxv vs7, 48(r3) +; CHECK-BE-NEXT: stxv vs6, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 80(r3) +; CHECK-BE-NEXT: stxv vs0, 64(r3) +; CHECK-BE-NEXT: stxv vs3, 112(r3) +; CHECK-BE-NEXT: stxv vs2, 96(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc) + %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0 + %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1 + store <512 x i1> %2, <512 x i1>* %4, align 64 + store <512 x i1> %3, <512 x i1>* %5, align 64 + ret void +} + +define void @testcse3(<512 x i1>* %res, <16 x i8> %vc) { +; CHECK-LABEL: testcse3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: xxsetaccz acc1 +; CHECK-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-NEXT: xvf32gerpn acc0, v2, v2 +; CHECK-NEXT: xxmfacc acc1 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs4, 48(r3) +; CHECK-NEXT: stxv vs5, 32(r3) +; CHECK-NEXT: stxv vs6, 16(r3) +; CHECK-NEXT: stxv vs7, 0(r3) +; CHECK-NEXT: stxv vs0, 112(r3) +; CHECK-NEXT: stxv vs1, 96(r3) +; CHECK-NEXT: stxv vs2, 80(r3) +; CHECK-NEXT: stxv vs3, 64(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testcse3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: xxsetaccz acc1 +; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2 +; CHECK-BE-NEXT: xxmfacc acc1 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs5, 16(r3) +; CHECK-BE-NEXT: stxv vs4, 0(r3) +; CHECK-BE-NEXT: stxv vs7, 48(r3) +; CHECK-BE-NEXT: stxv vs6, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 80(r3) +; CHECK-BE-NEXT: stxv vs0, 64(r3) +; CHECK-BE-NEXT: stxv vs3, 112(r3) +; CHECK-BE-NEXT: stxv vs2, 96(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %3 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0 + %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1 + store <512 x i1> %1, <512 x i1>* %3, align 64 + store <512 x i1> %2, <512 x i1>* %4, align 64 + ret void +} + +define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) { +; CHECK-LABEL: testcse4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: bltlr cr0 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB11_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: rldic r7, r6, 4, 28 +; CHECK-NEXT: addi r6, r6, 6 +; 
CHECK-NEXT: xxsetaccz acc2 +; CHECK-NEXT: xxsetaccz acc1 +; CHECK-NEXT: lxvx vs0, r5, r7 +; CHECK-NEXT: add r7, r5, r7 +; CHECK-NEXT: lxv vs1, 16(r7) +; CHECK-NEXT: xvf32gerpp acc2, vs0, vs1 +; CHECK-NEXT: lxv vs0, 32(r7) +; CHECK-NEXT: lxv vs1, 48(r7) +; CHECK-NEXT: xxmfacc acc2 +; CHECK-NEXT: xvf32gerpn acc1, vs0, vs1 +; CHECK-NEXT: lxv vs12, 64(r7) +; CHECK-NEXT: lxv vs13, 80(r7) +; CHECK-NEXT: rldic r7, r4, 6, 26 +; CHECK-NEXT: addi r4, r4, 3 +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: xxmfacc acc1 +; CHECK-NEXT: xvf32gernp acc0, vs12, vs13 +; CHECK-NEXT: stxvx vs11, r3, r7 +; CHECK-NEXT: add r7, r3, r7 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs8, 48(r7) +; CHECK-NEXT: stxv vs9, 32(r7) +; CHECK-NEXT: stxv vs10, 16(r7) +; CHECK-NEXT: stxv vs4, 112(r7) +; CHECK-NEXT: stxv vs5, 96(r7) +; CHECK-NEXT: stxv vs6, 80(r7) +; CHECK-NEXT: stxv vs7, 64(r7) +; CHECK-NEXT: stxv vs0, 176(r7) +; CHECK-NEXT: stxv vs1, 160(r7) +; CHECK-NEXT: stxv vs2, 144(r7) +; CHECK-NEXT: stxv vs3, 128(r7) +; CHECK-NEXT: bdnz .LBB11_2 +; CHECK-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testcse4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmpwi r4, 1 +; CHECK-BE-NEXT: bltlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-NEXT: clrldi r4, r4, 32 +; CHECK-BE-NEXT: li r6, 0 +; CHECK-BE-NEXT: mtctr r4 +; CHECK-BE-NEXT: li r4, 0 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB11_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-NEXT: addi r6, r6, 6 +; CHECK-BE-NEXT: xxsetaccz acc2 +; CHECK-BE-NEXT: xxsetaccz acc1 +; CHECK-BE-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-NEXT: add r7, r5, r7 +; CHECK-BE-NEXT: lxv vs1, 16(r7) +; CHECK-BE-NEXT: xvf32gerpp acc2, vs0, vs1 +; CHECK-BE-NEXT: lxv vs0, 32(r7) +; CHECK-BE-NEXT: lxv vs1, 48(r7) +; CHECK-BE-NEXT: xxmfacc acc2 +; CHECK-BE-NEXT: xvf32gerpn acc1, vs0, vs1 +; CHECK-BE-NEXT: lxv vs12, 64(r7) +; CHECK-BE-NEXT: lxv vs13, 80(r7) +; CHECK-BE-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-NEXT: addi r4, r4, 3 +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: xxmfacc acc1 +; CHECK-BE-NEXT: xvf32gernp acc0, vs12, vs13 +; CHECK-BE-NEXT: stxvx vs8, r3, r7 +; CHECK-BE-NEXT: add r7, r3, r7 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs9, 16(r7) +; CHECK-BE-NEXT: stxv vs11, 48(r7) +; CHECK-BE-NEXT: stxv vs10, 32(r7) +; CHECK-BE-NEXT: stxv vs5, 80(r7) +; CHECK-BE-NEXT: stxv vs4, 64(r7) +; CHECK-BE-NEXT: stxv vs7, 112(r7) +; CHECK-BE-NEXT: stxv vs6, 96(r7) +; CHECK-BE-NEXT: stxv vs1, 144(r7) +; CHECK-BE-NEXT: stxv vs0, 128(r7) +; CHECK-BE-NEXT: stxv vs3, 176(r7) +; CHECK-BE-NEXT: stxv vs2, 160(r7) +; CHECK-BE-NEXT: bdnz .LBB11_2 +; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-NEXT: blr +entry: + %cmp55 = icmp sgt i32 %lim, 0 + br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %lim to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %2 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %3 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %3, 6 + %idxprom = zext i32 %mul to i64 + %arrayidx = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom + %4 = load <16 x i8>, <16 x i8>* %arrayidx, align 16 + %add2 = or i32 
%mul, 1 + %idxprom3 = zext i32 %add2 to i64 + %arrayidx4 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom3 + %5 = load <16 x i8>, <16 x i8>* %arrayidx4, align 16 + %6 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %4, <16 x i8> %5) + %add6 = add nuw nsw i32 %mul, 2 + %idxprom7 = zext i32 %add6 to i64 + %arrayidx8 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom7 + %7 = load <16 x i8>, <16 x i8>* %arrayidx8, align 16 + %add10 = add nuw nsw i32 %mul, 3 + %idxprom11 = zext i32 %add10 to i64 + %arrayidx12 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom11 + %8 = load <16 x i8>, <16 x i8>* %arrayidx12, align 16 + %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %7, <16 x i8> %8) + %add14 = add nuw nsw i32 %mul, 4 + %idxprom15 = zext i32 %add14 to i64 + %arrayidx16 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom15 + %10 = load <16 x i8>, <16 x i8>* %arrayidx16, align 16 + %add18 = add nuw nsw i32 %mul, 5 + %idxprom19 = zext i32 %add18 to i64 + %arrayidx20 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom19 + %11 = load <16 x i8>, <16 x i8>* %arrayidx20, align 16 + %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %2, <16 x i8> %10, <16 x i8> %11) + %mul21 = mul i64 %indvars.iv, 3 + %idx.ext = and i64 %mul21, 4294967295 + %add.ptr = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 %idx.ext + store <512 x i1> %6, <512 x i1>* %add.ptr, align 64 + %add.ptr26 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 1 + store <512 x i1> %9, <512 x i1>* %add.ptr26, align 64 + %add.ptr30 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 2 + store <512 x i1> %12, <512 x i1>* %add.ptr30, align 64 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i32 @testRedundantPrimeUnprimeF() +define void @testRedundantPrimeUnprime(<512 x i1>* %dst, <16 x i8> %vc) nounwind { +; CHECK-LABEL: testRedundantPrimeUnprime: +; CHECK: .localentry testRedundantPrimeUnprime, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -112(r1) +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: xxsetaccz acc1 +; CHECK-NEXT: mr r30, r3 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-NEXT: li r3, 64 +; CHECK-NEXT: xxmfacc acc1 +; CHECK-NEXT: stxvp vsp4, r1(r3) +; CHECK-NEXT: li r3, 32 +; CHECK-NEXT: stxvp vsp6, r1(r3) +; CHECK-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-NEXT: li r3, 64 +; CHECK-NEXT: lxvp vsp0, r1(r3) +; CHECK-NEXT: li r3, 32 +; CHECK-NEXT: lxvp vsp2, r1(r3) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 112(r30) +; CHECK-NEXT: stxv vs1, 96(r30) +; CHECK-NEXT: stxv vs2, 80(r30) +; CHECK-NEXT: stxv vs3, 64(r30) +; CHECK-NEXT: addi r1, r1, 112 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testRedundantPrimeUnprime: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mflr r0 +; CHECK-BE-NEXT: std r0, 16(r1) +; CHECK-BE-NEXT: stdu r1, -192(r1) +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: xxsetaccz acc1 +; 
CHECK-BE-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-NEXT: mr r30, r3 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2 +; CHECK-BE-NEXT: li r3, 112 +; CHECK-BE-NEXT: xxmfacc acc1 +; CHECK-BE-NEXT: stxvp vsp4, r1(r3) +; CHECK-BE-NEXT: li r3, 144 +; CHECK-BE-NEXT: stxvp vsp6, r1(r3) +; CHECK-BE-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: li r3, 112 +; CHECK-BE-NEXT: lxvp vsp0, r1(r3) +; CHECK-BE-NEXT: li r3, 144 +; CHECK-BE-NEXT: lxvp vsp2, r1(r3) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs3, 112(r30) +; CHECK-BE-NEXT: stxv vs2, 96(r30) +; CHECK-BE-NEXT: stxv vs1, 80(r30) +; CHECK-BE-NEXT: stxv vs0, 64(r30) +; CHECK-BE-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-NEXT: addi r1, r1, 192 +; CHECK-BE-NEXT: ld r0, 16(r1) +; CHECK-BE-NEXT: mtlr r0 +; CHECK-BE-NEXT: blr +entry: + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + store <512 x i1> %0, <512 x i1>* %dst, align 64 + %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %call = tail call signext i32 bitcast (i32 ()* @testRedundantPrimeUnprimeF to i32 ()*)() + %add.ptr1 = getelementptr inbounds <512 x i1>, <512 x i1>* %dst, i64 1 + store <512 x i1> %1, <512 x i1>* %add.ptr1, align 64 + ret void +} + +declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) +declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)
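
For illustration only (not part of the patch): assuming acc0 is the accumulator being spilled and OFF is a placeholder for wherever eliminateFrameIndex resolves the 64-byte slot relative to r1, the SPILL_ACC/RESTORE_ACC pseudos lower roughly to the following sequence on a little-endian subtarget (the exact addressing form depends on how the frame index is resolved):

  # SPILL_ACC acc0 (source killed)
  xxmfacc acc0              # de-prime: acc0 contents become visible in vs0-vs3
  stxvp   vsp0, OFF+32(r1)  # vs0:vs1 pair into the high half of the slot
  stxvp   vsp2, OFF(r1)     # vs2:vs3 pair into the low half of the slot
  # (if the primed source is not killed, an xxmtacc acc0 re-primes it here)

  # RESTORE_ACC acc0
  lxvp    vsp0, OFF+32(r1)
  lxvp    vsp2, OFF(r1)
  xxmtacc acc0              # re-prime the restored accumulator

For SPILL_UACC/RESTORE_UACC the xxmfacc/xxmtacc bracketing is omitted, since unprimed accumulators are simply the underlying VSR pairs; on big-endian subtargets the two 32-byte halves swap offsets, as the CHECK-BE lines above show.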