diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1306,7 +1306,10 @@ setOperationAction(ISD::STORE, MVT::v256i1, Custom); } if (Subtarget.hasMMA()) { - addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass); + if (Subtarget.isISAFuture()) + addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass); + else + addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass); setOperationAction(ISD::LOAD, MVT::v512i1, Custom); setOperationAction(ISD::STORE, MVT::v512i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom); @@ -10487,7 +10490,46 @@ return DAG.getRegister(PPC::X13, MVT::i64); return DAG.getRegister(PPC::R2, MVT::i32); - case Intrinsic::ppc_mma_disassemble_acc: + case Intrinsic::ppc_mma_disassemble_acc: { + if (Subtarget.isISAFuture()) { + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + SDValue WideVec = SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, + makeArrayRef(ReturnTypes, 2), + Op.getOperand(1)), + 0); + SmallVector RetOps; + SDValue Value = SDValue(WideVec.getNode(), 0); + SDValue Value2 = SDValue(WideVec.getNode(), 1); + + SDValue Extract; + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value2 : Value, + DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value2 : Value, + DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value : Value2, + DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value : Value2, + DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + return DAG.getMergeValues(RetOps, dl); + } + LLVM_FALLTHROUGH; + } case Intrinsic::ppc_vsx_disassemble_pair: { int NumVecs = 2; SDValue WideVec = Op.getOperand(1); @@ -10941,6 +10983,7 @@ SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); + SDValue Value2 = SN->getValue(); EVT StoreVT = Value.getValueType(); if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1) @@ -10957,13 +11000,30 @@ SmallVector Stores; unsigned NumVecs = 2; if (StoreVT == MVT::v512i1) { - Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); + if (Subtarget.isISAFuture()) { + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + MachineSDNode *ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, + makeArrayRef(ReturnTypes, 2), + Op.getOperand(1)); + + Value = SDValue(ExtNode, 0); + Value2 = SDValue(ExtNode, 1); + } else + Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); NumVecs = 4; } for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx; - SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, - DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + SDValue Elt; + if (Subtarget.isISAFuture()) { + VecNum = Subtarget.isLittleEndian() ? 
1 - (Idx % 2) : (Idx % 2); + Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Idx > 1 ? Value2 : Value, + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + } else + Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + SDValue Store = DAG.getStore(StoreChain, dl, Elt, BasePtr, SN->getPointerInfo().getWithOffset(Idx * 16), diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -130,6 +130,7 @@ SOK_PairedVecSpill, SOK_AccumulatorSpill, SOK_UAccumulatorSpill, + SOK_WAccumulatorSpill, SOK_SPESpill, SOK_PairedG8Spill, SOK_LastOpcodeSpill // This must be last on the enum. @@ -141,7 +142,7 @@ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD, \ + PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr, PPC::EVLDD, \ PPC::RESTORE_QUADWORD \ } @@ -150,7 +151,7 @@ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, \ - NoInstr, PPC::RESTORE_QUADWORD \ + NoInstr, NoInstr, PPC::RESTORE_QUADWORD \ } #define Pwr10LoadOpcodes \ @@ -158,14 +159,22 @@ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \ - PPC::RESTORE_UACC, NoInstr, PPC::RESTORE_QUADWORD \ + PPC::RESTORE_UACC, NoInstr, NoInstr, PPC::RESTORE_QUADWORD \ + } + +#define FutureLoadOpcodes \ + { \ + PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ + PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ + PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \ + PPC::RESTORE_UACC, PPC::RESTORE_WACC, NoInstr, PPC::RESTORE_QUADWORD \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \ - PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD, \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr, PPC::EVSTDD, \ PPC::SPILL_QUADWORD \ } @@ -173,7 +182,7 @@ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr, \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr, NoInstr, \ PPC::SPILL_QUADWORD \ } @@ -182,22 +191,30 @@ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \ - NoInstr, PPC::SPILL_QUADWORD \ + NoInstr, NoInstr, PPC::SPILL_QUADWORD \ + } + +#define FutureStoreOpcodes \ + { \ + PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ + PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ + PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \ + PPC::SPILL_WACC, NoInstr, PPC::SPILL_QUADWORD \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
#define StoreOpcodesForSpill \ - { Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes } + { Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes, FutureStoreOpcodes } #define LoadOpcodesForSpill \ - { Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes } + { Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes, FutureLoadOpcodes } class PPCSubtarget; class PPCInstrInfo : public PPCGenInstrInfo { PPCSubtarget &Subtarget; const PPCRegisterInfo RI; - const unsigned StoreSpillOpcodesArray[3][SOK_LastOpcodeSpill] = + const unsigned StoreSpillOpcodesArray[4][SOK_LastOpcodeSpill] = StoreOpcodesForSpill; - const unsigned LoadSpillOpcodesArray[3][SOK_LastOpcodeSpill] = + const unsigned LoadSpillOpcodesArray[4][SOK_LastOpcodeSpill] = LoadOpcodesForSpill; void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1122,6 +1122,7 @@ case PPC::CRSET: case PPC::CRUNSET: case PPC::XXSETACCZ: + case PPC::XXSETACCZW: return true; } return false; @@ -1907,6 +1908,10 @@ assert(Subtarget.pairedVectorMemops() && "Register unexpected when paired memops are disabled."); OpcodeIndex = SOK_UAccumulatorSpill; + } else if (PPC::WACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_WAccumulatorSpill; } else if (PPC::VSRpRCRegClass.hasSubClassEq(RC)) { assert(Subtarget.pairedVectorMemops() && "Register unexpected when paired memops are disabled."); @@ -3373,7 +3378,9 @@ // With P10, we may need to spill paired vector registers or accumulator // registers. MMA implies paired vectors, so we can just check that. bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops(); - return IsP10Variant ? 2 : Subtarget.hasP9Vector() ? 1 : 0; + return Subtarget.isISAFuture() ? 3 : IsP10Variant ? + 2 : Subtarget.hasP9Vector() ? + 1 : 0; } const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -711,6 +711,7 @@ def IsAIX : Predicate<"Subtarget->isAIXABI()">; def NotAIX : Predicate<"!Subtarget->isAIXABI()">; def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; +def IsNotISAFuture : Predicate<"!Subtarget->isISAFuture()">; //===----------------------------------------------------------------------===// // PowerPC Multiclass Definitions. diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -14,7 +14,7 @@ // is even/odd. multiclass ACC_UM_XOEO opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { - let Predicates = [MMA] in { + let Predicates = [MMA, IsNotISAFuture] in { def NAME : XX3Form_AT3_XAB6, @@ -24,6 +24,16 @@ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { + def NAME#W : + XX3Form_AT3_XAB6, + RegConstraint<"@earlyclobber $AT">; + def WPP : + XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 4 instructions, masked/unmasked with masks 8, 4, 4 bits. 
@@ -31,7 +41,7 @@ multiclass ACC_UM_M844_XOEO opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_UM_XOEO; - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_XY4P8_XAB6< opcode, !or(xo, 0x01), (outs acc:$AT), @@ -48,6 +58,23 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, !or(xo, 0x01), (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, xo, (outs wacc:$AT), + !con((ins wacc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 4 instructions, masked/unmasked with masks 4, 4, 4 bits. @@ -55,7 +82,7 @@ multiclass ACC_UM_M444_XOEO opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_UM_XOEO; - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_XYP4_XAB6< opcode, !or(xo, 0x01), (outs acc:$AT), @@ -72,6 +99,23 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_XYP4_XAB6< + opcode, !or(xo, 0x01), (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_XYP4_XAB6< + opcode, xo, (outs wacc:$AT), + !con((ins wacc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. @@ -79,7 +123,7 @@ multiclass ACC_UM_M244_XOEO opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_UM_XOEO; - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0x01), (outs acc:$AT), @@ -95,13 +139,29 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x01), (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. // Upper nibble of XO field for acc/non-acc version is 0x4/0x6. 
multiclass ACC_UM_M244_XO46 opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { - let Predicates = [MMA] in { + let Predicates = [MMA, IsNotISAFuture] in { def NAME : XX3Form_AT3_XAB6, @@ -112,7 +172,7 @@ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_XY4P2_XAB6< opcode, xo, (outs acc:$AT), @@ -129,6 +189,34 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { + def NAME#W : + XX3Form_AT3_XAB6, + RegConstraint<"@earlyclobber $AT">; + def WPP : + XX3Form_AT3_XAB6< + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x20), (outs acc:$AT), + !con((ins wacc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 10 instructions, operand negating, unmasked, masked with 2, 4, 4 @@ -136,7 +224,7 @@ multiclass ACC_NEG_UM_M244_XOM84C opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_UM_M244_XOEO; - let Predicates = [MMA] in { + let Predicates = [MMA, IsNotISAFuture] in { def PN : XX3Form_AT3_XAB6< opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, @@ -150,7 +238,21 @@ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { + def WPN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), + !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def WNP : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), + !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def WNN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), + !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME#PN : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0x80), (outs acc:$AT), @@ -173,6 +275,29 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#WPN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x80), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x40), (outs wacc:$AT), + !con((ins 
wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0xC0), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 5 instructions, unmasked, operand negating. @@ -180,7 +305,7 @@ multiclass ACC_NEG_UM_XOM84C opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_UM_XOEO; - let Predicates = [MMA] in { + let Predicates = [MMA, IsNotISAFuture] in { def PN : XX3Form_AT3_XAB6, @@ -194,6 +319,20 @@ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { + def WPN : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def WNP : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def WNN : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 10 instructions, operand negating, unmasked, masked with 4, 4 bits. @@ -201,7 +340,7 @@ multiclass ACC_NEG_UM_M44_XOM84C opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_NEG_UM_XOM84C; - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0x01), (outs acc:$AT), @@ -238,6 +377,43 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x01), (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_XY4_XAB6< + opcode, xo, (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WPN : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x80), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNP : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x40), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNN : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0xC0), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // Defines 10 instructions, operand negating, unmasked, masked with 4, 2 bits. 
@@ -245,7 +421,7 @@ multiclass ACC_NEG_UM_M42_XOM84C opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : ACC_NEG_UM_XOM84C; - let Predicates = [MMA, PrefixInstrs] in { + let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0x01), (outs acc:$AT), @@ -282,12 +458,49 @@ IIC_VecFP, []>, RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } + let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PM#NAME#W : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x01), (outs wacc:$AT), + !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#WPP : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, xo, (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WPN : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x80), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNP : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x40), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#WNN : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0xC0), (outs wacc:$AT), + !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } } // End of class definitions. //----------------------------------------------------------------------------- -let Predicates = [MMA] in { +let Predicates = [MMA, IsNotISAFuture] in { def XXMFACC : XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS", IIC_VecGeneral, @@ -329,7 +542,40 @@ } } -let Predicates = [MMA, PrefixInstrs] in { +let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { + // For Future and up XXMFACCW and XXMTACCW will not have patterns. 
+ def XXMFACCW : + XForm_AT3<31, 0, 177, (outs wacc:$ASo), (ins wacc:$AS), "xxmfacc $AS", + IIC_VecGeneral, []>, + RegConstraint<"$ASo = $AS">, NoEncode<"$ASo">; + def XXMTACCW : + XForm_AT3<31, 1, 177, (outs wacc:$AT), (ins wacc:$ATi), "xxmtacc $AT", + IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + + let isAsCheapAsAMove = 1, isReMaterializable = 1 in { + def XXSETACCZW : + XForm_AT3<31, 3, 177, (outs wacc:$AT), (ins), "xxsetaccz $AT", + IIC_VecGeneral, [(set v512i1:$AT, (int_ppc_mma_xxsetaccz))]>; + } + + def XVI8GER4WSPP : + XX3Form_AT3_XAB6<59, 99, (outs wacc:$AT), + (ins wacc:$ATi, vsrc:$XA, vsrc:$XB), + "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + + let mayStore = 1 in { + def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst), + "#SPILL_WACC", []>; + } + let mayLoad = 1, hasSideEffects = 0 in { + def RESTORE_WACC: PPCEmitTimePseudo<(outs wacc:$AT), (ins memrix16:$src), + "#RESTORE_WACC", []>; + } +} + +let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PMXVI8GER4SPP : MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK, @@ -339,6 +585,16 @@ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } +let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { + def PMXVI8GER4WSPP : + MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs wacc:$AT), + (ins wacc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK, + u4imm:$YMSK, u4imm:$PMSK), + "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", + IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; +} + // MMA accumulating/non-accumulating instructions. //------------------------------------------------------------------------------ @@ -380,7 +636,7 @@ //------------------------------------------------------------------------------ // MMA Intrinsics -let Predicates = [MMA] in { +let Predicates = [MMA, IsNotISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)), (XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), @@ -395,7 +651,26 @@ (XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), (XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} + +let Predicates = [MMA, IsISAFuture] in { + def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)), + (XVI4GER8W RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI4GER8WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + + def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)), + (XVI8GER4W RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI8GER4WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)), + (XVI16GER2SW RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI16GER2SWPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} +let Predicates = [MMA, IsNotISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)), (XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), @@ -406,7 +681,22 @@ (XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), (XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} +let 
Predicates = [MMA, IsISAFuture] in { + def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)), + (XVF16GER2W RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF16GER2WPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF16GER2WNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF16GER2WNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} + +let Predicates = [MMA, IsNotISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)), (XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), @@ -446,8 +736,48 @@ (XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; } +let Predicates = [MMA, IsISAFuture] in { + def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)), + (XVF32GERW RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERWPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERWPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERWNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERWNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)), + (XVF64GERW $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERWPP $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; + + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)), + (XVBF16GER2W RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2WPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2WNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2WNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)), + (XVI16GER2W RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI8GER4WSPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} // MMA Intrinsics -let Predicates = [MMA, PrefixInstrs] in { + +let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, Msk4Imm:$YMSK, Msk8Imm:$PMSK)), (PMXVI4GER8 
RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, @@ -583,6 +913,142 @@ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; } +let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { + def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)), + (PMXVI4GER8W RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk8Imm:$PMSK)), + (PMXVI4GER8WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)), + (PMXVI8GER4W RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk4Imm:$PMSK)), + (PMXVI8GER4WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVI16GER2SW RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI16GER2SWPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVF16GER2W RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2WPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2WNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2WNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)), + (PMXVF32GERW RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERWPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERWPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERWNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERWNN $ATi, 
RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)), + (PMXVF64GERW $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERWPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERWPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERWNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERWNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVBF16GER2W RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2WPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2WNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2WNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVI16GER2W RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI8GER4WSPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI16GER2WPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; +} + def ConcatsMMA { dag VecsToVecPair0 = (v256i1 (INSERT_SUBREG @@ -608,7 +1074,7 @@ dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1)); } -let Predicates = [MMA] in { +let Predicates = [MMA, IsNotISAFuture] in { def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)), (XXMTACC ConcatsMMA.VecsToVecQuad)>; def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0, @@ -625,4 +1091,11 @@ Extracts.Vec3>; } - +let Predicates = [MMA, IsISAFuture] in { + def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)), + (DMXXINSTFDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>; + def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0, + 
v16i8:$vs3, v16i8:$vs2)),
+            (DMXXINSTFDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>;
+  def : Pat<(v512i1 immAllZerosV), (XXSETACCZW)>;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -137,6 +137,11 @@
   void lowerACCRestore(MachineBasicBlock::iterator II,
                        unsigned FrameIndex) const;
+  void lowerWACCSpilling(MachineBasicBlock::iterator II,
+                         unsigned FrameIndex) const;
+  void lowerWACCRestore(MachineBasicBlock::iterator II,
+                        unsigned FrameIndex) const;
+
   void lowerQuadwordSpilling(MachineBasicBlock::iterator II,
                              unsigned FrameIndex) const;
   void lowerQuadwordRestore(MachineBasicBlock::iterator II,
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -575,6 +575,13 @@
   // as we are just looking to provide a hint.
   bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
       VirtReg, Order, Hints, MF, VRM, Matrix);
+
+  // Don't use the allocation hints for ISAFuture.
+  // The WACC registers used in ISAFuture are unlike the ACC registers on
+  // Power 10 and so this logic for register allocation hints does not apply.
+  if (MF.getSubtarget<PPCSubtarget>().isISAFuture())
+    return BaseImplRetVal;
+
   // We are interested in instructions that copy values to ACC/UACC.
   // The copy into UACC will be simply a COPY to a subreg so we
   // want to allocate the corresponding physical subreg for the source.
@@ -1234,6 +1241,11 @@
                                unsigned FrameIndex, bool IsLittleEndian,
                                bool IsKilled, bool TwoPairs) {
   unsigned Offset = 0;
+  // The register arithmetic in this function does not support virtual
+  // registers.
+  assert(!SrcReg.isVirtual() &&
+         "Spilling register pairs does not support virtual registers.");
+
   if (TwoPairs)
     Offset = IsLittleEndian ? 48 : 0;
   else
@@ -1281,6 +1293,18 @@
   MBB.erase(II);
 }
 
+static void emitWAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsRestore) {
+#ifdef NDEBUG
+  return;
+#else
+  if (ReportAccMoves) {
+    dbgs() << "Emitting wacc register " << (IsRestore ? "restore" : "spill")
+           << ":\n";
+    MBB.dump();
+  }
+#endif
+}
+
 /// lowerACCSpilling - Generate the code for spilling the accumulator register.
 /// Similarly to other spills/reloads that use pseudo-ops, we do not actually
 /// eliminate the FrameIndex here nor compute the stack offset. We simply
@@ -1362,6 +1386,73 @@
   MBB.erase(II);
 }
 
+/// lowerWACCSpilling - Generate the code for spilling the wide accumulator
+/// register.
+void PPCRegisterInfo::lowerWACCSpilling(MachineBasicBlock::iterator II,
+                                        unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // SPILL_WACC <SrcReg>, <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitWAccSpillRestoreInfo(MBB, false);
+
+  const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+  Register SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXEXTFDMR512), VSRpReg0)
+      .addDef(VSRpReg1)
+      .addReg(SrcReg);
+
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg0, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg1, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+/// lowerWACCRestore - Generate the code to restore the wide accumulator
+/// register.
+void PPCRegisterInfo::lowerWACCRestore(MachineBasicBlock::iterator II,
+                                       unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // <DestReg> = RESTORE_WACC <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitWAccSpillRestoreInfo(MBB, true);
+
+  const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+  Register DestReg = MI.getOperand(0).getReg();
+
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg0),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg1),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+
+  // Kill VSRpReg0, VSRpReg1 (killedRegState::Killed)
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTFDMR512), DestReg)
+      .addReg(VSRpReg0, RegState::Kill)
+      .addReg(VSRpReg1, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
 /// lowerQuadwordSpilling - Generate code to spill paired general register.
 void PPCRegisterInfo::lowerQuadwordSpilling(MachineBasicBlock::iterator II,
                                             unsigned FrameIndex) const {
@@ -1559,6 +1650,12 @@
   } else if (OpC == PPC::STXVP && DisableAutoPairedVecSt) {
     lowerOctWordSpilling(II, FrameIndex);
     return;
+  } else if (OpC == PPC::SPILL_WACC) {
+    lowerWACCSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_WACC) {
+    lowerWACCRestore(II, FrameIndex);
+    return;
   } else if (OpC == PPC::SPILL_QUADWORD) {
     lowerQuadwordSpilling(II, FrameIndex);
     return;
diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; This test is a copy of mma-acc-spill.ll except that it uses mcpu=future.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -disable-auto-paired-vec-st=false \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -disable-auto-paired-vec-st=false \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE + +declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare void @foo() +define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, ptr %ptr) { +; CHECK-LABEL: intrinsics1: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -176(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 176 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: .cfi_offset v28, -80 +; CHECK-NEXT: .cfi_offset v29, -64 +; CHECK-NEXT: .cfi_offset v30, -48 +; CHECK-NEXT: .cfi_offset v31, -32 +; CHECK-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-NEXT: vmr v31, v5 +; CHECK-NEXT: vmr v30, v4 +; CHECK-NEXT: vmr v29, v3 +; CHECK-NEXT: vmr v28, v2 +; CHECK-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-NEXT: ld r30, 272(r1) +; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: stxvp vsp36, 64(r1) +; CHECK-NEXT: stxvp vsp34, 32(r1) +; CHECK-NEXT: bl foo@notoc +; CHECK-NEXT: lxvp vsp34, 64(r1) +; CHECK-NEXT: lxvp vsp36, 32(r1) +; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v4, 48(r30) +; CHECK-NEXT: stxv v5, 32(r30) +; CHECK-NEXT: stxv v2, 16(r30) +; CHECK-NEXT: stxv v3, 0(r30) +; CHECK-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload +; CHECK-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 176 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: intrinsics1: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: mflr r0 +; CHECK-BE-NEXT: std r0, 16(r1) +; CHECK-BE-NEXT: stdu r1, -256(r1) +; CHECK-BE-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-NEXT: .cfi_offset lr, 16 +; CHECK-BE-NEXT: .cfi_offset r30, -16 +; CHECK-BE-NEXT: .cfi_offset v28, -80 +; CHECK-BE-NEXT: .cfi_offset v29, -64 +; CHECK-BE-NEXT: .cfi_offset v30, -48 +; CHECK-BE-NEXT: .cfi_offset v31, -32 +; CHECK-BE-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-NEXT: vmr v31, v5 +; CHECK-BE-NEXT: vmr v30, v4 +; CHECK-BE-NEXT: vmr v29, v3 +; CHECK-BE-NEXT: vmr v28, v2 +; CHECK-BE-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-NEXT: ld r30, 368(r1) +; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; 
CHECK-BE-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-NEXT: bl foo +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: stxv v5, 48(r30) +; CHECK-BE-NEXT: stxv v4, 32(r30) +; CHECK-BE-NEXT: stxv v3, 16(r30) +; CHECK-BE-NEXT: stxv v2, 0(r30) +; CHECK-BE-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-NEXT: addi r1, r1, 256 +; CHECK-BE-NEXT: ld r0, 16(r1) +; CHECK-BE-NEXT: mtlr r0 +; CHECK-BE-NEXT: blr + %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) + %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) + tail call void @foo() + %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) + store <512 x i1> %3, ptr %ptr, align 64 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll @@ -0,0 +1,386 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; This test is a copy of mma-intrinsics.ll except that it uses mcpu=future. +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -O0 < %s | FileCheck %s --check-prefix=CHECK-O0 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -O0 < %s | FileCheck %s --check-prefix=CHECK-O0-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-aix- \ +; RUN: -mcpu=future -vec-extabi \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-AIX64 +; RUN: llc -verify-machineinstrs -mtriple=powerpc-aix- \ +; RUN: -mcpu=future -vec-extabi \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-AIX32 + +; TODO: This test is missing some of the tests from mma-intrinsics.ll because +; those tests do not work for mcpu=future. Once the fixes are in they +; should be added back to this file. 
+ +; assemble_acc +declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +define void @ass_acc(ptr %ptr, <16 x i8> %vc) { +; CHECK-LABEL: ass_acc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: ass_acc: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vmr v3, v2 +; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: stxv v5, 48(r3) +; CHECK-BE-NEXT: stxv v4, 32(r3) +; CHECK-BE-NEXT: stxv v3, 16(r3) +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: blr +; +; CHECK-O0-LABEL: ass_acc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: vmr v4, v2 +; CHECK-O0-NEXT: # implicit-def: $vsrp17 +; CHECK-O0-NEXT: vmr v3, v4 +; CHECK-O0-NEXT: vmr v2, v4 +; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: xxlor vs0, v4, v4 +; CHECK-O0-NEXT: stxv vs0, 48(r3) +; CHECK-O0-NEXT: xxlor vs0, v5, v5 +; CHECK-O0-NEXT: stxv vs0, 32(r3) +; CHECK-O0-NEXT: xxlor vs0, v2, v2 +; CHECK-O0-NEXT: stxv vs0, 16(r3) +; CHECK-O0-NEXT: xxlor vs0, v3, v3 +; CHECK-O0-NEXT: stxv vs0, 0(r3) +; CHECK-O0-NEXT: blr +; +; CHECK-O0-BE-LABEL: ass_acc: +; CHECK-O0-BE: # %bb.0: # %entry +; CHECK-O0-BE-NEXT: vmr v4, v2 +; CHECK-O0-BE-NEXT: # implicit-def: $vsrp17 +; CHECK-O0-BE-NEXT: vmr v3, v4 +; CHECK-O0-BE-NEXT: vmr v2, v4 +; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-O0-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 +; CHECK-O0-BE-NEXT: stxv vs0, 48(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v4, v4 +; CHECK-O0-BE-NEXT: stxv vs0, 32(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v3, v3 +; CHECK-O0-BE-NEXT: stxv vs0, 16(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v2, v2 +; CHECK-O0-BE-NEXT: stxv vs0, 0(r3) +; CHECK-O0-BE-NEXT: blr +; +; CHECK-AIX64-LABEL: ass_acc: +; CHECK-AIX64: # %bb.0: # %entry +; CHECK-AIX64-NEXT: vmr 3, 2 +; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 34, 34, 0 +; CHECK-AIX64-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX64-NEXT: stxv 5, 48(3) +; CHECK-AIX64-NEXT: stxv 4, 32(3) +; CHECK-AIX64-NEXT: stxv 3, 16(3) +; CHECK-AIX64-NEXT: stxv 2, 0(3) +; CHECK-AIX64-NEXT: blr +; +; CHECK-AIX32-LABEL: ass_acc: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: vmr 3, 2 +; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 34, 34, 0 +; CHECK-AIX32-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX32-NEXT: stxv 5, 48(3) +; CHECK-AIX32-NEXT: stxv 4, 32(3) +; CHECK-AIX32-NEXT: stxv 3, 16(3) +; CHECK-AIX32-NEXT: stxv 2, 0(3) +; CHECK-AIX32-NEXT: blr +entry: + %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) + store <512 x i1> %0, ptr %ptr, align 64 + ret void +} + +; xxsetaccz +declare <512 x i1> @llvm.ppc.mma.xxsetaccz() +define void @int_xxsetaccz(ptr %ptr) { +; CHECK-LABEL: int_xxsetaccz: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz wacc0 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: int_xxsetaccz: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz wacc0 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, 
vsp36, 0 +; CHECK-BE-NEXT: stxv v5, 48(r3) +; CHECK-BE-NEXT: stxv v4, 32(r3) +; CHECK-BE-NEXT: stxv v3, 16(r3) +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: blr +; +; CHECK-O0-LABEL: int_xxsetaccz: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xxsetaccz wacc0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: xxlor vs0, v4, v4 +; CHECK-O0-NEXT: stxv vs0, 48(r3) +; CHECK-O0-NEXT: xxlor vs0, v5, v5 +; CHECK-O0-NEXT: stxv vs0, 32(r3) +; CHECK-O0-NEXT: xxlor vs0, v2, v2 +; CHECK-O0-NEXT: stxv vs0, 16(r3) +; CHECK-O0-NEXT: xxlor vs0, v3, v3 +; CHECK-O0-NEXT: stxv vs0, 0(r3) +; CHECK-O0-NEXT: blr +; +; CHECK-O0-BE-LABEL: int_xxsetaccz: +; CHECK-O0-BE: # %bb.0: # %entry +; CHECK-O0-BE-NEXT: xxsetaccz wacc0 +; CHECK-O0-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 +; CHECK-O0-BE-NEXT: stxv vs0, 48(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v4, v4 +; CHECK-O0-BE-NEXT: stxv vs0, 32(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v3, v3 +; CHECK-O0-BE-NEXT: stxv vs0, 16(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v2, v2 +; CHECK-O0-BE-NEXT: stxv vs0, 0(r3) +; CHECK-O0-BE-NEXT: blr +; +; CHECK-AIX64-LABEL: int_xxsetaccz: +; CHECK-AIX64: # %bb.0: # %entry +; CHECK-AIX64-NEXT: xxsetaccz 0 +; CHECK-AIX64-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX64-NEXT: stxv 5, 48(3) +; CHECK-AIX64-NEXT: stxv 4, 32(3) +; CHECK-AIX64-NEXT: stxv 3, 16(3) +; CHECK-AIX64-NEXT: stxv 2, 0(3) +; CHECK-AIX64-NEXT: blr +; +; CHECK-AIX32-LABEL: int_xxsetaccz: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: xxsetaccz 0 +; CHECK-AIX32-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX32-NEXT: stxv 5, 48(3) +; CHECK-AIX32-NEXT: stxv 4, 32(3) +; CHECK-AIX32-NEXT: stxv 3, 16(3) +; CHECK-AIX32-NEXT: stxv 2, 0(3) +; CHECK-AIX32-NEXT: blr +entry: + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + store <512 x i1> %0, ptr %ptr, align 64 + ret void +} + +; disassemble_acc +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) +define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { +; CHECK-LABEL: disass_acc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz wacc0 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v4, 0(r4) +; CHECK-NEXT: stxv v3, 0(r5) +; CHECK-NEXT: stxv v2, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: disass_acc: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz wacc0 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: stxv v3, 0(r4) +; CHECK-BE-NEXT: stxv v4, 0(r5) +; CHECK-BE-NEXT: stxv v5, 0(r6) +; CHECK-BE-NEXT: blr +; +; CHECK-O0-LABEL: disass_acc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xxsetaccz wacc0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-O0-NEXT: vmr v2, v0 +; CHECK-O0-NEXT: xxlor vs0, v1, v1 +; CHECK-O0-NEXT: xxlor vs1, v4, v4 +; CHECK-O0-NEXT: xxlor vs2, v5, v5 +; CHECK-O0-NEXT: stxv vs2, 0(r3) +; CHECK-O0-NEXT: stxv vs1, 0(r4) +; CHECK-O0-NEXT: stxv vs0, 0(r5) +; CHECK-O0-NEXT: stxv v2, 0(r6) +; CHECK-O0-NEXT: blr +; +; CHECK-O0-BE-LABEL: disass_acc: +; CHECK-O0-BE: # %bb.0: # %entry +; CHECK-O0-BE-NEXT: xxsetaccz wacc0 +; CHECK-O0-BE-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp32, 0 +; CHECK-O0-BE-NEXT: vmr v2, v1 +; CHECK-O0-BE-NEXT: xxlor vs0, v0, v0 +; CHECK-O0-BE-NEXT: xxlor vs1, v5, v5 +; CHECK-O0-BE-NEXT: xxlor vs2, v4, v4 +; CHECK-O0-BE-NEXT: stxv vs2, 0(r3) +; CHECK-O0-BE-NEXT: stxv vs1, 0(r4) +; CHECK-O0-BE-NEXT: stxv vs0, 0(r5) +; 
CHECK-O0-BE-NEXT: stxv v2, 0(r6) +; CHECK-O0-BE-NEXT: blr +; +; CHECK-AIX64-LABEL: disass_acc: +; CHECK-AIX64: # %bb.0: # %entry +; CHECK-AIX64-NEXT: xxsetaccz 0 +; CHECK-AIX64-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX64-NEXT: stxv 2, 0(3) +; CHECK-AIX64-NEXT: stxv 3, 0(4) +; CHECK-AIX64-NEXT: stxv 4, 0(5) +; CHECK-AIX64-NEXT: stxv 5, 0(6) +; CHECK-AIX64-NEXT: blr +; +; CHECK-AIX32-LABEL: disass_acc: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: xxsetaccz 0 +; CHECK-AIX32-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX32-NEXT: stxv 2, 0(3) +; CHECK-AIX32-NEXT: stxv 3, 0(4) +; CHECK-AIX32-NEXT: stxv 4, 0(5) +; CHECK-AIX32-NEXT: stxv 5, 0(6) +; CHECK-AIX32-NEXT: blr +entry: + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1 + %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2 + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3 + store <16 x i8> %2, ptr %ptr1, align 16 + store <16 x i8> %3, ptr %ptr2, align 16 + store <16 x i8> %4, ptr %ptr3, align 16 + store <16 x i8> %5, ptr %ptr4, align 16 + ret void +} + +declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>) + +define void @testcse(ptr %res, <16 x i8> %vc) { +; CHECK-LABEL: testcse: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxsetaccz wacc0 +; CHECK-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v4, 112(r3) +; CHECK-NEXT: stxv v5, 96(r3) +; CHECK-NEXT: stxv v2, 80(r3) +; CHECK-NEXT: stxv v3, 64(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testcse: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxsetaccz wacc0 +; CHECK-BE-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: stxv v5, 48(r3) +; CHECK-BE-NEXT: stxv v4, 32(r3) +; CHECK-BE-NEXT: stxv v3, 16(r3) +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: stxv v5, 112(r3) +; CHECK-BE-NEXT: stxv v4, 96(r3) +; CHECK-BE-NEXT: stxv v3, 80(r3) +; CHECK-BE-NEXT: stxv v2, 64(r3) +; CHECK-BE-NEXT: blr +; +; CHECK-O0-LABEL: testcse: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xxsetaccz wacc0 +; CHECK-O0-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: xxlor vs3, v4, v4 +; CHECK-O0-NEXT: stxv vs3, 48(r3) +; CHECK-O0-NEXT: xxlor vs2, v5, v5 +; CHECK-O0-NEXT: stxv vs2, 32(r3) +; CHECK-O0-NEXT: xxlor vs1, v2, v2 +; CHECK-O0-NEXT: stxv vs1, 16(r3) +; CHECK-O0-NEXT: xxlor vs0, v3, v3 +; CHECK-O0-NEXT: stxv vs0, 0(r3) +; CHECK-O0-NEXT: stxv vs3, 112(r3) +; CHECK-O0-NEXT: stxv vs2, 96(r3) +; CHECK-O0-NEXT: stxv vs1, 80(r3) +; CHECK-O0-NEXT: stxv vs0, 64(r3) +; CHECK-O0-NEXT: blr +; +; CHECK-O0-BE-LABEL: testcse: +; CHECK-O0-BE: # %bb.0: # %entry +; CHECK-O0-BE-NEXT: xxsetaccz wacc0 +; CHECK-O0-BE-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-O0-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-BE-NEXT: xxlor vs3, v5, v5 +; CHECK-O0-BE-NEXT: stxv vs3, 48(r3) +; CHECK-O0-BE-NEXT: xxlor vs2, v4, v4 +; CHECK-O0-BE-NEXT: stxv vs2, 
32(r3) +; CHECK-O0-BE-NEXT: xxlor vs1, v3, v3 +; CHECK-O0-BE-NEXT: stxv vs1, 16(r3) +; CHECK-O0-BE-NEXT: xxlor vs0, v2, v2 +; CHECK-O0-BE-NEXT: stxv vs0, 0(r3) +; CHECK-O0-BE-NEXT: stxv vs3, 112(r3) +; CHECK-O0-BE-NEXT: stxv vs2, 96(r3) +; CHECK-O0-BE-NEXT: stxv vs1, 80(r3) +; CHECK-O0-BE-NEXT: stxv vs0, 64(r3) +; CHECK-O0-BE-NEXT: blr +; +; CHECK-AIX64-LABEL: testcse: +; CHECK-AIX64: # %bb.0: # %entry +; CHECK-AIX64-NEXT: xxsetaccz 0 +; CHECK-AIX64-NEXT: xvf32gerpp 0, 2, 2 +; CHECK-AIX64-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX64-NEXT: stxv 5, 48(3) +; CHECK-AIX64-NEXT: stxv 4, 32(3) +; CHECK-AIX64-NEXT: stxv 3, 16(3) +; CHECK-AIX64-NEXT: stxv 2, 0(3) +; CHECK-AIX64-NEXT: stxv 5, 112(3) +; CHECK-AIX64-NEXT: stxv 4, 96(3) +; CHECK-AIX64-NEXT: stxv 3, 80(3) +; CHECK-AIX64-NEXT: stxv 2, 64(3) +; CHECK-AIX64-NEXT: blr +; +; CHECK-AIX32-LABEL: testcse: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: xxsetaccz 0 +; CHECK-AIX32-NEXT: xvf32gerpp 0, 2, 2 +; CHECK-AIX32-NEXT: dmxxextfdmr512 0, 34, 36, 0 +; CHECK-AIX32-NEXT: stxv 5, 48(3) +; CHECK-AIX32-NEXT: stxv 4, 32(3) +; CHECK-AIX32-NEXT: stxv 3, 16(3) +; CHECK-AIX32-NEXT: stxv 2, 0(3) +; CHECK-AIX32-NEXT: stxv 5, 112(3) +; CHECK-AIX32-NEXT: stxv 4, 96(3) +; CHECK-AIX32-NEXT: stxv 3, 80(3) +; CHECK-AIX32-NEXT: stxv 2, 64(3) +; CHECK-AIX32-NEXT: blr +entry: + %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc) + %4 = getelementptr inbounds <512 x i1>, ptr %res, i64 0 + %5 = getelementptr inbounds <512 x i1>, ptr %res, i64 1 + store <512 x i1> %2, ptr %4, align 64 + store <512 x i1> %3, ptr %5, align 64 + ret void +}
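
For reference, the <512 x i1> intrinsics exercised by these tests are what clang emits for the MMA accumulator builtins, so a minimal user-level sketch of code that should take the new wacc path looks roughly like the C below. This is illustrative only; the driver flags and the expected wacc/dmxxextfdmr512 lowering are assumptions based on the tests in this patch, not a documented interface.

/* Hypothetical sketch; assumes a clang that accepts -mcpu=future and the
 * standard PowerPC MMA builtins, e.g.:
 *   clang -O2 -mcpu=future -c sketch.c
 * With -mcpu=future the accumulator below is expected to live in a wacc
 * register and be moved to/from VSR pairs with dmxxinstfdmr512 and
 * dmxxextfdmr512, instead of being primed/unprimed with xxmtacc/xxmfacc
 * as on Power10. */
#include <altivec.h>

void sketch(__vector_quad *res, vector unsigned char vc) {
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);          /* maps to llvm.ppc.mma.xxsetaccz  */
  __builtin_mma_xvf32gerpp(&acc, vc, vc); /* maps to llvm.ppc.mma.xvf32gerpp */
  *res = acc;                             /* 64-byte store of <512 x i1>     */
}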