Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -647,6 +647,8 @@
     return AMDGPU::SReg_256RegClassID;
   case 16:
     return AMDGPU::SReg_512RegClassID;
+  case 32:
+    return AMDGPU::SReg_1024RegClassID;
   }
 
   llvm_unreachable("invalid vector size");
@@ -665,12 +667,12 @@
     return;
   }
 
-  assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                 "supported yet");
-  // 16 = Max Num Vector Elements
+  // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
-  SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
   bool IsRegSeq = true;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -355,6 +355,7 @@
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -103,5 +103,27 @@
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
+
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
Index: llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -365,6 +365,9 @@
       continue;
 
     unsigned R = Op.getReg();
+    if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
+      continue;
+
     unsigned ShiftedBank = Bank;
 
     if (Bank != -1 && R == Reg && Op.getSubReg()) {
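The GCNRegBankReassign guard above simply excludes operands that live in the accumulator file from bank-conflict accounting. The following is a minimal standalone sketch of that pattern (plain enums and structs, not the LLVM register-info API; all names are illustrative):

```cpp
// Standalone model: skip accumulator registers when scanning operands for
// bank conflicts, mirroring "if (TRI->hasAGPRs(...)) continue;" above.
#include <cstdio>
#include <vector>

enum RegFile { SGPR, VGPR, AGPR };

struct Operand { RegFile File; unsigned Bank; };

unsigned countConflicts(const std::vector<Operand> &Ops, unsigned Bank) {
  unsigned N = 0;
  for (const Operand &Op : Ops) {
    if (Op.File == AGPR)
      continue;            // AGPRs have no VGPR/SGPR bank to conflict with
    if (Op.Bank == Bank)
      ++N;
  }
  return N;
}

int main() {
  std::vector<Operand> Ops = {{VGPR, 1}, {AGPR, 1}, {SGPR, 3}};
  std::printf("conflicts with bank 1: %u\n", countConflicts(Ops, 1)); // 1
}
```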
Index: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,8 @@
     SGPR_TUPLE,
     VGPR32,
     VGPR_TUPLE,
+    AGPR32,
+    AGPR_TUPLE,
     TOTAL_KINDS
   };
 
@@ -43,9 +45,10 @@
   void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
 
   unsigned getSGPRNum() const { return Value[SGPR32]; }
-  unsigned getVGPRNum() const { return Value[VGPR32]; }
+  unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
 
-  unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+  unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
+                                                         Value[AGPR_TUPLE]); }
   unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
 
   unsigned getOccupancy(const GCNSubtarget &ST) const {
Index: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -89,7 +89,9 @@
   auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
   return STI->isSGPRClass(RC) ?
     (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
-    (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
+    STI->hasAGPRs(RC) ?
+      (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) :
+      (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
 }
 
 void GCNRegPressure::inc(unsigned Reg,
@@ -110,16 +112,18 @@
   switch (auto Kind = getRegKind(Reg, MRI)) {
   case SGPR32:
   case VGPR32:
+  case AGPR32:
     assert(PrevMask.none() && NewMask == MaxMask);
     Value[Kind] += Sign;
     break;
 
   case SGPR_TUPLE:
   case VGPR_TUPLE:
+  case AGPR_TUPLE:
     assert(NewMask < MaxMask || NewMask == MaxMask);
     assert(PrevMask < NewMask);
 
-    Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+    Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
       Sign * (~PrevMask & NewMask).getNumLanes();
 
     if (PrevMask.none()) {
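To make the pressure bookkeeping above concrete: AGPR and VGPR pressure accumulate in separate counters, and the effective VGPR number used by consumers is their maximum, mirroring getVGPRNum() and getVGPRTuplesWeight(). A minimal standalone sketch with simplified types (illustrative names, not the LLVM classes):

```cpp
// Standalone model of the register-pressure counters added above.
#include <algorithm>
#include <cassert>

enum RegKind { SGPR32, SGPR_TUPLE, VGPR32, VGPR_TUPLE, AGPR32, AGPR_TUPLE,
               TOTAL_KINDS };

struct Pressure {
  unsigned Value[TOTAL_KINDS] = {};
  void inc(RegKind K, unsigned Lanes = 1) { Value[K] += Lanes; }
  // Occupancy is limited by the larger of the two vector files.
  unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
};

int main() {
  Pressure P;
  P.inc(VGPR32, 10);
  P.inc(AGPR32, 32);           // e.g. one 32-register MFMA accumulator tuple
  assert(P.getVGPRNum() == 32);
}
```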
Index: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -143,14 +143,15 @@
   return new SIFixSGPRCopies();
 }
 
-static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+static bool hasVectorOperands(const MachineInstr &MI,
+                              const SIRegisterInfo *TRI) {
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     if (!MI.getOperand(i).isReg() ||
         !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
       continue;
 
-    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+    if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
       return true;
   }
   return false;
@@ -183,14 +184,14 @@
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
-         TRI.hasVGPRs(SrcRC);
+         TRI.hasVectorRegisters(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
-         TRI.hasVGPRs(DstRC);
+         TRI.hasVectorRegisters(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -277,6 +278,7 @@
   // VGPRz = REG_SEQUENCE VGPRx, sub0
 
   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+  bool IsAGPR = TRI->hasAGPRs(DstRC);
 
   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
     unsigned SrcReg = MI.getOperand(I).getReg();
@@ -295,6 +297,17 @@
             TmpReg)
         .add(MI.getOperand(I));
 
+    if (IsAGPR) {
+      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
+      unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
+        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
+              TmpAReg)
+        .addReg(TmpReg, RegState::Kill);
+      TmpReg = TmpAReg;
+    }
+
     MI.getOperand(I).setReg(TmpReg);
   }
 
@@ -682,8 +695,8 @@
         break;
       }
       case AMDGPU::REG_SEQUENCE:
-        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
-            !hasVGPROperands(MI, TRI)) {
+        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
+            !hasVectorOperands(MI, TRI)) {
           foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
           continue;
         }
@@ -698,7 +711,8 @@
         Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
         Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
         if (TRI->isSGPRClass(DstRC) &&
-            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+            (TRI->hasVectorRegisters(Src0RC) ||
+             TRI->hasVectorRegisters(Src1RC))) {
           LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
           TII->moveToVALU(MI, MDT);
         }
Index: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -187,6 +187,7 @@
 
   if (Fold.isImm()) {
     if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
+        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
        AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
                                       ST.hasInv2PiInlineImm())) {
       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
@@ -419,6 +420,71 @@
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
+static bool tryToFoldACImm(const SIInstrInfo *TII,
+                           const MachineOperand &OpToFold,
+                           MachineInstr *UseMI,
+                           unsigned UseOpIdx,
+                           SmallVectorImpl<FoldCandidate> &FoldList) {
+  const MCInstrDesc &Desc = UseMI->getDesc();
+  const MCOperandInfo *OpInfo = Desc.OpInfo;
+  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
+    return false;
+
+  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
+  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+    return false;
+
+  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+    return true;
+  }
+
+  if (!OpToFold.isReg())
+    return false;
+
+  unsigned UseReg = OpToFold.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+    return false;
+
+  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
+        return FC.UseMI == UseMI; }) != FoldList.end())
+    return false;
+
+  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  int64_t Imm;
+  MachineOperand *Op;
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    const MachineOperand &Sub = Def->getOperand(I);
+    if (!Sub.isReg() || Sub.getSubReg())
+      return false;
+    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
+    while (SubDef && !SubDef->isMoveImmediate() &&
+           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
+      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
+    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+      return false;
+    Op = &SubDef->getOperand(1);
+    auto SubImm = Op->getImm();
+    if (I == 1) {
+      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+        return false;
+
+      Imm = SubImm;
+      continue;
+    }
+    if (Imm != SubImm)
+      return false; // Can only fold splat constants
+  }
+
+  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  return true;
+}
+
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
  MachineInstr *UseMI,
@@ -462,6 +528,11 @@
 
       Next = std::next(RSUse);
       MachineInstr *RSUseMI = RSUse->getParent();
+
+      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
+                         RSUse.getOperandNo(), FoldList))
+        continue;
+
       if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
 
@@ -472,6 +543,9 @@
     return;
   }
 
+  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
+    return;
+
   if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
     // Sanity check that this is a stack access.
     // FIXME: Should probably use stack pseudos before frame lowering.
@@ -505,7 +579,7 @@
     if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
         TargetRegisterInfo::isVirtualRegister(SrcReg)) {
       const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
-      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
@@ -523,6 +597,14 @@
       }
     }
 
+    if (DestRC == &AMDGPU::AGPR_32RegClass &&
+        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      CopiesToReplace.push_back(UseMI);
+      return;
+    }
+
     // In order to fold immediates into copies, we need to change the
     // copy to a MOV.
 
@@ -535,14 +617,23 @@
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
         TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
-        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+        TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
         !UseMI->getOperand(1).getSubReg()) {
+      unsigned Size = TII->getOpSize(*UseMI, 1);
       UseMI->getOperand(1).setReg(OpToFold.getReg());
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
       UseMI->getOperand(1).setIsKill(false);
       CopiesToReplace.push_back(UseMI);
       OpToFold.setIsKill(false);
+      if (Size != 4)
+        return;
+      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
       return;
     }
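The heart of tryToFoldACImm above is a splat test over a REG_SEQUENCE: folding an immediate into an accumulator operand is legal only when every element of the sequence was materialized from one identical, inlinable constant. A standalone sketch of that check, with a deliberately rough stand-in for the inline-constant test (names and ranges here are illustrative, not the LLVM API):

```cpp
// Standalone model of the "can only fold splat constants" rule above.
#include <cstdint>
#include <optional>
#include <vector>

// Rough model of an inline-constant check; the real test also admits
// selected floating-point encodings.
bool isInlinable(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

std::optional<int64_t> getSplatFoldableImm(const std::vector<int64_t> &Elts) {
  if (Elts.empty() || !isInlinable(Elts.front()))
    return std::nullopt;
  for (int64_t V : Elts)
    if (V != Elts.front())
      return std::nullopt;   // not a splat: leave the REG_SEQUENCE alone
  return Elts.front();       // fold this single immediate into the operand
}
```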
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -151,6 +151,10 @@
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
   }
 
+  if (Subtarget->hasMAIInsts()) {
+    addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass);
+  }
+
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
@@ -10194,6 +10198,36 @@
   if (TII->isVOP3(MI.getOpcode())) {
     // Make sure constant bus requirements are respected.
     TII->legalizeOperandsVOP3(MRI, MI);
+
+    // Prefer VGPRs over AGPRs in MAI instructions where possible.
+    // This saves a chain-copy of registers and better balances register
+    // use between vgpr and agpr as agpr tuples tend to be big.
+    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+      unsigned Opc = MI.getOpcode();
+      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+        if (I == -1)
+          break;
+        MachineOperand &Op = MI.getOperand(I);
+        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
+             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
+            !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+            !TRI->isAGPR(MRI, Op.getReg()))
+          continue;
+        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
+        if (!Src || !Src->isCopy() ||
+            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
+          continue;
+        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
+        // All uses of agpr64 and agpr32 can also accept vgpr except for
+        // v_accvgpr_read, but we do not produce agpr reads during selection,
+        // so no use checks are needed.
+        MRI.setRegClass(Op.getReg(), NewRC);
+      }
+    }
+
     return;
   }
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -463,7 +463,7 @@
                                              unsigned OpNo, bool Def) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
-      (Def && !Op.isDef()))
+      (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
     return {-1, -1};
 
   // A use via a PW operand does not need a waitcnt.
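The SIInsertWaitcnts change is a one-line filter: operands living in AGPRs are no longer given a score interval. A minimal standalone model of that early-out (plain structs, illustrative names, not the LLVM API):

```cpp
// Standalone model of getRegInterval's new early-out: AGPR operands are
// excluded from waitcnt score tracking and get the same "untracked"
// sentinel as non-register operands.
#include <utility>

struct OperandInfo {
  bool IsReg;
  bool IsAllocatable;
  bool IsDef;
  bool IsAGPR;
};

std::pair<int, int> regInterval(const OperandInfo &Op, bool WantDef) {
  if (!Op.IsReg || !Op.IsAllocatable || (WantDef && !Op.IsDef) || Op.IsAGPR)
    return {-1, -1}; // untracked, mirrors the diff's early return
  return {0, 1};     // illustrative interval for a tracked 32-bit register
}
```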
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -512,8 +512,11 @@
 
   if (RC == &AMDGPU::VGPR_32RegClass) {
     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
-           AMDGPU::SReg_32RegClass.contains(SrcReg));
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+           AMDGPU::AGPR_32RegClass.contains(SrcReg));
+    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
+                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
       .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
@@ -586,6 +589,78 @@
     return;
   }
 
+  if (RC == &AMDGPU::AGPR_32RegClass) {
+    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+           AMDGPU::AGPR_32RegClass.contains(SrcReg));
+    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
+      // First try to find defining accvgpr_write to avoid temporary registers.
+      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
+        --Def;
+        if (!Def->definesRegister(SrcReg, &RI))
+          continue;
+        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+          break;
+
+        MachineOperand &DefOp = Def->getOperand(1);
+        assert(DefOp.isReg() || DefOp.isImm());
+
+        if (DefOp.isReg()) {
+          // Check that the register source operand is not clobbered before MI.
+          // Immediate operands are always safe to propagate.
+          bool SafeToPropagate = true;
+          for (auto I = Def; I != MI && SafeToPropagate; ++I)
+            if (I->modifiesRegister(DefOp.getReg(), &RI))
+              SafeToPropagate = false;
+
+          if (!SafeToPropagate)
+            break;
+
+          DefOp.setIsKill(false);
+        }
+
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+          .add(DefOp);
+        return;
+      }
+
+      RegScavenger RS;
+      RS.enterBasicBlock(MBB);
+      RS.forward(MI);
+
+      // Ideally we want to have three registers for a long reg_sequence copy
+      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
+      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+                                                 *MBB.getParent());
+
+      // Registers in the sequence are allocated contiguously so we can just
+      // use register number to pick one of three round-robin temps.
+      unsigned RegNo = DestReg % 3;
+      unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+      if (!Tmp)
+        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
+      RS.setRegUsed(Tmp);
+      // Only loop through if there are any free registers left, otherwise
+      // the scavenger may report a fatal error when there is no emergency
+      // spill slot, or emit a spill when there is one.
+      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+          break;
+        Tmp = Tmp2;
+        RS.setRegUsed(Tmp);
+      }
+      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+        .addReg(Tmp, RegState::Kill);
+      return;
+    }
+
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
   unsigned EltSize = 4;
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   if (RI.isSGPRClass(RC)) {
@@ -602,6 +677,11 @@
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
+  } else if (RI.hasAGPRs(RC)) {
+    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
   }
 
   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
@@ -614,6 +694,12 @@
     else
       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
 
+    if (Opcode == TargetOpcode::COPY) {
+      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
+                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
+      continue;
+    }
+
     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode),
       RI.getSubReg(DestReg, SubIdx));
 
@@ -862,6 +948,8 @@
 
 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+  if (RI.hasAGPRs(DstRC))
+    return AMDGPU::COPY;
   if (RI.getRegSizeInBits(*DstRC) == 32) {
     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
@@ -1922,7 +2010,7 @@
     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
 
     // Limit to equal cost for branch vs. N v_cndmask_b32s.
-    return !RI.isSGPRClass(RC) && NumInsts <= 6;
+    return RI.hasVGPRs(RC) && NumInsts <= 6;
   }
   case SCC_TRUE:
   case SCC_FALSE: {
@@ -2056,6 +2144,8 @@
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::COPY:
+  case AMDGPU::V_ACCVGPR_WRITE_B32:
+  case AMDGPU::V_ACCVGPR_READ_B32:
     return true;
   default:
     return false;
@@ -2108,6 +2198,7 @@
 
   case AMDGPU::V_MOV_B32_e32:
   case AMDGPU::S_MOV_B32:
+  case AMDGPU::V_ACCVGPR_WRITE_B32:
     break;
   }
 
@@ -2121,6 +2212,11 @@
   if (Opc == AMDGPU::COPY) {
     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+    if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
+      if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+        return false;
+      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
+    }
     UseMI.setDesc(get(NewOpc));
     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
@@ -2628,7 +2724,9 @@
   case AMDGPU::OPERAND_REG_IMM_INT32:
   case AMDGPU::OPERAND_REG_IMM_FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
-  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
@@ -2641,7 +2739,9 @@
   case AMDGPU::OPERAND_REG_IMM_INT16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -2657,7 +2757,9 @@
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     uint32_t Trunc = static_cast<uint32_t>(Imm);
     return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
   }
@@ -3026,7 +3128,11 @@
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
     const MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
       ErrInfo = "Illegal immediate value for operand.";
@@ -3475,9 +3581,12 @@
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::WQM: return AMDGPU::WQM;
   case AMDGPU::WWM: return AMDGPU::WWM;
-  case AMDGPU::S_MOV_B32:
-    return MI.getOperand(1).isReg() ?
+  case AMDGPU::S_MOV_B32: {
+    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+    return MI.getOperand(1).isReg() ||
+           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+  }
   case AMDGPU::S_ADD_I32:
     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
   case AMDGPU::S_ADDC_U32:
@@ -3755,27 +3864,24 @@
   unsigned Opc = MI.getOpcode();
   const MCInstrDesc &InstrDesc = get(Opc);
 
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
   MachineOperand &Src1 = MI.getOperand(Src1Idx);
 
   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
   // we need to only have one constant bus use before GFX10.
bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; - if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - if (Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || - isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) - legalizeOpWithMove(MI, Src0Idx); - } + if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && + Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || + isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) + legalizeOpWithMove(MI, Src0Idx); // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for // both the value to write (src0) and lane select (src1). Fix up non-SGPR // src0/src1 with V_READFIRSTLANE. if (Opc == AMDGPU::V_WRITELANE_B32) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3793,6 +3899,13 @@ return; } + // No VOP2 instructions support AGPRs. + if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + + if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) + legalizeOpWithMove(MI, Src1Idx); + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3820,9 +3933,6 @@ return; } - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. // @@ -3923,6 +4033,12 @@ continue; } + if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && + !isOperandLegal(MI, Idx, &MO)) { + legalizeOpWithMove(MI, Idx); + continue; + } + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal @@ -3949,6 +4065,15 @@ unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (RI.hasAGPRs(VRC)) { + VRC = RI.getEquivalentVGPRClass(VRC); + unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(TargetOpcode::COPY), NewSrcReg) + .addReg(SrcReg); + SrcReg = NewSrcReg; + } + if (SubRegs == 1) { BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), DstReg) @@ -4258,7 +4383,7 @@ continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { + if (RI.hasVectorRegisters(OpRC)) { VRC = OpRC; } else { SRC = OpRC; @@ -4271,7 +4396,8 @@ if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? 
RI.getEquivalentAGPRClass(SRC)
+                                  : RI.getEquivalentVGPRClass(SRC);
       }
       RC = VRC;
     } else {
@@ -4340,7 +4466,7 @@
   // Legalize SI_INIT_M0
   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
     MachineOperand &Src = MI.getOperand(0);
-    if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
     return;
   }
@@ -5342,7 +5468,7 @@
       break;
     }
 
-    if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) {
+    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
       Worklist.insert(&UseMI);
 
       do {
@@ -5449,14 +5575,26 @@
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
-  case AMDGPU::WWM:
-    if (RI.hasVGPRs(NewDstRC))
-      return nullptr;
+  case AMDGPU::WWM: {
+    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
+    if (RI.hasAGPRs(SrcRC)) {
+      if (RI.hasAGPRs(NewDstRC))
+        return nullptr;
+
+      NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+      if (!NewDstRC)
+        return nullptr;
+    } else {
+      if (RI.hasVGPRs(NewDstRC))
+        return nullptr;
+
+      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+      if (!NewDstRC)
+        return nullptr;
+    }
 
-    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
-    if (!NewDstRC)
-      return nullptr;
     return NewDstRC;
+  }
   default:
     return NewDstRC;
   }
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -891,6 +891,16 @@
   (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
 >;
 
+foreach Index = 0-31 in {
+  def Extract_Element_v32i32_#Index : Extract_Element <
+    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Insert_Element_v32i32_#Index : Insert_Element <
+    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
 // FIXME: Why do only some of these type combinations for SReg and
 // VReg?
 // 16-bit bitcast
Index: llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -253,8 +253,8 @@
 }
 
 // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
-// repor tthe register as unavailable because a super-register with a lane mask
-// as unavailable.
+// report the register as unavailable because a super-register with a lane mask
+// is unavailable.
 static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
   for (MachineBasicBlock *Succ : MBB.successors()) {
     if (Succ->isLiveIn(Reg))
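Stepping back to the getDestEquivalentVALUClass change in SIInstrInfo above: when a REG_SEQUENCE, INSERT_SUBREG, WQM or WWM result is moved to the VALU, the new destination class now follows the source's register file. A standalone sketch of that choice (plain enums, not the LLVM API; names are illustrative):

```cpp
// Standalone model: the new VALU destination file tracks the source file.
// An accumulator input keeps the result in the accumulator file; anything
// else is promoted to an ordinary VGPR class, as in the diff above.
enum File { SGPR, VGPR, AGPR };

File newVALUDestFile(File SrcFile) {
  return SrcFile == AGPR ? AGPR : VGPR;
}
```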
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,8 +29,10 @@
 private:
   unsigned SGPRSetID;
   unsigned VGPRSetID;
+  unsigned AGPRSetID;
   BitVector SGPRPressureSets;
   BitVector VGPRPressureSets;
+  BitVector AGPRPressureSets;
   bool SpillSGPRToVGPR;
   bool SpillSGPRToSMEM;
   bool isWave32;
@@ -129,7 +131,7 @@
 
   /// \returns true if this class contains only SGPR registers
   bool isSGPRClass(const TargetRegisterClass *RC) const {
-    return !hasVGPRs(RC);
+    return !hasVGPRs(RC) && !hasAGPRs(RC);
   }
 
   /// \returns true if this class ID contains only SGPR registers
@@ -149,10 +151,22 @@
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;
 
+  /// \returns true if this class contains AGPR registers.
+  bool hasAGPRs(const TargetRegisterClass *RC) const;
+
+  /// \returns true if this class contains any vector registers.
+  bool hasVectorRegisters(const TargetRegisterClass *RC) const {
+    return hasVGPRs(RC) || hasAGPRs(RC);
+  }
+
   /// \returns A VGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentVGPRClass(
                                           const TargetRegisterClass *SRC) const;
 
+  /// \returns An AGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentAGPRClass(
+                                          const TargetRegisterClass *SRC) const;
+
   /// \returns A SGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentSGPRClass(
                                            const TargetRegisterClass *VRC) const;
@@ -190,10 +204,15 @@
 
   unsigned getSGPRPressureSet() const { return SGPRSetID; };
   unsigned getVGPRPressureSet() const { return VGPRSetID; };
+  unsigned getAGPRPressureSet() const { return AGPRSetID; };
 
   const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
                                                unsigned Reg) const;
   bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+  bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+  bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
+  }
 
   virtual bool
   isDivergentRegClass(const TargetRegisterClass *RC) const override {
@@ -201,10 +220,16 @@
   }
 
   bool isSGPRPressureSet(unsigned SetID) const {
-    return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+    return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) &&
+           !AGPRPressureSets.test(SetID);
   }
   bool isVGPRPressureSet(unsigned SetID) const {
-    return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+    return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+           !AGPRPressureSets.test(SetID);
+  }
+  bool isAGPRPressureSet(unsigned SetID) const {
+    return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+           !VGPRPressureSets.test(SetID);
   }
 
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -62,6 +62,7 @@
   AMDGPURegisterInfo(),
   SGPRPressureSets(getNumRegPressureSets()),
   VGPRPressureSets(getNumRegPressureSets()),
+  AGPRPressureSets(getNumRegPressureSets()),
   SpillSGPRToVGPR(false),
   SpillSGPRToSMEM(false),
   isWave32(ST.isWave32()) {
@@ -74,10 +75,12 @@
 
   SGPRSetID = NumRegPressureSets;
   VGPRSetID = NumRegPressureSets;
+  AGPRSetID = NumRegPressureSets;
 
   for (unsigned i = 0; i < NumRegPressureSets; ++i) {
     classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
     classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
+    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
   }
 
   // Determine the number of reg units for each pressure set.
@@ -89,7 +92,7 @@ } } - unsigned VGPRMax = 0, SGPRMax = 0; + unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; for (unsigned i = 0; i < NumRegPressureSets; ++i) { if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { VGPRSetID = i; @@ -100,10 +103,16 @@ SGPRSetID = i; SGPRMax = PressureSetRegUnits[i]; } + if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { + AGPRSetID = i; + AGPRMax = PressureSetRegUnits[i]; + continue; + } } assert(SGPRSetID < NumRegPressureSets && - VGPRSetID < NumRegPressureSets); + VGPRSetID < NumRegPressureSets && + AGPRSetID < NumRegPressureSets); } unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( @@ -1327,18 +1336,25 @@ static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, + &AMDGPU::AGPR_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64RegClass, &AMDGPU::VReg_96RegClass, &AMDGPU::SReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128RegClass, &AMDGPU::VReg_160RegClass, &AMDGPU::SReg_160RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512RegClass, + &AMDGPU::SReg_1024RegClass, + &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, &AMDGPU::Pseudo_SReg_128RegClass, @@ -1373,6 +1389,33 @@ return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } +} + +bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: + return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; + case 96: + return false; + case 128: + return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; + case 160: + case 256: + return false; + case 512: + return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); } @@ -1395,6 +1438,26 @@ return &AMDGPU::VReg_256RegClass; case 512: return &AMDGPU::VReg_512RegClass; + case 1024: + return &AMDGPU::VReg_1024RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const { + switch (getRegSizeInBits(*SRC)) { + case 32: + return &AMDGPU::AGPR_32RegClass; + case 64: + return &AMDGPU::AReg_64RegClass; + case 128: + return &AMDGPU::AReg_128RegClass; + case 512: + return &AMDGPU::AReg_512RegClass; + case 1024: + return &AMDGPU::AReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1417,6 +1480,8 @@ return &AMDGPU::SReg_256RegClass; case 512: return &AMDGPU::SReg_512RegClass; + case 1024: + return &AMDGPU::SReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1443,7 +1508,23 @@ return &AMDGPU::SReg_160RegClass; case 8: return &AMDGPU::SReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::SReg_512RegClass; + case 32: /* fall-through */ + default: + llvm_unreachable("Invalid 
sub-register class size");
     }
+  } else if (hasAGPRs(RC)) {
+    switch (Count) {
+    case 1:
+      return &AMDGPU::AGPR_32RegClass;
+    case 2:
+      return &AMDGPU::AReg_64RegClass;
+    case 4:
+      return &AMDGPU::AReg_128RegClass;
+    case 16:
+      return &AMDGPU::AReg_512RegClass;
+    case 32: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
@@ -1461,7 +1542,9 @@
       return &AMDGPU::VReg_160RegClass;
     case 8:
       return &AMDGPU::VReg_256RegClass;
-    case 16: /* fall-through */
+    case 16:
+      return &AMDGPU::VReg_512RegClass;
+    case 32: /* fall-through */
     default:
       llvm_unreachable("Invalid sub-register class size");
     }
@@ -1509,6 +1592,17 @@
 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                    unsigned EltSize) const {
   if (EltSize == 4) {
+    static const int16_t Sub0_31[] = {
+      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
+    };
+
     static const int16_t Sub0_15[] = {
       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
@@ -1552,12 +1646,25 @@
       return makeArrayRef(Sub0_7);
     case 512:
       return makeArrayRef(Sub0_15);
+    case 1024:
+      return makeArrayRef(Sub0_31);
     default:
       llvm_unreachable("unhandled register size");
     }
   }
 
   if (EltSize == 8) {
+    static const int16_t Sub0_31_64[] = {
+      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
+      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
+      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
+      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
+    };
+
     static const int16_t Sub0_15_64[] = {
       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
@@ -1584,12 +1691,26 @@
       return makeArrayRef(Sub0_7_64);
     case 512:
       return makeArrayRef(Sub0_15_64);
+    case 1024:
+      return makeArrayRef(Sub0_31_64);
     default:
       llvm_unreachable("unhandled register size");
     }
   }
 
   if (EltSize == 16) {
+
+    static const int16_t Sub0_31_128[] = {
+      AMDGPU::sub0_sub1_sub2_sub3,
+      AMDGPU::sub4_sub5_sub6_sub7,
+      AMDGPU::sub8_sub9_sub10_sub11,
+      AMDGPU::sub12_sub13_sub14_sub15,
+      AMDGPU::sub16_sub17_sub18_sub19,
+      AMDGPU::sub20_sub21_sub22_sub23,
+      AMDGPU::sub24_sub25_sub26_sub27,
+      AMDGPU::sub28_sub29_sub30_sub31
+    };
+
     static const int16_t Sub0_15_128[] = {
       AMDGPU::sub0_sub1_sub2_sub3,
       AMDGPU::sub4_sub5_sub6_sub7,
@@ -1609,6 +1730,8 @@
       return makeArrayRef(Sub0_7_128);
     case 512:
       return makeArrayRef(Sub0_15_128);
+    case 1024:
+      return makeArrayRef(Sub0_31_128);
     default:
       llvm_unreachable("unhandled register size");
     }
@@ -1616,6 +1739,13 @@
 
   assert(EltSize == 32 && "unhandled elt size");
 
+  static const int16_t Sub0_31_256[] = {
+    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
+    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
+    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
+  };
+
   static const int16_t Sub0_15_256[] = {
     AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
     AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
@@ -1626,6 +1756,8 @@
     return {};
   case 512:
    return makeArrayRef(Sub0_15_256);
+  case 1024:
+    return makeArrayRef(Sub0_31_256);
   default:
     llvm_unreachable("unhandled register size");
   }
@@ -1647,6 +1779,13 @@
   return hasVGPRs(RC);
 }
 
+bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
+                            unsigned Reg) const {
+  const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
+  assert(RC && "Register class for the reg not found");
+  return hasAGPRs(RC);
+}
+
 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                     const TargetRegisterClass *SrcRC,
                                     unsigned SubReg,
@@ -1688,7 +1827,7 @@
 
 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                 unsigned Idx) const {
-  if (Idx == getVGPRPressureSet())
+  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                const_cast<MachineFunction &>(MF));
 
@@ -1739,7 +1878,7 @@
                                     &AMDGPU::SReg_32_XM0RegClass;
   case 64:
     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
-                                                  &AMDGPU::SReg_64_XEXECRegClass;
+                                                 &AMDGPU::SReg_64_XEXECRegClass;
   case 96:
     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
                                                  &AMDGPU::SReg_96RegClass;
Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -388,7 +388,7 @@
       unsigned Reg = MO.getReg();
 
       if (!TRI->isVirtualRegister(Reg) &&
-          TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+          TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
         Flags = StateWQM;
         break;
       }
Index: llvm/trunk/test/CodeGen/AMDGPU/accvgpr-copy.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -0,0 +1,132 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: a_to_v
+# GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+---
+name: a_to_v
+body: |
+  bb.0:
+    $vgpr0 = COPY killed $agpr0, implicit $exec
+...
+
+# GCN-LABEL: name: a4_to_v4
+# GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+# GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+# GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+# GCN: $vgpr3 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+---
+name: a4_to_v4
+body: |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
+...
+
+# GCN-LABEL: name: a16_to_v16
+# GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+# GCN: $vgpr15 = V_ACCVGPR_READ_B32 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+---
+name: a16_to_v16
+body: |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
...
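The test cases above and below pin down how the postrapseudos expansion picks an opcode per copy direction. A standalone summary of that mapping (enums and strings here are illustrative; per the expansions shown, v_accvgpr_write only takes a VGPR or inline-immediate source, so SGPR and AGPR sources bounce through a VGPR temporary):

```cpp
// Standalone model of the copy-expansion choices checked by this test.
#include <string>

enum File { SGPR, VGPR, AGPR };

std::string lowerCopy(File Dst, File Src) {
  if (Dst == VGPR && Src == AGPR) return "v_accvgpr_read";
  if (Dst == AGPR && Src == VGPR) return "v_accvgpr_write";
  if (Dst == AGPR)                 // SGPR or AGPR source: bounce via a VGPR
    return "copy-to-vgpr-temp; v_accvgpr_write";
  return "v_mov_b32 / s_mov_b32";
}
```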
+
+# GCN-LABEL: name: v_to_a
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+---
+name: v_to_a
+body: |
+  bb.0:
+    $agpr0 = COPY killed $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: v4_to_a4
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+# GCN: $agpr2 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+# GCN: $agpr3 = V_ACCVGPR_WRITE_B32 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+---
+name: v4_to_a4
+body: |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+...
+
+# GCN-LABEL: name: v16_to_a16
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+# GCN: $agpr15 = V_ACCVGPR_WRITE_B32 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+---
+name: v16_to_a16
+body: |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
+...
+
+# GCN-LABEL: name: s_to_a
+# GCN: $vgpr[[TMP:[0-9]+]] = V_MOV_B32_e32 killed $sgpr0, implicit $exec
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP]], implicit $exec
+---
+name: s_to_a
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+    $agpr0 = COPY killed $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: s2_to_a2
+# GCN: $vgpr[[TMP1:[0-9]+]] = V_MOV_B32_e32 killed $sgpr0, implicit $exec
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP1]], implicit $exec
+# GCN: $vgpr[[TMP2:[0-9]+]] = V_MOV_B32_e32 killed $sgpr1, implicit $exec
+# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP2]], implicit $exec
+---
+name: s2_to_a2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: a_to_a
+# GCN: $vgpr[[TMP:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec
+# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP]], implicit $exec
+---
+name: a_to_a
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $agpr1 = IMPLICIT_DEF
+    $agpr0 = COPY killed $agpr1, implicit $exec
+...
+
+# GCN-LABEL: name: a2_to_a2
+# GCN: $vgpr[[TMP1:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec
+# GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP1]], implicit $exec
+# GCN: $vgpr[[TMP2:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP2]], implicit $exec
+---
+name: a2_to_a2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $agpr0_agpr1 = IMPLICIT_DEF
+    $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec
+...
+
+# GCN-LABEL: name: a_to_a_spill
+# Using the last vgpr, vgpr255, will raise an error about the absence of an emergency spill slot.
+ +# GCN: $vgpr255 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr255, implicit $exec + +--- +name: a_to_a_spill +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 + + $agpr1 = IMPLICIT_DEF + $agpr0 = COPY killed $agpr1, implicit $exec +... 
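a_to_a_spill makes every VGPR but $vgpr255 live in, so the copy expansion must get by with a single scavenged temporary. A standalone sketch of the guarded scavenging loop it exercises (hypothetical helper names, not the LLVM RegScavenger API): a first temporary must exist (otherwise a fatal error), and further round-robin temporaries are taken only while unused registers remain, so the scavenger is never pushed into spilling.

```cpp
// Standalone model of the guarded temp-register scavenging from copyPhysReg.
#include <optional>
#include <stdexcept>
#include <vector>

std::optional<unsigned> findUnused(const std::vector<bool> &Used) {
  for (unsigned R = 0; R < Used.size(); ++R)
    if (!Used[R])
      return R;
  return std::nullopt;
}

unsigned pickTemps(unsigned Rotations, std::vector<bool> &Used) {
  std::optional<unsigned> Tmp = findUnused(Used);
  if (!Tmp)
    throw std::runtime_error("Cannot scavenge VGPR to copy to AGPR");
  Used[*Tmp] = true;
  while (Rotations-- && findUnused(Used)) { // stop early when nothing is free
    unsigned Tmp2 = *findUnused(Used);
    Used[Tmp2] = true;
    Tmp = Tmp2;
  }
  return *Tmp;
}
```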
Index: llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_32_agprs:
+; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0
+; GCN-NOT: v28
+; GCN: NumVgprs: 32
+; GCN: VGPRBlocks: 7
+define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) {
+bb:
+  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -42,4 +42,21 @@
   ret void
 }
 
+; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy a1 to s9
+define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
+  %agpr = call i32 asm sideeffect "; def $0", "=${a1}"()
+  call void asm sideeffect "; use $0", "${s9}"(i32 %agpr)
+  ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy a[0:1] to s[10:11]
+define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 {
+  %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"()
+  call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr)
+  ret void
+}
+
 attributes #0 = { nounwind }
+attributes #1 = { nounwind "target-cpu"="gfx908" }
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -0,0 +1,1361 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
+declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
+declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
+declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32,
i32, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) +declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 
abid:2 blgp:3 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> 
%mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg 
+ %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x 
half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: +; GCN: s_load_dwordx16 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + 
%c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; 
GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: 
v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: +; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0) + store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: +; GCN: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_16x16x1f32 
a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.2, <16 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: +; GCN: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.2, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} +
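+; In the f16 variant below the splat <4 x half> inputs become packed half constants (0x3c003c00 is <half 1.0, half 1.0>, 0x40004000 is <half 2.0, half 2.0>) materialized with v_mov_b32 into VGPR pairs, while the splat-1.0 accumulator is still folded into the MFMA as an inline immediate.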
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 +; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} +
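+; The _imm tests below use non-splat constant accumulators, which cannot be folded as a single inline operand: each element is materialized into an AGPR with v_accvgpr_write_b32, hence the lone 2.0 (or 1) among the otherwise uniform writes.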
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} +
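+; 0x42f60000 is 123.0, a literal with no inline-constant encoding, so the splat below is materialized once in a VGPR and then copied into each AGPR.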
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat: +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; 
GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/load-constant-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-constant-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i32: ; GCN: s_load_dword s{{[0-9]+}} @@ -402,6 +403,8 @@ ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 +; GCN-NOT: accvgpr + ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 Index: llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2,8 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global 
-verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s - +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}} @@ -560,6 +560,8 @@ ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 +; GCN-NOT: accvgpr + ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; Testing for ds_read/write_128 @@ -268,6 +269,7 @@ ; FUNC-LABEL: {{^}}local_load_v32i32: ; SICIVI: s_mov_b32 m0, -1 ; GFX9-NOT: m0 +; GFX9-NOT: accvgpr define amdgpu_kernel void @local_load_v32i32(<32 x i32> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/mai-inline.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/mai-inline.ll +++ llvm/trunk/test/CodeGen/AMDGPU/mai-inline.ll @@ -0,0 +1,190 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s + +; GCN-LABEL: {{^}}accvgpr_write_read: +; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1 +; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]] +; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off +define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) { +bb: + %in.1 = load float, float addrspace(1)* %arg + %init = tail call float asm "v_accvgpr_write $0, 1", "=a"() + %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init) + store float %read, float addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_avva +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x1f32 
a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_aaaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], a{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}v_mfma_f32_4x4x4f16_aaaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x4f16 a[{{[0-9:]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} +
+; GCN-LABEL: {{^}}v_mfma_f32_16x16x1f32_avaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_16x16x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x 
float> %in.1) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_32x32x1f32_avaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +}