diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1981,6 +1981,56 @@ Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg>, IntrHasSideEffects, IntrWillReturn]>; +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +class AMDGPUWmmaIntrinsic : + Intrinsic< + [CD], // %D + [ + AB, // %A + AB, // %B + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUWmmaIntrinsicOPSEL : + Intrinsic< + [CD], // %D + [ + AB, // %A + AB, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // %high + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>] +>; + +class AMDGPUWmmaIntrinsicIU : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + AB, // %A + llvm_i1_ty, // %B_sign + AB, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -55,6 +55,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_wmmaopselvop3pmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -231,6 +231,7 @@ bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2782,6 +2782,20 @@ return true; } +bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, + SDValue &Src) const { + const ConstantSDNode *C = cast(In); + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcVal = C->getAPIntValue().getZExtValue(); + if (SrcVal == 1) + Mods |= SISrcMods::OP_SEL_0; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -189,6 +189,9 @@ InstructionSelector::ComplexRendererFns selectDotIUVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3733,6 +3733,20 @@ }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( + MachineOperand &Root) const { + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods |= SISrcMods::OP_SEL_0; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4271,6 +4271,12 @@ case Intrinsic::amdgcn_fdot2_f32_bf16: case Intrinsic::amdgcn_sudot4: case Intrinsic::amdgcn_sudot8: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_ubfe: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -575,6 +575,10 @@ if (Res) break; Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address); } while (false); if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -100,6 +100,7 @@ bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixWMMAHazards(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1082,6 +1082,7 @@ } fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); + fixWMMAHazards(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -1673,6 +1674,67 @@ return true; } +bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { + if (!SIInstrInfo::isWMMA(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = 
ST.getRegisterInfo(); + + auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I)) + return false; + + // Src0 or Src1 of the current wmma instruction overlaps with the dest of + // the previous wmma. + const Register CurSrc0Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); + const Register CurSrc1Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); + + const Register PrevDstReg = + TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + + if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || + TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { + return true; + } + + // Src2 of the current wmma instruction overlaps with the dest of the + // previous wmma. + const MachineOperand *Src2 = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register(); + + if (CurSrc2Reg != AMDGPU::NoRegister && + TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) { + + const MachineOperand *Src2Mods = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); + const bool NoSrc2Mods = + (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; + // Exception: there is no hazard if the wmma instructions are of the same + // type and there is no input modifier on src2 of the current instruction. + return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) == + TII->pseudoToMCOpcode(MI->getOpcode()))); + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { int NSAtoVMEMWaitStates = 1; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -126,7 +126,10 @@ IsAtomicNoRet = UINT64_C(1) << 57, // Atomic with return. - IsAtomicRet = UINT64_C(1) << 58 + IsAtomicRet = UINT64_C(1) << 58, + + // Is a WMMA instruction. + IsWMMA = UINT64_C(1) << 59, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -147,6 +147,9 @@ // Atomic with return. field bit IsAtomicRet = 0; + // This bit indicates that this is one of WMMA instructions. + field bit IsWMMA = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -224,6 +227,8 @@ let TSFlags{58} = IsAtomicRet; + let TSFlags{59} = IsWMMA; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -673,6 +673,14 @@ return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } + static bool isWMMA(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA; + } + + bool isWMMA(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsWMMA; + } + bool isDOT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3261,6 +3261,21 @@ return MIB; } + if (SIInstrInfo::isWMMA(MI)) { + unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .setMIFlags(MI.getFlags()); + for (unsigned I = 0, E = MI.getDesc().getNumOperands(); I != E; ++I) + MIB->addOperand(MI.getOperand(I)); + MIB.copyImplicitOps(MI); + + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + + return MIB; + } + // Handle MAC/FMAC. bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1497,6 +1497,7 @@ def VOP3PModsDOT : ComplexPattern; def DotIUVOP3PMods : ComplexPattern; +def WMMAOpSelVOP3PMods : ComplexPattern; def VOP3OpSel : ComplexPattern; @@ -2475,6 +2476,7 @@ field bit IsVOP3P = 0; field bit IsDOT = 0; field bit IsSingle = 0; + field bit IsWMMA = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1076,6 +1076,18 @@ let DecoderMethod = "DecodeVS_32RegisterClass"; } +def VRegSrc_64 : RegisterOperand { + let DecoderMethod = "decodeOperand_VReg_64"; +} + +def VRegSrc_128 : RegisterOperand { + let DecoderMethod = "decodeOperand_VReg_128"; +} + +def VRegSrc_256 : RegisterOperand { + let DecoderMethod = "decodeOperand_VReg_256"; +} + //===----------------------------------------------------------------------===// // VGPRSrc_* //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -368,6 +368,11 @@ LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); +struct WMMAOpcodeMappingInfo { + unsigned Opcode2Addr; + unsigned Opcode3Addr; +}; + LLVM_READONLY const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); @@ -477,6 +482,12 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); +LLVM_READONLY +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned 
Opc); + +LLVM_READONLY +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -293,6 +293,10 @@ #define GET_VOPC64DPPTable_IMPL #define GET_VOPC64DPP8Table_DECL #define GET_VOPC64DPP8Table_IMPL +#define GET_WMMAOpcode2AddrMappingTable_DECL +#define GET_WMMAOpcode2AddrMappingTable_IMPL +#define GET_WMMAOpcode3AddrMappingTable_DECL +#define GET_WMMAOpcode3AddrMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -394,6 +398,16 @@ return Info ? Info->is_gfx940_xdl : false; } +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); + return Info ? Info->Opcode3Addr : ~0u; +} + +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc); + return Info ? Info->Opcode2Addr : ~0u; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -673,6 +673,159 @@ def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; +class VOPProfileWMMA : VOP3P_Profile

{ + let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128); + let Src0RC64 = _Src01RC64; + let Src1RC64 = _Src01RC64; + let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32); + let HasClamp = _HasClamp; + let HasOpSel = _HasOpSel; + let IsWMMA = 1; +} + +def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile <[v8f32, v8f32, v8f32, v8f32]>; +def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile <[v8f32, v8i32, v8i32, v8f32]>; +def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>; +def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>; +def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile <[v8i32, v8i32, v8i32, v8i32]>; + +def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile <[v4f32, v8f32, v8f32, v4f32]>; +def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>; +def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>; +def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>; +def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile <[v4i32, v8i32, v8i32, v4i32]>; + +class WMMAType val> { + bit hasClamp = val{0}; + bit hasOpsel = val{1}; +} + +def WMMARegular : WMMAType<0b00>; +def WMMAUIClamp : WMMAType<0b01>; +def WMMAOpSel : WMMAType<0b10>; + +class WMMARegularPat : + GCNPat < (P.DstVT (node + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, $src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAOpSelPat : + GCNPat < (P.DstVT (node + (P.Src0VT P.Src0VT:$src0), + (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers) + )), + (P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAUIClampPat : + GCNPat < (P.DstVT (node + (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), + (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) +>; + +class WMMAOpcodeMapping { + Instruction Opcode2Addr = TwoAddr; + Instruction Opcode3Addr = ThreeAddr; + Predicate WaveSizePredicate; +} + +def WMMAOpcode : GenericEnum { + let FilterClass = "VOP3P_Pseudo"; +} + +class WMMAMappingTable : GenericTable { + let FilterClass = "WMMAOpcodeMapping"; + let CppTypeName = "WMMAOpcodeMappingInfo"; + let Fields = ["Opcode2Addr", "Opcode3Addr"]; + string TypeOf_Opcode2Addr = "WMMAOpcode"; + string TypeOf_Opcode3Addr = "WMMAOpcode"; +} + +def WMMAOpcode2AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode2Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode"; +} + +def WMMAOpcode3AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode3Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode"; +} + +// The WMMA instruction has extra constraints: +// Matrices A and B cannot overlap with D. C cannot partially overlap with D, +// but it is OK for them to be the same (which is a typical case). +// +// We implement it as follows: +// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2). +// 2) The pass twoaddressinstruction checks if src2 is live and if that is the case +// it converts the default pseudo to the pseudo where src2 is not the same as vdst. 
+// 3) @earlyclobber on the destination satisfies the constraint during RA. + +multiclass WMMAInst { + + defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2"; + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + + defvar WMMAProfile = VOPProfileWMMA; + if !eq(Suffix, "_w32") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w32 : VOP3P_Pseudo; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w32 : VOP3P_Pseudo; + } + } + def : WMMAOpcodeMapping(NAME # _twoaddr_w32), + !cast(NAME # _threeaddr_w32)>; + } else if !eq(Suffix, "_w64") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w64 : VOP3P_Pseudo; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w64 : VOP3P_Pseudo; + } + } + def : WMMAOpcodeMapping(NAME # _twoaddr_w64), + !cast(NAME # _threeaddr_w64)>; + } + + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } +} + +let WaveSizePredicate = isWave32 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; +} + +let WaveSizePredicate = isWave64 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; +} + 
//===----------------------------------------------------------------------===// // Begin Real Encodings //===----------------------------------------------------------------------===// @@ -743,6 +896,22 @@ defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>; defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>; +multiclass VOP3P_Real_WMMA op> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in { + defm _twoaddr_w32 : VOP3P_Real_gfx11 ; + } + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in { + defm _twoaddr_w64 : VOP3P_Real_gfx11 ; + } +} + +defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>; +defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>; +defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>; +defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>; +defm V_WMMA_I32_16X16X16_IU8 : VOP3P_Real_WMMA <0x044>; +defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>; + //===----------------------------------------------------------------------===// // GFX8 (VI) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -102,6 +102,7 @@ let VOP3_OPSEL = isVop3OpSel; let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; + let IsWMMA = P.IsWMMA; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -187,7 +188,11 @@ // XXX - Is there any reason to distinguish this from regular VOP3 // here? class VOP3P_Real : - VOP3_Real; + VOP3_Real { + + // The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction. + let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints); +} class VOP3a : Enc64 { bits<4> src0_modifiers; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll @@ -0,0 +1,331 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32 + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f32_16x16x16_f16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x 
float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f32_16x16x16_bf16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void 
@test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: 
test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: 
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: 
global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64 + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg) + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f32_16x16x16_f16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f32_16x16x16_bf16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], 
v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm 
+bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 
v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], 
off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll @@ -0,0 +1,331 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32 + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f32_16x16x16_f16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f32_16x16x16_bf16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W32: ; %bb.0: ; %bb +; 
W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], 
off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x 
i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> 
addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64 + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg) + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f32_16x16x16_f16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], 
v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f32_16x16x16_bf16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res 
= call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + 
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x 
i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir @@ -0,0 +1,136 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -start-after postrapseudos -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32: +# GCN: v_wmma_f32_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23] +--- +name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32: +# GCN: v_wmma_f32_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23] +--- +name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, 
$vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32: +# GCN: v_wmma_f16_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23] +--- +name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32: +# GCN: v_wmma_bf16_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23] +--- +name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32: +# GCN: v_wmma_i32_16x16x16_iu8 v[26:33], v[0:3], v[4:7], v[8:15] +--- +name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec +--- + + +# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32: +# GCN: v_wmma_i32_16x16x16_iu4 v[26:33], v[0:1], v[2:3], v[8:15] +--- +name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, 
$vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec + + +# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64: +# GCN: v_wmma_f32_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19] +--- +name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64: +# GCN: v_wmma_f32_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19] +--- +name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64: +# GCN: v_wmma_f16_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19] +--- +name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64: +# GCN: v_wmma_bf16_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19] +--- +name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, 
implicit $exec +--- + +# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64: +# GCN: v_wmma_i32_16x16x16_iu8 v[26:29], v[0:3], v[4:7], v[8:11] +--- +name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec +--- + + +# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64: +# GCN: v_wmma_i32_16x16x16_iu4 v[26:29], v[0:1], v[2:3], v[8:11] +--- +name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 + early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec + + diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -0,0 +1,217 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec + +... 
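+
+# As in the two tests above, each case below feeds a V_WMMA_*_twoaddr pseudo
+# through the twoaddressinstruction pass and expects it to come out as the
+# matching V_WMMA_*_threeaddr opcode with its operand list unchanged.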
+ +# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 +# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 +registers: + - { id: 0, class: vreg_256 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_256 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec + +... 
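+
+# The _w64 variants above and below are the wave64 counterparts: the
+# accumulator operands use vreg_128 instead of vreg_256, which lines up with
+# the <4 x float> and <4 x i32> overloads exercised by llvm.amdgcn.wmma_64.ll
+# earlier in this patch.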
+ +# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_256 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec + +... + +# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 +# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec + +--- +name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_128 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir @@ -0,0 +1,159 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_A +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_A + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec +... 
+--- + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_B +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_B + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec +... 
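+
+# The two back-to-back cases above expect a single V_NOP_e32 between the
+# WMMAs because the second WMMA reads the first one's D as its A or B operand.
+# The valu_inbetween variants below cover the complementary case: an
+# unrelated VALU instruction (a V_MOV_B32 here) already separates the two
+# WMMAs, so no extra V_NOP_e32 should be inserted.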
+--- + +--- +name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 0, implicit $exec + early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec +... 
+---
+
+---
+name: valu_inbetween_WMMA1_D_overlaps_WMMA2_B
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+ ; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_B
+ ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
+ early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 0, implicit $exec
+ early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
+...
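+
+# The next pair repeats the overlap pattern for the C operand: WMMA1's D
+# feeds WMMA2's C, so the back-to-back sequence is expected to get a
+# V_NOP_e32, while an intervening VALU instruction is expected to be enough
+# on its own.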
+--- + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+--- + +--- +name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
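+
+# The remaining cases cover what looks like the one exception: if the second
+# WMMA merely accumulates into the first one's D (D only overlaps C) and the
+# source modifiers are unchanged, no V_NOP_e32 is expected, whether the
+# two-address or the three-address form is used. Changing the src2 modifiers
+# (11 instead of 8) is expected to bring the V_NOP_e32 back.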
+--- + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+--- + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
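+# In the next test src2 (C) of the second WMMA carries input modifiers (11 rather than the plain 8), so a V_NOP is expected even though both WMMAs use the same pseudo.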
+--- + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
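+# The next test mixes two different WMMA opcodes (f16 then bf16), so the D-over-C overlap is a hazard and a V_NOP is expected.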
+--- + + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
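+# The next test combines different WMMA opcodes with an input modifier on src2 of the second instruction; a V_NOP is expected here as well.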
+--- + + +--- +name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod + ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... +--- diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll @@ -0,0 +1,473 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32 + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) + +; The tests demonstrate that the following WMMA register constraints are satisfied. +; +; v_wmma D, A, B, C +; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case). 
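+; +; For illustration only (a sketch with hypothetical register ranges, not output that is checked below): +; v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; OK: C and D are identical +; v_wmma_f32_16x16x16_f16 v[18:25], v[0:7], v[8:15], v[16:23] ; violates the constraint: C partially overlaps D +; v_wmma_f32_16x16x16_f16 v[16:23], v[16:23], v[8:15], v[24:31] ; violates the constraint: A overlaps D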
+; +; In each test, +; - first wmma instruction: the dest register D is different from all the sources +; - second wmma instruction: the dest register D and src2 (C) are the same + + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_f32_16x16x16_f16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] +; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C) + %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_f32_16x16x16_bf16: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] +; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C) + %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0) + %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void
@test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1) + %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1) + store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32 + store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 +; W32-NEXT: global_store_b128 v[24:25], v[28:31], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: 
test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], 
v[4:7], v[8:15] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; 
W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; W32-NEXT: global_store_b128 v[16:17], v[20:23], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 +; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; 
W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, 
i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) { +; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W32: ; %bb.0: ; %bb +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp +; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 +; W32-NEXT: global_store_b128 v[12:13], v[16:19], off +; W32-NEXT: s_clause 0x1 +; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 +; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) + %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> 
%C, i1 1) + store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32 + store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll @@ -0,0 +1,385 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64 + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg) + +; The tests demonstrate that the following WMMA register constraints are satisfied. +; +; v_wmma D, A, B, C +; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case). +; +; In each test, +; - first wmma instruction: the dest register D is different from all the sources +; - second wmma instruction: the dest register D and src2 (C) are the same + + +; @llvm.amdgcn.wmma.f32.16x16x16.f16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_f32_16x16x16_f16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] +; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C) + %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f32.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_f32_16x16x16_bf16: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] +; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C) + %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 + ret void +} + +; @llvm.amdgcn.wmma.f16.16x16x16.f16 + +define amdgpu_ps
void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0) + %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_f16_16x16x16_f16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1) + %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1) + store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 + store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 + ret void +} + +; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: global_store_b128 v[20:21], v[24:27], off +; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> 
addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu8 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], 
v[4:7], v[8:11] neg_lo:[1,1,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[12:13], v[16:19], off +; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +; @llvm.amdgcn.wmma.i32.16x16x16.iu4 + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 
x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> 
%C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { +; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: +; W64: ; %bb.0: ; %bb +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp +; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp +; W64-NEXT: global_store_b128 v[8:9], v[12:15], off +; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 + ret void +} + diff --git a/llvm/test/MC/AMDGPU/gfx11_wmma.s b/llvm/test/MC/AMDGPU/gfx11_wmma.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_wmma.s @@ -0,0 +1,461 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +// +// Test v_wmma_f32_16x16x16_f16 +// + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 +// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: 
error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 +// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +// +// Test v_wmma_f32_16x16x16_bf16 +// + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or 
mode + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 +// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 +// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] 
neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +// +// Test v_wmma_f16_16x16x16_f16 +// + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: 
[0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +// +// Test v_wmma_bf16_16x16x16_bf16 +// + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// 
W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; 
encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +// +// Test v_wmma_i32_16x16x16_iu8 +// + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:15], 1, v[4:7], v[8:15] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[8:11], 1, v[4:7], v[8:11] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], 1, v[8:15] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], 1, v[8:11] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c] 
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +// +// Test v_wmma_i32_16x16x16_iu4 +// + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:11], 1, v[2:3], v[4:11] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[4:7], 1, v[2:3], v[4:7] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], 1, v[4:11] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], 1, v[4:7] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a] +// 
W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt @@ -0,0 +1,157 @@ +# RUN: 
llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W32 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W64 %s + + +# Test v_wmma_f32_16x16x16_f16 + +# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b] +# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b] +0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b + +# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c] +# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c] +0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c + +# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c] +# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c] +0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c + +# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c] +# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c] +0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c + + +# Test v_wmma_f32_16x16x16_bf16 + +# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b] +# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b] +0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b + +# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c] +# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c] +0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c + +# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c] +# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c] +0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c + +# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c] +# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c] 
+0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c + + +# Test v_wmma_f16_16x16x16_f16 + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b] +0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c] +0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c] +0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c + +# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c] +# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c] +0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c + + +# Test v_wmma_bf16_16x16x16_bf16 + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b] +0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c] +0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c] +0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: 
[0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c] +0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c + +# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c] +# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c] +0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c + + +# Test v_wmma_i32_16x16x16_iu8 + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c] +0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] +0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c] +0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c] +0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c] +0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c + +# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c] +# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c] +0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c + + +# Test v_wmma_i32_16x16x16_iu4 + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c] +0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a] +0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c] +0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: 
[0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c] +0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c] +0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c + +# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c] +# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c] +0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c +
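+# A short observational summary of the modifier encodings checked above, drawn
+# only from the byte patterns in this file (not from an encoding reference):
+#   - neg_lo/neg_hi on A or B toggle the pair 0x40->0x41 / 0x1c->0x3c (A) and
+#     0x40->0x42 / 0x1c->0x5c (B) in the second and last encoding bytes.
+#   - op_sel:[0,0,1] flips the second byte 0x40->0x60; clamp flips 0x40->0xc0.
+#   - The same byte sequence decodes to twice as many accumulator VGPRs in
+#     wave32 as in wave64 (e.g. v[16:23] vs v[16:19] for an f32/i32 result),
+#     which is why every case is checked under both W32 and W64 prefixes.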