Diff 441415

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Show First 20 Lines • Show All 1,975 Lines • ▼ Show 20 Lines	def int_amdgcn_ds_add_gs_reg_rtn :
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],		Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;		[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_ds_sub_gs_reg_rtn :		def int_amdgcn_ds_sub_gs_reg_rtn :
ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,		ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],		Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;		[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

		// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
		//
		// These operations perform a matrix multiplication and accumulation of
		// the form: D = A * B + C .

		// Basic WMMA intrinsic shape: one result %D of type CD and three operands,
		// %A and %B of type AB plus an accumulator %C whose type is tied to the
		// result through LLVMMatchType<0>. The intrinsic is declared IntrNoMem,
		// IntrConvergent and IntrWillReturn.
		class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
		Intrinsic<
		arsenmUnsubmitted Not Done Reply Inline Actions Missing the clang changes and tests for this arsenm: Missing the clang changes and tests for this
		[CD], // %D
		[
		AB, // %A
		AB, // %B
		LLVMMatchType<0>, // %C
		],
		[IntrNoMem, IntrConvergent, IntrWillReturn]
		>;

		// WMMA intrinsic shape with an extra trailing i1 operand %high.
		// Operands are %A, %B (type AB), %C (same type as the result, via
		// LLVMMatchType<0>) and %high at argument index 3, which must be an
		// immediate (ImmArg<ArgIndex<3>>).
		class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
		Intrinsic<
		[CD], // %D
		[
		AB, // %A
		AB, // %B
		LLVMMatchType<0>, // %C
		llvm_i1_ty, // %high
		],
		[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]
		>;

		// WMMA intrinsic shape with per-operand sign flags and a clamp flag.
		// Argument order: %A_sign (index 0), %A, %B_sign (index 2), %B, %C (same
		// type as the result, via LLVMMatchType<0>) and %clamp (index 5). The three
		// i1 operands at indices 0, 2 and 5 must be immediates (ImmArg).
		class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
		Intrinsic<
		[CD], // %D
		[
		llvm_i1_ty, // %A_sign
		AB, // %A
		llvm_i1_ty, // %B_sign
		AB, // %B
		LLVMMatchType<0>, // %C
		llvm_i1_ty, // %clamp
		],
		[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
		>;

		// Concrete WMMA intrinsics. The first template argument is the type of the
		// %A/%B operands, the second the (overloaded) type of %C/%D.
		// NOTE(review): the f16/bf16 variants below pass v8f32 / v8i32 as the A/B
		// operand type although their names suggest 16-bit inputs - confirm that
		// these are the intended operand types.
		def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v8f32_ty, llvm_anyfloat_ty>;
		def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v8i32_ty, llvm_anyfloat_ty>;
		def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8f32_ty, llvm_anyfloat_ty>;
		def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8i32_ty, llvm_anyint_ty>;
		def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
		def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;


//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Deep learning intrinsics.		// Deep learning intrinsics.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)		// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c		// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :		def int_amdgcn_fdot2 :
ClangBuiltin<"__builtin_amdgcn_fdot2">,		ClangBuiltin<"__builtin_amdgcn_fdot2">,
▲ Show 20 Lines • Show All 321 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

	Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	def gi_vop3pmodsdot :			def gi_vop3pmodsdot :
	GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,			GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
	GIComplexPatternEquiv<VOP3PModsDOT>;			GIComplexPatternEquiv<VOP3PModsDOT>;

	def gi_dotiuvop3pmods :			def gi_dotiuvop3pmods :
	GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,			GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
	GIComplexPatternEquiv<DotIUVOP3PMods>;			GIComplexPatternEquiv<DotIUVOP3PMods>;

				// GlobalISel equivalent of the WMMAOpSelVOP3PMods complex pattern; the
				// operand is matched by the selector function selectWMMAOpSelVOP3PMods.
				def gi_wmmaopselvop3pmods :
				GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
				GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;

	def gi_vop3opselmods :			def gi_vop3opselmods :
	GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,			GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
	GIComplexPatternEquiv<VOP3OpSelMods>;			GIComplexPatternEquiv<VOP3OpSelMods>;

	def gi_vinterpmods :			def gi_vinterpmods :
	GIComplexOperandMatcher<s32, "selectVINTERPMods">,			GIComplexOperandMatcher<s32, "selectVINTERPMods">,
	GIComplexPatternEquiv<VINTERPMods>;			GIComplexPatternEquiv<VINTERPMods>;

	▲ Show 20 Lines • Show All 297 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Show First 20 Lines • Show All 225 Lines • ▼ Show 20 Lines	private:
bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,		bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,
SDValue &Omod) const;		SDValue &Omod) const;

bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods,		bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods,
bool IsDOT = false) const;		bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;

bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;		bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
		// Builds the source-modifier immediate for a WMMA op_sel operand from the
		// constant i1 value In and stores it in Src.
		bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;

bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;

bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,		bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
unsigned &Mods) const;		unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

Show All 32 Lines

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 2,776 Lines • ▼ Show 20 Lines	bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
unsigned SrcSign = C->getAPIntValue().getZExtValue();		unsigned SrcSign = C->getAPIntValue().getZExtValue();
if (SrcSign == 1)		if (SrcSign == 1)
Mods ^= SISrcMods::NEG;		Mods ^= SISrcMods::NEG;

Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);		Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;		return true;
}		}

		/// Complex-pattern selector producing the source-modifier immediate for the
		/// op_sel operand of a WMMA instruction.
		///
		/// \param In  Constant i1 operand (asserted to be 1 bit wide).
		/// \param Src Receives an i32 target constant holding the modifier bits:
		///            SISrcMods::OP_SEL_1 is always set, SISrcMods::OP_SEL_0 is
		///            added when \p In is 1.
		/// \returns always true.
		bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
		                                                  SDValue &Src) const {
		  const ConstantSDNode *C = cast<ConstantSDNode>(In);
		  assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

		  unsigned Mods = SISrcMods::OP_SEL_1;
		  if (C->getAPIntValue().getZExtValue() == 1)
		    Mods |= SISrcMods::OP_SEL_0;

		  Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
		  return true;
		}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,		bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {		SDValue &SrcMods) const {
Src = In;		Src = In;
// FIXME: Handle op_sel		// FIXME: Handle op_sel
SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);		SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
return true;		return true;
}		}

▲ Show 20 Lines • Show All 166 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	private:

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
selectVOP3PModsDOT(MachineOperand &Root) const;		selectVOP3PModsDOT(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
selectDotIUVOP3PMods(MachineOperand &Root) const;		selectDotIUVOP3PMods(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
		selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;

		InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;		selectVOP3OpSelMods(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
selectVINTERPMods(MachineOperand &Root) const;		selectVINTERPMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;		selectVINTERPModsHi(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
▲ Show 20 Lines • Show All 144 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Show First 20 Lines • Show All 3,728 Lines • ▼ Show 20 Lines	AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
if (Root.getImm() == -1)		if (Root.getImm() == -1)
Mods ^= SISrcMods::NEG;		Mods ^= SISrcMods::NEG;
return {{		return {{
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods		[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};		}};
}		}

// GlobalISel renderer for the WMMA op_sel source-modifier immediate.
// Root is asserted to be an immediate holding 0 or -1. The rendered value
// always contains SISrcMods::OP_SEL_1; SISrcMods::OP_SEL_0 is added when the
// immediate is non-zero. The single returned renderer adds that value as an
// immediate operand (src_mods).
InstructionSelector::ComplexRendererFns
		AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
		MachineOperand &Root) const {
		assert((Root.isImm() && (Root.getImm() == -1 \|\| Root.getImm() == 0)) &&
		arsenmUnsubmitted Not Done Reply Inline Actions No real point to the assert anymore arsenm: No real point to the assert anymore
		"expected i1 value");
		arsenmUnsubmitted Not Done Reply Inline Actions Probably should have a verifier check this, or just rely on 0/non-0 arsenm: Probably should have a verifier check this, or just rely on 0/non-0
		Joe_NashAuthorUnsubmitted Done Reply Inline Actions I don't know what to do here. The intrinsic has an i1 field. If you put a non-i1 value that will be reported right? Given that, we are asserting that other parts of ISel haven't transformed this value incorrectly. Also, we do the same thing in selectDotIUVOP3PMods, line 3726. Please let me know what could be done. Joe_Nash: I don't know what to do here. The intrinsic has an i1 field. If you put a non-i1 value that…
		arsenmUnsubmitted Not Done Reply Inline Actions I'd just invert the check below to != 0. The machine verifier is certainly not enforcing this be 0/-1 for booleans. Practically speaking, this would only come up for hand written MIR arsenm: I'd just invert the check below to != 0. The machine verifier is certainly not enforcing this…
		piotrUnsubmitted Not Done Reply Inline Actions I don't know what to do here. The intrinsic has an i1 field. If you put a non-i1 value that will be reported right? Given that, we are asserting that other parts of ISel haven't transformed this value incorrectly. Also, we do the same thing in selectDotIUVOP3PMods, line 3726. Please let me know what could be done. Indeed, this was strongly inspired by the existing code in selectDotIUVOP3PMods, which handles the intrinsic in a similar way. piotr: > I don't know what to do here. The intrinsic has an i1 field. If you put a non-i1 value that…
		unsigned Mods = SISrcMods::OP_SEL_1;
		if (Root.getImm() != 0)
		Mods \|= SISrcMods::OP_SEL_0;

		return {{
		[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
		}};
		}

		InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {		AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;		Register Src;
unsigned Mods;		unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);		std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
if (!isKnownNeverNaN(Src, *MRI))		if (!isKnownNeverNaN(Src, *MRI))
return None;		return None;

return {{		return {{
▲ Show 20 Lines • Show All 1,160 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 4,265 Lines • ▼ Show 20 Lines	case AMDGPU::G_INTRINSIC: {
case Intrinsic::amdgcn_udot4:		case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:		case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:		case Intrinsic::amdgcn_udot8:
case Intrinsic::amdgcn_fdot2_bf16_bf16:		case Intrinsic::amdgcn_fdot2_bf16_bf16:
case Intrinsic::amdgcn_fdot2_f16_f16:		case Intrinsic::amdgcn_fdot2_f16_f16:
case Intrinsic::amdgcn_fdot2_f32_bf16:		case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_sudot4:		case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:		case Intrinsic::amdgcn_sudot8:
		case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
		case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
		case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
		case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
		case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
		case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);		return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_sbfe:		case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:		case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))		if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);		return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);		return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_ds_swizzle:		case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:		case Intrinsic::amdgcn_ds_permute:
▲ Show 20 Lines • Show All 557 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Show First 20 Lines • Show All 569 Lines • ▼ Show 20 Lines	do {

Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);		Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
if (Res) break;		if (Res) break;

Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);		Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
if (Res) break;		if (Res) break;

Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);		Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
		if (Res)
		break;

		Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address);
} while (false);		} while (false);

if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi \|\|		if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi \|\|
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 \|\|		MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 \|\|
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 \|\|		MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 \|\|
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 \|\|		MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 \|\|
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 \|\|		MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 \|\|
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi \|\|		MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi \|\|
▲ Show 20 Lines • Show All 1,548 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	private:
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);		bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);		bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
bool fixVcmpxExecWARHazard(MachineInstr *MI);		bool fixVcmpxExecWARHazard(MachineInstr *MI);
bool fixLdsBranchVmemWARHazard(MachineInstr *MI);		bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
bool fixLdsDirectVALUHazard(MachineInstr *MI);		bool fixLdsDirectVALUHazard(MachineInstr *MI);
bool fixLdsDirectVMEMHazard(MachineInstr *MI);		bool fixLdsDirectVMEMHazard(MachineInstr *MI);
bool fixVALUPartialForwardingHazard(MachineInstr *MI);		bool fixVALUPartialForwardingHazard(MachineInstr *MI);
bool fixVALUTransUseHazard(MachineInstr *MI);		bool fixVALUTransUseHazard(MachineInstr *MI);
		// Inserts a V_NOP_e32 before MI when a WMMA hazard against an earlier WMMA
		// is detected; returns true if a NOP was inserted.
		bool fixWMMAHazards(MachineInstr *MI);

int checkMAIHazards(MachineInstr *MI);		int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);		int checkMAIHazards908(MachineInstr *MI);
int checkMAIHazards90A(MachineInstr *MI);		int checkMAIHazards90A(MachineInstr *MI);
/// Pad the latency between neighboring MFMA instructions with s_nops. The		/// Pad the latency between neighboring MFMA instructions with s_nops. The
/// percentage of wait states to fill with s_nops is specified by the command		/// percentage of wait states to fill with s_nops is specified by the command
/// line option '-amdgpu-mfma-padding-ratio'.		/// line option '-amdgpu-mfma-padding-ratio'.
///		///
Show All 34 Lines

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Show First 20 Lines • Show All 1,076 Lines • ▼ Show 20 Lines	void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVcmpxExecWARHazard(MI);		fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);		fixLdsBranchVmemWARHazard(MI);
if (ST.hasLdsDirect()) {		if (ST.hasLdsDirect()) {
fixLdsDirectVALUHazard(MI);		fixLdsDirectVALUHazard(MI);
fixLdsDirectVMEMHazard(MI);		fixLdsDirectVMEMHazard(MI);
}		}
fixVALUPartialForwardingHazard(MI);		fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);		fixVALUTransUseHazard(MI);
		fixWMMAHazards(MI);
}		}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {		bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
if (!ST.hasVcmpxPermlaneHazard() \|\| !isPermlane(*MI))		if (!ST.hasVcmpxPermlaneHazard() \|\| !isPermlane(*MI))
return false;		return false;

const SIInstrInfo *TII = ST.getInstrInfo();		const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();		const SIRegisterInfo *TRI = ST.getRegisterInfo();
▲ Show 20 Lines • Show All 575 Lines • ▼ Show 20 Lines	bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
// avoided (mask 0x0fff achieves this).		// avoided (mask 0x0fff achieves this).
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),		BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))		TII.get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(0x0fff);		.addImm(0x0fff);

return true;		return true;
}		}

		// Mitigates the hazard between dependent WMMA instructions.
		//
		// Does nothing (returns false) unless MI is a WMMA instruction. Otherwise
		// an earlier WMMA instruction I is treated as hazardous when its vdst
		// overlaps
		//   - src0 or src1 of MI, or
		//   - src2 of MI, unless I and MI map to the same MC opcode and src2 of MI
		//     has neither the NEG nor the NEG_HI modifier set.
		// When ::getWaitStatesSince reports std::numeric_limits<int>::max() nothing
		// is inserted; otherwise a V_NOP_e32 is built in front of MI.
		// NOTE(review): ::getWaitStatesSince is not visible here; it is assumed to
		// scan backwards from MI, stop once IsExpiredFn returns true (here: at a
		// VALU instruction) and return INT_MAX when no hazard was found - confirm.
		//
		// Returns true iff a V_NOP_e32 was inserted.
		bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
		if (!SIInstrInfo::isWMMA(*MI))
		return false;

		const SIInstrInfo *TII = ST.getInstrInfo();
		const SIRegisterInfo *TRI = ST.getRegisterInfo();

		// Predicate: does the earlier instruction I conflict with the current MI?
		auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
		if (!SIInstrInfo::isWMMA(I))
		return false;

		// Src0 or Src1 of the current wmma instruction overlaps with the dest of
		// the previous wmma.
		const Register CurSrc0Reg =
		TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
		const Register CurSrc1Reg =
		TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

		const Register PrevDstReg =
		TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

		if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) \|\|
		TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
		return true;
		}

		// Src2 of the current wmma instruction overlaps with the dest of the
		// previous wmma.
		const MachineOperand *Src2 =
		TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
		// src2 may be a non-register operand; an empty Register then disables the
		// overlap check below.
		const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();

		arsenmUnsubmitted Done Reply Inline Actions Can use Register() in place of NoRegister arsenm: Can use Register() in place of NoRegister
		if (CurSrc2Reg != AMDGPU::NoRegister &&
		TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {

		const MachineOperand *Src2Mods =
		TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
		const bool NoSrc2Mods =
		(Src2Mods->getImm() & (SISrcMods::NEG \| SISrcMods::NEG_HI)) == 0;
		// Exception: there is no hazard if the wmma instructions are of the same
		// type and there is no input modifier on src2 of the current instruction.
		return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
		TII->pseudoToMCOpcode(MI->getOpcode())));
		}

		return false;
		};

		// Expiry predicate handed to ::getWaitStatesSince: true for any VALU
		// instruction; the second (int) argument is ignored.
		auto IsExpiredFn = [](const MachineInstr &I, int) {
		return SIInstrInfo::isVALU(I);
		};

		if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
		std::numeric_limits<int>::max())
		return false;

		BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

		return true;
		}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {		int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
int NSAtoVMEMWaitStates = 1;		int NSAtoVMEMWaitStates = 1;

if (!ST.hasNSAtoVMEMBug())		if (!ST.hasNSAtoVMEMBug())
return 0;		return 0;

if (!SIInstrInfo::isMUBUF(MI) && !SIInstrInfo::isMTBUF(MI))		if (!SIInstrInfo::isMUBUF(MI) && !SIInstrInfo::isMTBUF(MI))
return 0;		return 0;
▲ Show 20 Lines • Show All 809 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIDefines.h

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	enum : uint64_t {

// FLAT instruction accesses FLAT_SCRATCH segment.		// FLAT instruction accesses FLAT_SCRATCH segment.
FlatScratch = UINT64_C(1) << 56,		FlatScratch = UINT64_C(1) << 56,

// Atomic without return.		// Atomic without return.
IsAtomicNoRet = UINT64_C(1) << 57,		IsAtomicNoRet = UINT64_C(1) << 57,

// Atomic with return.		// Atomic with return.
IsAtomicRet = UINT64_C(1) << 58		IsAtomicRet = UINT64_C(1) << 58,

		// Is a WMMA instruction.
		IsWMMA = UINT64_C(1) << 59,
};		};

// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.		// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
// The result is true if any of these tests are true.		// The result is true if any of these tests are true.
enum ClassFlags : unsigned {		enum ClassFlags : unsigned {
S_NAN = 1 << 0, // Signaling NaN		S_NAN = 1 << 0, // Signaling NaN
Q_NAN = 1 << 1, // Quiet NaN		Q_NAN = 1 << 1, // Quiet NaN
N_INFINITY = 1 << 2, // Negative infinity		N_INFINITY = 1 << 2, // Negative infinity
▲ Show 20 Lines • Show All 922 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrFormats.td

Show First 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	class InstSI <dag outs, dag ins, string asm = "",
field bit FlatScratch = 0;		field bit FlatScratch = 0;

// Atomic without a return.		// Atomic without a return.
field bit IsAtomicNoRet = 0;		field bit IsAtomicNoRet = 0;

// Atomic with return.		// Atomic with return.
field bit IsAtomicRet = 0;		field bit IsAtomicRet = 0;

		// This bit indicates that this is one of WMMA instructions.
		field bit IsWMMA = 0;

// These need to be kept in sync with the enum in SIInstrFlags.		// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;		let TSFlags{0} = SALU;
let TSFlags{1} = VALU;		let TSFlags{1} = VALU;

let TSFlags{2} = SOP1;		let TSFlags{2} = SOP1;
let TSFlags{3} = SOP2;		let TSFlags{3} = SOP2;
let TSFlags{4} = SOPC;		let TSFlags{4} = SOPC;
let TSFlags{5} = SOPK;		let TSFlags{5} = SOPK;
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{55} = IsDOT;		let TSFlags{55} = IsDOT;

let TSFlags{56} = FlatScratch;		let TSFlags{56} = FlatScratch;

let TSFlags{57} = IsAtomicNoRet;		let TSFlags{57} = IsAtomicNoRet;

let TSFlags{58} = IsAtomicRet;		let TSFlags{58} = IsAtomicRet;

		let TSFlags{59} = IsWMMA;

let SchedRW = [Write32Bit];		let SchedRW = [Write32Bit];

let AsmVariantName = AMDGPUAsmVariants.Default;		let AsmVariantName = AMDGPUAsmVariants.Default;

// Avoid changing source registers in a way that violates constant bus read limitations.		// Avoid changing source registers in a way that violates constant bus read limitations.
let hasExtraSrcRegAllocReq = !or(VOP1, VOP2, VOP3, VOPC, SDWA, VALU);		let hasExtraSrcRegAllocReq = !or(VOP1, VOP2, VOP3, VOPC, SDWA, VALU);
}		}

▲ Show 20 Lines • Show All 225 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Show First 20 Lines • Show All 667 Lines • ▼ Show 20 Lines	static bool isMFMA(const MachineInstr &MI) {
return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&		return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;		MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}		}

static bool isDOT(const MachineInstr &MI) {		static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;		return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}		}

		/// \returns true if the descriptor of \p MI has the SIInstrFlags::IsWMMA
		/// bit set in its TSFlags.
		static bool isWMMA(const MachineInstr &MI) {
		  return (MI.getDesc().TSFlags & SIInstrFlags::IsWMMA) != 0;
		}

		/// \returns true if the descriptor of opcode \p Opcode has the
		/// SIInstrFlags::IsWMMA bit set in its TSFlags.
		bool isWMMA(uint16_t Opcode) const {
		  return (get(Opcode).TSFlags & SIInstrFlags::IsWMMA) != 0;
		}

bool isDOT(uint16_t Opcode) const {		bool isDOT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;		return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}		}

static bool isLDSDIR(const MachineInstr &MI) {		static bool isLDSDIR(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR;		return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR;
}		}

▲ Show 20 Lines • Show All 646 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,249 Lines • ▼ Show 20 Lines	if (NewMFMAOpc != -1) {
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)		for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));		MIB.add(MI.getOperand(I));
updateLiveVariables(LV, MI, *MIB);		updateLiveVariables(LV, MI, *MIB);
if (LIS)		if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);		LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;		return MIB;
}		}

		if (SIInstrInfo::isWMMA(MI)) {
		unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
		MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
		.setMIFlags(MI.getFlags());
		for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
		arsenm (inline comment, done): Why skip implicit operands?
		MIB->addOperand(MI.getOperand(I));

		foad (inline comment, done): I think you could just raise the upper bound of the loop above to `E = MI.getNumOperands()` instead of adding this extra call?
		updateLiveVariables(LV, MI, *MIB);
		if (LIS)
		LIS->ReplaceMachineInstrInMaps(MI, *MIB);

		return MIB;
		}

// Handle MAC/FMAC.		// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 \|\| Opc == AMDGPU::V_MAC_F16_e64 \|\|		bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 \|\| Opc == AMDGPU::V_MAC_F16_e64 \|\|
Opc == AMDGPU::V_FMAC_F16_e32 \|\| Opc == AMDGPU::V_FMAC_F16_e64;		Opc == AMDGPU::V_FMAC_F16_e32 \|\| Opc == AMDGPU::V_FMAC_F16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 \|\| Opc == AMDGPU::V_FMAC_F32_e64 \|\|		bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 \|\| Opc == AMDGPU::V_FMAC_F32_e64 \|\|
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 \|\|		Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 \|\|
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 \|\|		Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 \|\|
Opc == AMDGPU::V_FMAC_F16_e32 \|\| Opc == AMDGPU::V_FMAC_F16_e64 \|\|		Opc == AMDGPU::V_FMAC_F16_e32 \|\| Opc == AMDGPU::V_FMAC_F16_e64 \|\|
Opc == AMDGPU::V_FMAC_F64_e32 \|\| Opc == AMDGPU::V_FMAC_F64_e64;		Opc == AMDGPU::V_FMAC_F64_e32 \|\| Opc == AMDGPU::V_FMAC_F64_e64;
▲ Show 20 Lines • Show All 5,198 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Show First 20 Lines • Show All 1,491 Lines • ▼ Show 20 Lines
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;		def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;

def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;		def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;

def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;		def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;

def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;		def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;		def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
		// Complex pattern yielding a single operand, selected by the C++ function
		// SelectWMMAOpSelVOP3PMods.
		def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;

def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;		def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;

def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;		def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;

def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;		def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;

def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;		def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
▲ Show 20 Lines • Show All 962 Lines • ▼ Show 20 Lines	class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;		field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA9 = HasExtSDWA;		field bit HasExtSDWA9 = HasExtSDWA;
field int NeedPatGen = PatGenMode.NoPattern;		field int NeedPatGen = PatGenMode.NoPattern;

field bit IsMAI = 0;		field bit IsMAI = 0;
field bit IsVOP3P = 0;		field bit IsVOP3P = 0;
field bit IsDOT = 0;		field bit IsDOT = 0;
field bit IsSingle = 0;		field bit IsSingle = 0;
		field bit IsWMMA = 0;

field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);		field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);		field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);		field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);

field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));		field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));

// VOP3b instructions are a special case with a second explicit		// VOP3b instructions are a special case with a second explicit
▲ Show 20 Lines • Show All 469 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

	Show First 20 Lines • Show All 1,070 Lines • ▼ Show 20 Lines

	// This is for operands with the enum(9), VSrc encoding restriction,			// This is for operands with the enum(9), VSrc encoding restriction,
	// but only allows VGPRs.			// but only allows VGPRs.
	def VRegSrc_32 : RegisterOperand<VGPR_32> {			def VRegSrc_32 : RegisterOperand<VGPR_32> {
	//let ParserMatchClass = RegImmMatcher<"VRegSrc32">;			//let ParserMatchClass = RegImmMatcher<"VRegSrc32">;
	let DecoderMethod = "DecodeVS_32RegisterClass";			let DecoderMethod = "DecodeVS_32RegisterClass";
	}			}

				// Register operand over the VReg_64 class, decoded with
				// decodeOperand_VReg_64.
				def VRegSrc_64 : RegisterOperand<VReg_64> {
				let DecoderMethod = "decodeOperand_VReg_64";
				}

				// Register operand over the VReg_128 class, decoded with
				// decodeOperand_VReg_128.
				def VRegSrc_128 : RegisterOperand<VReg_128> {
				let DecoderMethod = "decodeOperand_VReg_128";
				}

				// Register operand over the VReg_256 class, decoded with
				// decodeOperand_VReg_256.
				def VRegSrc_256 : RegisterOperand<VReg_256> {
				let DecoderMethod = "decodeOperand_VReg_256";
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// VGPRSrc_*			// VGPRSrc_*
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	// An 8-bit RegisterOperand wrapper for a VGPR			// An 8-bit RegisterOperand wrapper for a VGPR
	def VGPRSrc_32 : RegisterOperand<VGPR_32> {			def VGPRSrc_32 : RegisterOperand<VGPR_32> {
	let DecoderMethod = "DecodeVGPR_32RegisterClass";			let DecoderMethod = "DecodeVGPR_32RegisterClass";
	}			}
	▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Show First 20 Lines • Show All 362 Lines • ▼ Show 20 Lines
struct MIMGG16MappingInfo {		struct MIMGG16MappingInfo {
MIMGBaseOpcode G;		MIMGBaseOpcode G;
MIMGBaseOpcode G16;		MIMGBaseOpcode G16;
};		};

LLVM_READONLY		LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);		const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);

		// Row type of the TableGen-generated WMMA opcode mapping tables. Each row
		// pairs the two-address form of a WMMA pseudo with its three-address form.
		// NOTE(review): field order presumably has to match the table's `Fields`
		// list in VOP3PInstructions.td -- confirm before reordering.
		struct WMMAOpcodeMappingInfo {
		unsigned Opcode2Addr; // opcode of the pseudo whose $vdst is tied to $src2
		unsigned Opcode3Addr; // opcode of the pseudo whose $vdst is not tied
		};

LLVM_READONLY		LLVM_READONLY
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);		const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);

LLVM_READONLY		LLVM_READONLY
const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias);		const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias);

LLVM_READONLY		LLVM_READONLY
const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset);		const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset);
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
const MCSubtargetInfo &STI);		const MCSubtargetInfo &STI);
LLVM_READONLY		LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,		const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
const MCSubtargetInfo &STI);		const MCSubtargetInfo &STI);

LLVM_READONLY		LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);		int getMCOpcode(uint16_t Opcode, unsigned Gen);

		/// Returns the three-address WMMA opcode paired with the two-address opcode
		/// \p Opc, or ~0u when \p Opc has no entry in the WMMA mapping table.
		LLVM_READONLY
		unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);

		/// Returns the two-address WMMA opcode paired with the three-address opcode
		/// \p Opc, or ~0u when \p Opc has no entry in the WMMA mapping table.
		LLVM_READONLY
		unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);

void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,		void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const MCSubtargetInfo *STI);		const MCSubtargetInfo *STI);

amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(		amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
const MCSubtargetInfo *STI);		const MCSubtargetInfo *STI);

bool isGroupSegment(const GlobalValue *GV);		bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);		bool isGlobalSegment(const GlobalValue *GV);
▲ Show 20 Lines • Show All 606 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Show First 20 Lines • Show All 287 Lines • ▼ Show 20 Lines
#define GET_VOP2InfoTable_DECL		#define GET_VOP2InfoTable_DECL
#define GET_VOP2InfoTable_IMPL		#define GET_VOP2InfoTable_IMPL
#define GET_VOP3InfoTable_DECL		#define GET_VOP3InfoTable_DECL
#define GET_VOP3InfoTable_IMPL		#define GET_VOP3InfoTable_IMPL
#define GET_VOPC64DPPTable_DECL		#define GET_VOPC64DPPTable_DECL
#define GET_VOPC64DPPTable_IMPL		#define GET_VOPC64DPPTable_IMPL
#define GET_VOPC64DPP8Table_DECL		#define GET_VOPC64DPP8Table_DECL
#define GET_VOPC64DPP8Table_IMPL		#define GET_VOPC64DPP8Table_IMPL
		#define GET_WMMAOpcode2AddrMappingTable_DECL
		#define GET_WMMAOpcode2AddrMappingTable_IMPL
		#define GET_WMMAOpcode3AddrMappingTable_DECL
		#define GET_WMMAOpcode3AddrMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"		#include "AMDGPUGenSearchableTables.inc"

int getMTBUFBaseOpcode(unsigned Opc) {		int getMTBUFBaseOpcode(unsigned Opc) {
const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc);		const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc);
return Info ? Info->BaseOpcode : -1;		return Info ? Info->BaseOpcode : -1;
}		}

int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) {		int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) {
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	bool getMAIIsDGEMM(unsigned Opc) {
return Info ? Info->is_dgemm : false;		return Info ? Info->is_dgemm : false;
}		}

bool getMAIIsGFX940XDL(unsigned Opc) {		bool getMAIIsGFX940XDL(unsigned Opc) {
const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);		const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
return Info ? Info->is_gfx940_xdl : false;		return Info ? Info->is_gfx940_xdl : false;
}		}

		unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
		const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
		return Info ? Info->Opcode3Addr : ~0u;
		}

		unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
		const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc);
		return Info ? Info->Opcode2Addr : ~0u;
		}

// Wrapper for Tablegen'd function. enum Subtarget is not defined in any		// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned		// header files, so we need to wrap it in a function that takes unsigned
// instead.		// instead.
int getMCOpcode(uint16_t Opcode, unsigned Gen) {		int getMCOpcode(uint16_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));		return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}		}

namespace IsaInfo {		namespace IsaInfo {
▲ Show 20 Lines • Show All 1,980 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Show First 20 Lines • Show All 671 Lines • ▼ Show 20 Lines	let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;		defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;		defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;		defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1		} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1

def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;		def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;		def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

		// Operand profile for the WMMA pseudos, layered on VOP3P_Profile<P>.
		//   P          - base value-type profile.
		//   Suffix     - wave-size suffix. "_w32" selects the 256-bit destination
		//                and src2 operand classes; any other value selects the
		//                128-bit ones.
		//   _Src01RC64 - register operand used for both src0 and src1.
		//   _HasClamp  - forwarded to HasClamp.
		//   _HasOpSel  - forwarded to HasOpSel.
		// IsWMMA is always set so that pseudos built from this profile are tagged.
		class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {
		let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
		let Src0RC64 = _Src01RC64;
		let Src1RC64 = _Src01RC64;
		let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);
		let HasClamp = _HasClamp;
		let HasOpSel = _HasOpSel;
		let IsWMMA = 1;
		}

		// Value-type signatures for the WMMA instructions, given as
		// [dst, src0, src1, src2].
		// Profiles with an 8-element dst/src2 are used by the "_w32" variants.
		def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile <[v8f32, v8f32, v8f32, v8f32]>;
		def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile <[v8f32, v8i32, v8i32, v8f32]>;
		def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
		def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;
		def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile <[v8i32, v8i32, v8i32, v8i32]>;

		// Profiles with a 4-element dst/src2 are used by the "_w64" variants.
		def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile <[v4f32, v8f32, v8f32, v4f32]>;
		def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
		def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
		def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
		def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile <[v4i32, v8i32, v8i32, v4i32]>;

		// Describes which optional operands a WMMA variant carries.
		// Bit 0 of val enables clamp, bit 1 enables op_sel.
		class WMMAType <bits<2> val> {
		bit hasClamp = val{0};
		bit hasOpsel = val{1};
		}

		// The three variants in use; clamp and op_sel are never combined here.
		def WMMARegular : WMMAType<0b00>; // neither clamp nor op_sel
		def WMMAUIClamp : WMMAType<0b01>; // clamp only
		def WMMAOpSel : WMMAType<0b10>; // op_sel only

		// Selection pattern for WMMA variants without clamp or op_sel.
		//   Inst - pseudo to emit.
		//   node - intrinsic to match.
		//   P    - value-type profile supplying DstVT and Src0VT..Src2VT.
		// Each of the three sources is matched through VOP3PMods, which yields the
		// source value together with its i32 modifier operand. The emitted
		// instruction takes them as (modifiers, value) pairs in src0, src1, src2
		// order. All three modifier operands are referenced as i32 so the result
		// DAG is typed uniformly.
		class WMMARegularPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
		GCNPat < (P.DstVT (node
		(P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
		(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
		(P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))
		)),
		(P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2))
		>;

		// Selection pattern for WMMA variants that take an op_sel immediate.
		//   Inst - pseudo to emit.
		//   node - intrinsic to match.
		//   P    - value-type profile supplying DstVT and Src0VT..Src2VT.
		// The intrinsic's trailing immediate is matched by WMMAOpSelVOP3PMods and
		// becomes the src2 modifier operand. src0 and src1 get the constant
		// modifier value 8.
		// NOTE(review): 8 is presumably the default VOP3P modifier encoding
		// (op_sel_hi set) -- confirm against the SISrcMods definitions.
		class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
		GCNPat < (P.DstVT (node
		(P.Src0VT P.Src0VT:$src0),
		(P.Src1VT P.Src1VT:$src1),
		(P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers)
		)),
		(P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2))
		>;

		// Selection pattern for the integer WMMA variants, which take a sign
		// immediate ahead of each of A and B plus a trailing clamp immediate.
		//   Inst - pseudo to emit.
		//   node - intrinsic to match.
		//   P    - value-type profile supplying DstVT and Src0VT..Src2VT.
		// The two sign immediates are matched by DotIUVOP3PMods and become the
		// src0/src1 modifier operands. src2 gets the constant modifier value 8 and
		// the clamp immediate is passed through unchanged.
		// NOTE(review): 8 is presumably the default VOP3P modifier encoding
		// (op_sel_hi set) -- confirm against the SISrcMods definitions.
		class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
		GCNPat < (P.DstVT (node
		(DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
		(DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
		(P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
		)),
		(P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
		>;

		// One row of the WMMA opcode mapping: pairs the two-address pseudo with the
		// three-address pseudo of the same instruction.
		class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
		Instruction Opcode2Addr = TwoAddr;
		Instruction Opcode3Addr = ThreeAddr;
		// Left unset here. NOTE(review): presumably declared only so that the
		// `let WaveSizePredicate = ... in` wrapped around the WMMAInst
		// instantiations can also apply to these records -- confirm.
		Predicate WaveSizePredicate;
		}

		// Enum type used for the opcode columns of the mapping tables below. It is
		// filtered on the VOP3P_Pseudo class.
		def WMMAOpcode : GenericEnum {
		let FilterClass = "VOP3P_Pseudo";
		}

		// Common shape of the two WMMA lookup tables: one row per
		// WMMAOpcodeMapping record, emitted as the C++ type WMMAOpcodeMappingInfo
		// with the columns Opcode2Addr and Opcode3Addr. Subclasses only choose the
		// key column and the name of the generated lookup function.
		class WMMAMappingTable : GenericTable {
		let FilterClass = "WMMAOpcodeMapping";
		let CppTypeName = "WMMAOpcodeMappingInfo";
		let Fields = ["Opcode2Addr", "Opcode3Addr"];
		string TypeOf_Opcode2Addr = "WMMAOpcode";
		string TypeOf_Opcode3Addr = "WMMAOpcode";
		}

		// Table keyed by the two-address opcode; generates
		// getWMMAMappingInfoFrom2AddrOpcode().
		def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
		let PrimaryKey = ["Opcode2Addr"];
		let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode";
		}

		// Table keyed by the three-address opcode; generates
		// getWMMAMappingInfoFrom3AddrOpcode().
		def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
		let PrimaryKey = ["Opcode3Addr"];
		let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode";
		}

		// The WMMA instruction has extra constraints:
		// Matrices A and B cannot overlap with D. C cannot partially overlap with D,
		// but it is OK for them to be the same (which is a typical case).
		//
		// We implement it as follows:
		// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2).
		// 2) The pass twoaddressinstruction checks if src2 is live and if that is the case
		// it converts the default pseudo to the pseudo where src2 is not the same as vdst.
		// 3) @earlyclobber on the destination satisfies the constraint during RA.

		// Defines, for one WMMA instruction and one wave size, the two-address and
		// three-address pseudos, the record that maps one to the other, and the
		// selection pattern for the intrinsic.
		//   Suffix     - "_w32" or "_w64"; only these two values are handled.
		//   Instr      - assembly mnemonic shared by both pseudos.
		//   P          - value-type profile.
		//   node       - intrinsic matched by the selection pattern.
		//   _Src01RC64 - register operand used for src0 and src1.
		//   Type       - WMMARegular, WMMAUIClamp or WMMAOpSel; sets the clamp and
		//                op_sel flags of the profile and picks the pattern class.
		multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {

		// Both forms mark $vdst earlyclobber; only the two-address form also ties
		// $vdst to $src2.
		defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
		defvar WMMAConstraints3Addr = "@earlyclobber $vdst";

		defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
		// The "_w32" and "_w64" arms are identical apart from the suffix in the
		// record names.
		if !eq(Suffix, "_w32") then {
		let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
		let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
		def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
		}
		let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
		def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
		}
		}
		// Row for the two-address <-> three-address lookup tables.
		def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
		!cast<Instruction>(NAME # _threeaddr_w32)>;
		} else if !eq(Suffix, "_w64") then {
		let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
		let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
		def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
		}
		let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
		def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
		}
		}
		// Row for the two-address <-> three-address lookup tables.
		def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
		!cast<Instruction>(NAME # _threeaddr_w64)>;
		}

		// The intrinsic is always selected to the two-address pseudo; the pattern
		// class depends on which optional operands the variant carries.
		if !eq(Type, WMMAOpSel) then {
		def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
		} else if !eq(Type, WMMAUIClamp) then {
		def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
		} else {
		def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
		}
		}

		// Wave32 WMMA pseudos. Destination and src2 use the 8-element profiles;
		// the src0/src1 operand class is 256 bits wide except for iu8 (128) and
		// iu4 (64).
		let WaveSizePredicate = isWave32 in {
		defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
		defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
		defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
		defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
		defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
		defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
		}

		// Wave64 WMMA pseudos. Same instructions, intrinsics and src0/src1 operand
		// classes as the wave32 group, but destination and src2 use the 4-element
		// profiles.
		let WaveSizePredicate = isWave64 in {
		defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
		defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
		defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
		defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
		defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
		defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Begin Real Encodings		// Begin Real Encodings
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,		class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
string opName = ps.OpName>		string opName = ps.OpName>
: VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> {		: VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> {
let hasSideEffects = ps.hasSideEffects;		let hasSideEffects = ps.hasSideEffects;
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME,
VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,		VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,
VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;		VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;
} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"		} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"

defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;		defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;
defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;		defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;		defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;

		// Real GFX11 encodings for one WMMA instruction, given its 7-bit opcode.
		// Only the two-address pseudos get a real instruction, one per wave size.
		// The wave32 variant goes in the "GFX11" decoder namespace and the wave64
		// variant in "WMMAGFX11".
		// NOTE(review): the separate namespace is presumably needed because both
		// variants share the same opcode and would otherwise collide in one
		// decoder table -- confirm.
		multiclass VOP3P_Real_WMMA <bits<7> op> {
		let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in {
		defm _twoaddr_w32 : VOP3P_Real_gfx11 <op>;
		}
		let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in {
		defm _twoaddr_w64 : VOP3P_Real_gfx11 <op>;
		}
		}

		// Opcode assignments for the six WMMA instructions (0x40 through 0x45).
		defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>;
		defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>;
		defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>;
		defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>;
		defm V_WMMA_I32_16X16X16_IU8 : VOP3P_Real_WMMA <0x044>;
		defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GFX8 (VI)		// GFX8 (VI)
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

multiclass VOP3P_Real_vi<bits<7> op> {		multiclass VOP3P_Real_vi<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,		def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {		VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicate = HasVOP3PInsts;		let AssemblerPredicate = HasVOP3PInsts;
▲ Show 20 Lines • Show All 276 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/VOPInstructions.td

Show First 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	VOP_Pseudo <opName, "_e64", P, P.Outs64,
!if(isVop3OpSel,		!if(isVop3OpSel,
P.InsVOP3OpSel,		P.InsVOP3OpSel,
!if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),		!if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
"", pattern> {		"", pattern> {

let VOP3_OPSEL = isVop3OpSel;		let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;		let IsPacked = P.IsPacked;
let IsMAI = P.IsMAI;		let IsMAI = P.IsMAI;
		let IsWMMA = P.IsWMMA;

let AsmOperands = !if(isVop3OpSel,		let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,		P.AsmVOP3OpSel,
!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));		!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));

let Size = 8;		let Size = 8;
let mayLoad = 0;		let mayLoad = 0;
let mayStore = 0;		let mayStore = 0;
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines	class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
let TRANS = ps.TRANS;		let TRANS = ps.TRANS;

VOPProfile Pfl = ps.Pfl;		VOPProfile Pfl = ps.Pfl;
}		}

// XXX - Is there any reason to distinguish this from regular VOP3		// XXX - Is there any reason to distinguish this from regular VOP3
// here?		// here?
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :		class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
VOP3_Real<ps, EncodingFamily, asm_name>;		VOP3_Real<ps, EncodingFamily, asm_name> {

		// The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction.
		let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints);
		}

class VOP3a<VOPProfile P> : Enc64 {		class VOP3a<VOPProfile P> : Enc64 {
bits<4> src0_modifiers;		bits<4> src0_modifiers;
bits<9> src0;		bits<9> src0;
bits<3> src1_modifiers;		bits<3> src1_modifiers;
bits<9> src1;		bits<9> src1;
bits<3> src2_modifiers;		bits<3> src2_modifiers;
bits<9> src2;		bits<9> src2;
▲ Show 20 Lines • Show All 1,258 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32

				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f32_16x16x16_f16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f32_16x16x16_bf16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B unsigned, clamp off: the i1 immediates are (1, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,0,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B signed, clamp off: the i1 immediates are (1, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}


				; iu4 with A unsigned and B unsigned, clamp on: the i1 immediates are (0, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with only the clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A unsigned and B signed, clamp on: the i1 immediates are (0, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B unsigned, clamp on: the i1 immediates are (1, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B signed, clamp on: the i1 immediates are (1, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

				; GlobalISel code generation checks for the WMMA intrinsics on gfx1100 with
				; wavefrontsize64 enabled and wavefrontsize32 disabled (see the llc invocation above).
				;
				; Declarations of the intrinsics exercised below. In this file the accumulator
				; operand and the result are 4-element vectors; every i1 operand is an immediate
				; (immarg).
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				; Plain llvm.amdgcn.wmma.f32.16x16x16.f16 call (no immediate operands); the <4 x float> result
				; is stored to %out. The checks expect one v_wmma_f32_16x16x16_f16 without modifiers.
				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f32_16x16x16_f16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				; Plain llvm.amdgcn.wmma.f32.16x16x16.bf16 call with <8 x i32> A/B inputs and a <4 x float>
				; accumulator; the checks expect one v_wmma_f32_16x16x16_bf16 without modifiers.
				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f32_16x16x16_bf16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				; f16 variant with the trailing i1 immediate set to 0 ("lo" case): the checks expect
				; v_wmma_f16_16x16x16_f16 without an op_sel modifier.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; f16 variant with the trailing i1 immediate set to 1 ("hi" case): the checks expect
				; v_wmma_f16_16x16x16_f16 with op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				; bf16 variant with the trailing i1 immediate set to 0 ("lo" case): the checks expect
				; v_wmma_bf16_16x16x16_bf16 without an op_sel modifier.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; bf16 variant with the trailing i1 immediate set to 1 ("hi" case): the checks expect
				; v_wmma_bf16_16x16x16_bf16 with op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				; iu8 with A unsigned and B unsigned, clamp off: the i1 immediates are (0, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}


				; iu8 with A unsigned and B signed, clamp off: the i1 immediates are (0, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[0,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A signed and B unsigned, clamp off: the i1 immediates are (1, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,0,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A signed and B signed, clamp off: the i1 immediates are (1, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A unsigned and B unsigned, clamp on: the i1 immediates are (0, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with only the clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A unsigned and B signed, clamp on: the i1 immediates are (0, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A signed and B unsigned, clamp on: the i1 immediates are (1, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu8 with A signed and B signed, clamp on: the i1 immediates are (1, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				; iu4 with A unsigned and B unsigned, clamp off: the i1 immediates are (0, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A unsigned and B signed, clamp off: the i1 immediates are (0, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[0,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A signed and B unsigned, clamp off: the i1 immediates are (1, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,0,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A signed and B signed, clamp off: the i1 immediates are (1, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A unsigned and B unsigned, clamp on: the i1 immediates are (0, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with only the clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A unsigned and B signed, clamp on: the i1 immediates are (0, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A signed and B unsigned, clamp on: the i1 immediates are (1, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; iu4 with A signed and B signed, clamp on: the i1 immediates are (1, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32

				; Code generation checks for the WMMA intrinsics on gfx1100 with wavefrontsize32
				; enabled and wavefrontsize64 disabled (see the llc invocation above).
				;
				; Declarations of the intrinsics exercised below. In this file the accumulator
				; operand and the result are 8-element vectors; every i1 operand is an immediate
				; (immarg).
				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <8 x float>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)

				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				; Plain llvm.amdgcn.wmma.f32.16x16x16.f16 call (no immediate operands); the <8 x float> result
				; is stored to %out. The checks expect one v_wmma_f32_16x16x16_f16 without modifiers, followed
				; by two 128-bit stores.
				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f32_16x16x16_f16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				; Plain llvm.amdgcn.wmma.f32.16x16x16.bf16 call with <8 x i32> A/B inputs and an <8 x float>
				; accumulator; the checks expect one v_wmma_f32_16x16x16_bf16 without modifiers.
				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f32_16x16x16_bf16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				; f16 variant with the trailing i1 immediate set to 0 ("lo" case): the checks expect
				; v_wmma_f16_16x16x16_f16 without an op_sel modifier.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; f16 variant with the trailing i1 immediate set to 1 ("hi" case): the checks expect
				; v_wmma_f16_16x16x16_f16 with op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				; bf16 variant with the trailing i1 immediate set to 0 ("lo" case): the checks expect
				; v_wmma_bf16_16x16x16_bf16 without an op_sel modifier.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; bf16 variant with the trailing i1 immediate set to 1 ("hi" case): the checks expect
				; v_wmma_bf16_16x16x16_bf16 with op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				; iu8 with A unsigned and B unsigned, clamp off: the i1 immediates are (0, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A unsigned and B signed, clamp off: the i1 immediates are (0, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[0,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A signed and B unsigned, clamp off: the i1 immediates are (1, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,0,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A signed and B signed, clamp off: the i1 immediates are (1, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A unsigned and B unsigned, clamp on: the i1 immediates are (0, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with only the clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A unsigned and B signed, clamp on: the i1 immediates are (0, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A signed and B unsigned, clamp on: the i1 immediates are (1, 0, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu8 with A signed and B signed, clamp on: the i1 immediates are (1, 1, 1) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu8 with neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				; iu4 with A unsigned and B unsigned, clamp off: the i1 immediates are (0, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A unsigned and B signed, clamp off: the i1 immediates are (0, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[0,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B unsigned, clamp off: the i1 immediates are (1, 0, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,0,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; iu4 with A signed and B signed, clamp off: the i1 immediates are (1, 1, 0) in call order,
				; and the checks expect v_wmma_i32_16x16x16_iu4 with neg_lo:[1,1,0].
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}


				; IU4 WMMA, <8 x i32> accumulator: A_sign=0, B_sign=0, clamp=1.
				; The expected instruction carries only the clamp modifier (no neg_lo); the
				; result is written back with two 128-bit stores.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; IU4 WMMA, <8 x i32> accumulator: A_sign=0, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[0,1,0] and clamp; the result is
				; written back with two 128-bit stores.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; IU4 WMMA, <8 x i32> accumulator: A_sign=1, B_sign=0, clamp=1.
				; The expected instruction carries neg_lo:[1,0,0] and clamp; the result is
				; written back with two 128-bit stores.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

				; IU4 WMMA, <8 x i32> accumulator: A_sign=1, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[1,1,0] and clamp; the result is
				; written back with two 128-bit stores.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				ret void
				}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

				; Declarations of the WMMA intrinsics called by the tests in this file.
				; Every accumulator/result type here is a 4-element vector, and every i1
				; operand is marked immarg.
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				; Plain call of llvm.amdgcn.wmma.f32.16x16x16.f16 with a <4 x float>
				; accumulator. The expected instruction names v[16:19] as both destination
				; and third source, and the result goes out in one 128-bit store.
				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f32_16x16x16_f16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				; Plain call of llvm.amdgcn.wmma.f32.16x16x16.bf16: <8 x i32> A/B inputs and
				; a <4 x float> accumulator. The expected instruction names v[16:19] as both
				; destination and third source, followed by one 128-bit store.
				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f32_16x16x16_bf16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				; llvm.amdgcn.wmma.f16.16x16x16.f16 with the trailing i1 (%high) set to 0.
				; The expected instruction has no op_sel modifier.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; llvm.amdgcn.wmma.f16.16x16x16.f16 with the trailing i1 (%high) set to 1.
				; The expected instruction carries op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				; llvm.amdgcn.wmma.bf16.16x16x16.bf16 with the trailing i1 (%high) set to 0.
				; The expected instruction has no op_sel modifier.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; llvm.amdgcn.wmma.bf16.16x16x16.bf16 with the trailing i1 (%high) set to 1.
				; The expected instruction carries op_sel:[0,0,1].
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				; IU8 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=0, clamp=0.
				; The expected instruction has no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}


				; IU8 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=1, clamp=0.
				; The expected instruction carries neg_lo:[0,1,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=0, clamp=0.
				; The expected instruction carries neg_lo:[1,0,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=1, clamp=0.
				; The expected instruction carries neg_lo:[1,1,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=0, clamp=1.
				; The expected instruction carries only the clamp modifier (no neg_lo).
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=0, clamp=1.
				; The expected instruction carries neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU8 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				; IU4 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=0, clamp=0.
				; The expected instruction has no neg_lo or clamp modifier.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=1, clamp=0.
				; The expected instruction carries neg_lo:[0,1,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=0, clamp=0.
				; The expected instruction carries neg_lo:[1,0,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=1, clamp=0.
				; The expected instruction carries neg_lo:[1,1,0] and no clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=0, clamp=1.
				; The expected instruction carries only the clamp modifier (no neg_lo).
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=0, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[0,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=0, clamp=1.
				; The expected instruction carries neg_lo:[1,0,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

				; IU4 WMMA, <4 x i32> accumulator: A_sign=1, B_sign=1, clamp=1.
				; The expected instruction carries neg_lo:[1,1,0] and clamp.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				ret void
				}

llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir

This file was added.

				# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -start-after postrapseudos -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

				# The _w32 threeaddr pseudo writes v[34:41], which is distinct from its
				# third source v[16:23]; it is expected to print as v_wmma_f32_16x16x16_f16.
				# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32:
				# GCN: v_wmma_f32_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23]
				---
				name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				---

				# The _w32 threeaddr pseudo writes v[34:41], which is distinct from its
				# third source v[16:23]; it is expected to print as v_wmma_f32_16x16x16_bf16.
				# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32:
				# GCN: v_wmma_f32_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23]
				---
				name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				---

				# The _w32 threeaddr pseudo writes v[34:41], which is distinct from its
				# third source v[16:23]; it is expected to print as v_wmma_f16_16x16x16_f16.
				# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32:
				# GCN: v_wmma_f16_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23]
				---
				name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
				---

				# The _w32 threeaddr pseudo writes v[34:41], which is distinct from its
				# third source v[16:23]; it is expected to print as v_wmma_bf16_16x16x16_bf16.
				# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32:
				# GCN: v_wmma_bf16_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23]
				---
				name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
				---

				# The _w32 threeaddr pseudo writes v[26:33], which is distinct from its
				# third source v[8:15]; it is expected to print as v_wmma_i32_16x16x16_iu8.
				# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32:
				# GCN: v_wmma_i32_16x16x16_iu8 v[26:33], v[0:3], v[4:7], v[8:15]
				---
				name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec
				---


				# The _w32 threeaddr pseudo writes v[26:33], which is distinct from its
				# third source v[8:15]; it is expected to print as v_wmma_i32_16x16x16_iu4.
				# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32:
				# GCN: v_wmma_i32_16x16x16_iu4 v[26:33], v[0:1], v[2:3], v[8:15]
				---
				name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec
				---


				# The _w64 threeaddr pseudo writes v[34:37], which is distinct from its
				# third source v[16:19]; it is expected to print as v_wmma_f32_16x16x16_f16.
				# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64:
				# GCN: v_wmma_f32_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19]
				---
				name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec
				---

				# The _w64 threeaddr pseudo writes v[34:37], which is distinct from its
				# third source v[16:19]; it is expected to print as v_wmma_f32_16x16x16_bf16.
				# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64:
				# GCN: v_wmma_f32_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19]
				---
				name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec
				---

				# The _w64 threeaddr pseudo writes v[34:37], which is distinct from its
				# third source v[16:19]; it is expected to print as v_wmma_f16_16x16x16_f16.
				# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64:
				# GCN: v_wmma_f16_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19]
				---
				name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, implicit $exec
				---

				# The _w64 threeaddr pseudo writes v[34:37], which is distinct from its
				# third source v[16:19]; it is expected to print as v_wmma_bf16_16x16x16_bf16.
				# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64:
				# GCN: v_wmma_bf16_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19]
				---
				name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, implicit $exec
				---

				# The _w64 threeaddr pseudo writes v[26:29], which is distinct from its
				# third source v[8:11]; it is expected to print as v_wmma_i32_16x16x16_iu8.
				# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64:
				# GCN: v_wmma_i32_16x16x16_iu8 v[26:29], v[0:3], v[4:7], v[8:11]
				---
				name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec
				---


				# The _w64 threeaddr pseudo writes v[26:29], which is distinct from its
				# third source v[8:11]; it is expected to print as v_wmma_i32_16x16x16_iu4.
				# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64:
				# GCN: v_wmma_i32_16x16x16_iu4 v[26:29], v[0:1], v[2:3], v[8:11]
				---
				name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64
				tracksRegLiveness: true
				body: |
				  bb.0:
				    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
				    early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec

llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir

This file was added.

				# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_F32_16X16X16_F16_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_256 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec

				...

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_F32_16X16X16_BF16_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_256 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec

				...

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_F16_16X16X16_F16_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_256 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec

				...

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_256 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec

				...

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_I32_16X16X16_IU8_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_128 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec

				...

				# The _twoaddr_w32 pseudo is expected to come out of the pass as
				# V_WMMA_I32_16X16X16_IU4_threeaddr_w32 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
				# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
				registers:
				  - { id: 0, class: vreg_256 }
				  - { id: 1, class: vreg_64 }
				  - { id: 2, class: vreg_256 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec

				...

				# The _twoaddr_w64 pseudo is expected to come out of the pass as
				# V_WMMA_F32_16X16X16_F16_threeaddr_w64 with the same operand list.
				# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
				registers:
				  - { id: 0, class: vreg_128 }
				  - { id: 1, class: vreg_256 }
				  - { id: 2, class: vreg_128 }
				body: |
				  bb.0:

				    %0 = IMPLICIT_DEF
				    %1 = IMPLICIT_DEF
				    early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec

				...

				# Two-address to three-address conversion check for the wave64 F32 <- BF16 WMMA
				# pseudo. Input: %1 (vreg_256) is used for both the first and second source
				# operands, %0 (vreg_128) is the third source, and %2 (vreg_128) is the
				# early-clobber result. The expected line keeps the operand list and only
				# switches the opcode from the _twoaddr_ to the _threeaddr_ variant.
				# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
				registers:
				- { id: 0, class: vreg_128 }
				- { id: 1, class: vreg_256 }
				- { id: 2, class: vreg_128 }
				body: \|
				bb.0:

				%0 = IMPLICIT_DEF
				%1 = IMPLICIT_DEF
				early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec

				...

				# Two-address to three-address conversion check for the wave64 F16 <- F16 WMMA
				# pseudo. Input: %1 (vreg_256) is used for both the first and second source
				# operands, %0 (vreg_128) is the third source, and %2 (vreg_128) is the
				# early-clobber result. This variant carries four trailing immediates. The
				# expected line keeps the operand list and only switches the opcode from the
				# _twoaddr_ to the _threeaddr_ variant.
				# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
				registers:
				- { id: 0, class: vreg_128 }
				- { id: 1, class: vreg_256 }
				- { id: 2, class: vreg_128 }
				body: \|
				bb.0:

				%0 = IMPLICIT_DEF
				%1 = IMPLICIT_DEF
				early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec

				...

				# Two-address to three-address conversion check for the wave64 BF16 <- BF16
				# WMMA pseudo. Input: %1 (vreg_256) is used for both the first and second
				# source operands, %0 (vreg_128) is the third source, and %2 (vreg_128) is the
				# early-clobber result. This variant carries four trailing immediates. The
				# expected line keeps the operand list and only switches the opcode from the
				# _twoaddr_ to the _threeaddr_ variant.
				# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
				registers:
				- { id: 0, class: vreg_128 }
				- { id: 1, class: vreg_256 }
				- { id: 2, class: vreg_128 }
				body: \|
				bb.0:

				%0 = IMPLICIT_DEF
				%1 = IMPLICIT_DEF
				early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec

				...

				# Two-address to three-address conversion check for the wave64 IU8 WMMA pseudo.
				# Input: %1 (vreg_128) is used for both the first and second source operands,
				# %0 (vreg_128) is the third source, and %2 (vreg_128) is the early-clobber
				# result. The expected line keeps the operand list and only switches the
				# opcode from the _twoaddr_ to the _threeaddr_ variant.
				# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
				registers:
				- { id: 0, class: vreg_128 }
				- { id: 1, class: vreg_128 }
				- { id: 2, class: vreg_128 }
				body: \|
				bb.0:

				%0 = IMPLICIT_DEF
				%1 = IMPLICIT_DEF
				early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec

				...

				# Two-address to three-address conversion check for the wave64 IU4 WMMA pseudo.
				# Input: %1 (vreg_64) is used for both the first and second source operands,
				# %0 (vreg_128) is the third source, and %2 (vreg_128) is the early-clobber
				# result. The expected line keeps the operand list and only switches the
				# opcode from the _twoaddr_ to the _threeaddr_ variant.
				# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
				# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec

				---
				name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
				registers:
				- { id: 0, class: vreg_128 }
				- { id: 1, class: vreg_64 }
				- { id: 2, class: vreg_128 }
				body: \|
				bb.0:

				%0 = IMPLICIT_DEF
				%1 = IMPLICIT_DEF
				early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec

				...

llvm/test/CodeGen/AMDGPU/wmma-hazards.mir

This file was added.

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
				# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s

				---
				# Two adjacent WMMA instructions where the first one writes $vgpr16..$vgpr23
				# and the second one reads that same range as its first source operand.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_A
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_A
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				...
				---

				---
				# Two adjacent WMMA instructions where the first one writes $vgpr16..$vgpr23
				# and the second one reads that same range as its second source operand.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_B
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_B
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				...
				---

				---
				# The first WMMA writes $vgpr16..$vgpr23 and the second WMMA reads that range
				# as its first source operand, but a V_MOV_B32_e32 to $vgpr40 sits between
				# them. The GCN check lines expect the sequence to come out unchanged, with
				# no V_NOP_e32 added.
				name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				$vgpr40 = V_MOV_B32_e32 0, implicit $exec
				early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				...
				---

				---
				# The first WMMA writes $vgpr16..$vgpr23 and the second WMMA reads that range
				# as its second source operand, but a V_MOV_B32_e32 to $vgpr40 sits between
				# them. The GCN check lines expect the sequence to come out unchanged, with
				# no V_NOP_e32 added.
				# NOTE(review): the name starts with "value_" while the sibling tests use
				# "valu_" -- this looks like a typo; if renamed, update the label check too.
				name: value_inbetween_WMMA1_D_overlaps_WMMA2_B
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: value_inbetween_WMMA1_D_overlaps_WMMA2_B
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				$vgpr40 = V_MOV_B32_e32 0, implicit $exec
				early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
				...
				---

				---
				# Two adjacent WMMA instructions with different opcodes (F16 then BF16 source
				# variants). The first writes $vgpr16..$vgpr23; the second uses that range as
				# both its third source operand and its destination.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---

				---
				# The first WMMA writes $vgpr16..$vgpr23 and the second WMMA (a different
				# opcode) uses that range as its third source operand and destination, but a
				# V_MOV_B32_e32 to $vgpr40 sits between them. The GCN check lines expect the
				# sequence to come out unchanged, with no V_NOP_e32 added.
				name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				$vgpr40 = V_MOV_B32_e32 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---

				---
				# Two adjacent WMMA instructions with the same opcode
				# (V_WMMA_F32_16X16X16_F16_twoaddr_w32). The first writes $vgpr16..$vgpr23;
				# the second uses that range as its third source operand and destination,
				# and the immediate in front of that operand is 8, as for the other sources.
				# The GCN check lines expect the pair to stay adjacent, with no V_NOP_e32.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---

				---
				# The first WMMA is the _twoaddr_ F16 pseudo writing $vgpr16..$vgpr23; the
				# second is the _threeaddr_ F16 pseudo, which reads that range as its third
				# source operand but writes a separate destination ($vgpr40..$vgpr47).
				# The GCN check lines expect the pair to stay adjacent, with no V_NOP_e32.
				# NOTE(review): per the test name, both pseudos presumably map to the same
				# real instruction -- confirm against the instruction definitions.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---

				---
				# Two adjacent WMMA instructions with the same opcode
				# (V_WMMA_F32_16X16X16_F16_twoaddr_w32). The first writes $vgpr16..$vgpr23;
				# the second uses that range as its third source operand and destination,
				# but the immediate in front of that operand is 11 rather than 8.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				# NOTE(review): "imod" in the name presumably refers to that immediate being
				# an input-modifier operand -- confirm against the instruction definition.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---


				---
				# Two adjacent WMMA instructions with different opcodes (F16 then BF16 source
				# variants). The first writes $vgpr16..$vgpr23; the second uses that range as
				# its third source operand and destination, with the immediate in front of
				# that operand being 8, as for the other sources.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				# NOTE(review): the two input instructions are identical to those of
				# back_to_back_WMMA1_D_overlaps_WMMA2_C in this file.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---


				---
				# Two adjacent WMMA instructions with different opcodes (F16 then BF16 source
				# variants). The first writes $vgpr16..$vgpr23; the second uses that range as
				# its third source operand and destination, and the immediate in front of
				# that operand is 11 rather than 8.
				# The GCN check lines expect a V_NOP_e32 to be placed between the two.
				# NOTE(review): "imod" in the name presumably refers to that immediate being
				# an input-modifier operand -- confirm against the instruction definition.
				name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod
				body: \|
				bb.0:
				liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
				; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod
				; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				; GCN-NEXT: V_NOP_e32 implicit $exec
				; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
				...
				---

llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32

				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
				declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
				declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

				; The tests demonstrate that the following WMMA register constraints are satisfied.
				;
				; v_wmma D, A, B, C
				; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
				;
				; In each test,
				; - first wmma instruction: the dest register D is different from all the sources
				; - second wmma instruction: the dest register D and src2 (C) are the same


				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				; Calls the f32 <- f16 WMMA intrinsic twice and stores both results:
				;   %res  uses (%A, %B, %C) and is stored to %out
				;   %res2 uses (%B, %B, %C) and is stored to %out2
				; In the expected assembly the first WMMA writes v[28:35], which is none of
				; its source ranges, while the second writes v[16:23], the same range as its
				; C operand.
				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_f32_16x16x16_f16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
				%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				; Calls the f32 <- bf16 WMMA intrinsic twice and stores both results:
				;   %res  uses (%A, %B, %C) and is stored to %out
				;   %res2 uses (%B, %B, %C) and is stored to %out2
				; In the expected assembly the first WMMA writes v[28:35], which is none of
				; its source ranges, while the second writes v[16:23], the same range as its
				; C operand.
				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_f32_16x16x16_bf16:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
				%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				; Calls the f16 <- f16 WMMA intrinsic twice with the trailing i1 immediate set
				; to 0 and stores both results:
				;   %res  uses (%A, %B, %C) and is stored to %out
				;   %res2 uses (%B, %B, %C) and is stored to %out2
				; In the expected assembly neither instruction carries an op_sel suffix. The
				; first WMMA writes v[28:35], which is none of its source ranges, while the
				; second writes v[16:23], the same range as its C operand.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
				%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
				ret void
				}

				; Calls the f16 <- f16 WMMA intrinsic twice with the trailing i1 immediate set
				; to 1 and stores both results:
				;   %res  uses (%A, %B, %C) and is stored to %out
				;   %res2 uses (%B, %B, %C) and is stored to %out2
				; In the expected assembly both instructions carry op_sel:[0,0,1]. The first
				; WMMA writes v[28:35], which is none of its source ranges, while the second
				; writes v[16:23], the same range as its C operand.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
				%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1)
				store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
				store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				; Calls the bf16 <- bf16 WMMA intrinsic twice with the trailing i1 immediate
				; set to 0 and stores both results:
				;   %res  uses (%A, %B, %C) and is stored to %out
				;   %res2 uses (%B, %B, %C) and is stored to %out2
				; In the expected assembly neither instruction carries an op_sel suffix. The
				; first WMMA writes v[28:35], which is none of its source ranges, while the
				; second writes v[16:23], the same range as its C operand.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; bf16 WMMA with the trailing i1 immediate set to 1; both instructions are
				; expected to carry op_sel:[0,0,1].
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
				; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				; iu8 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 0; no neg_lo or clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediate before %A set to 0, the one before %B set
				; to 1, and the trailing i1 immediate 0; neg_lo:[0,1,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediate before %A set to 1, the one before %B set
				; to 0, and the trailing i1 immediate 0; neg_lo:[1,0,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediates before %A and before %B both 1 and the
				; trailing i1 immediate 0; neg_lo:[1,1,0] is expected on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 1; only the clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediate before %A set to 0, the one before %B set
				; to 1, and the trailing i1 immediate 1; neg_lo:[0,1,0] and clamp are expected
				; on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediate before %A set to 1, the one before %B set
				; to 0, and the trailing i1 immediate 1; neg_lo:[1,0,0] and clamp are expected
				; on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu8 WMMA with the i1 immediates before %A and before %B both 1 and the
				; trailing i1 immediate 1; neg_lo:[1,1,0] and clamp are expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
				; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
				; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				; iu4 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 0; no neg_lo or clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediate before %A set to 0, the one before %B set
				; to 1, and the trailing i1 immediate 0; neg_lo:[0,1,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediate before %A set to 1, the one before %B set
				; to 0, and the trailing i1 immediate 0; neg_lo:[1,0,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediates before %A and before %B both 1 and the
				; trailing i1 immediate 0; neg_lo:[1,1,0] is expected on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0]
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}


				; iu4 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 1; only the clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediate before %A set to 0, the one before %B set
				; to 1, and the trailing i1 immediate 1; neg_lo:[0,1,0] and clamp are expected
				; on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediate before %A set to 1, the one before %B set
				; to 0, and the trailing i1 immediate 1; neg_lo:[1,0,0] and clamp are expected
				; on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

				; iu4 WMMA with the i1 immediates before %A and before %B both 1 and the
				; trailing i1 immediate 1; neg_lo:[1,1,0] and clamp are expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
				; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W32: ; %bb.0: ; %bb
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
				; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
				; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W32-NEXT: s_clause 0x1
				; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
				; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
				; W32-NEXT: s_endpgm
				bb:
				%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				%res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
				store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
				store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
				ret void
				}

llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

				; Declarations of the WMMA intrinsics called by the tests in this file.
				; Here the accumulator operand and the result are 4-element vectors; the
				; i1 operands are all marked immarg, so callers must pass constants.
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
				declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
				declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

				; The tests demonstrate that the following WMMA register constraints are satisfied.
				;
				; v_wmma D, A, B, C
				; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
				;
				; In each test,
				; - first wmma instruction: the dest register D is different from all the sources
				; - second wmma instruction: the dest register D and src2 (C) are the same


				; @llvm.amdgcn.wmma.f32.16x16x16.f16

				; f32 <- f16 WMMA, which takes no immediate operands.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_f32_16x16x16_f16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
				%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f32.16x16x16.bf16

				; f32 <- bf16 WMMA, which takes no immediate operands; A and B are passed
				; as <8 x i32>.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_f32_16x16x16_bf16:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
				%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.f16.16x16x16.f16

				; f16 WMMA with the trailing i1 immediate set to 0; no op_sel modifier is
				; expected on either instruction.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
				%res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
				ret void
				}

				; f16 WMMA with the trailing i1 immediate set to 1; both instructions are
				; expected to carry op_sel:[0,0,1].
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
				%res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1)
				store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
				store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

				; bf16 WMMA with the trailing i1 immediate set to 0; no op_sel modifier is
				; expected on either instruction.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; bf16 WMMA with the trailing i1 immediate set to 1; both instructions are
				; expected to carry op_sel:[0,0,1].
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
				; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
				; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu8

				; iu8 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 0; no neg_lo or clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}


				; iu8 WMMA with the i1 immediate before %A set to 0, the one before %B set
				; to 1, and the trailing i1 immediate 0; neg_lo:[0,1,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with the i1 immediate before %A set to 1, the one before %B set
				; to 0, and the trailing i1 immediate 0; neg_lo:[1,0,0] is expected on both
				; instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with the i1 immediates before %A and before %B both 1 and the
				; trailing i1 immediate 0; neg_lo:[1,1,0] is expected on both instructions.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with the i1 immediates before %A and before %B both 0 and the
				; trailing i1 immediate 1; only the clamp modifier is expected.
				; First call: the result is expected in registers separate from all sources.
				; Second call (%B used for both matrix operands): the result is expected to
				; overwrite the registers holding %C.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with %A_sign = 0, %B_sign = 1 and %clamp = 1 (unsigned A, signed B).
				; The checked instructions carry neg_lo:[0,1,0] together with clamp.
				; The second call passes %B as both matrix inputs; each result is stored
				; through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with %A_sign = 1, %B_sign = 0 and %clamp = 1 (signed A, unsigned B).
				; The checked instructions carry neg_lo:[1,0,0] together with clamp.
				; The second call passes %B as both matrix inputs; each result is stored
				; through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu8 WMMA with %A_sign = 1, %B_sign = 1 and %clamp = 1 (signed A, signed B).
				; The checked instructions carry neg_lo:[1,1,0] together with clamp.
				; The second call passes %B as both matrix inputs; each result is stored
				; through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
				; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; @llvm.amdgcn.wmma.i32.16x16x16.iu4

				; iu4 WMMA with %A_sign = 0, %B_sign = 0 and %clamp = 0 (unsigned A, unsigned B).
				; The checked instructions carry neither a neg_lo nor a clamp modifier.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 0, %B_sign = 1 and %clamp = 0 (unsigned A, signed B).
				; The checked instructions carry neg_lo:[0,1,0] and no clamp modifier.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 1, %B_sign = 0 and %clamp = 0 (signed A, unsigned B).
				; The checked instructions carry neg_lo:[1,0,0] and no clamp modifier.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 1, %B_sign = 1 and %clamp = 0 (signed A, signed B).
				; The checked instructions carry neg_lo:[1,1,0] and no clamp modifier.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 0, %B_sign = 0 and %clamp = 1 (unsigned A, unsigned B).
				; The checked instructions carry the clamp modifier and no neg_lo modifier.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 0, %B_sign = 1 and %clamp = 1 (unsigned A, signed B).
				; The checked instructions carry neg_lo:[0,1,0] together with clamp.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 1, %B_sign = 0 and %clamp = 1 (signed A, unsigned B).
				; The checked instructions carry neg_lo:[1,0,0] together with clamp.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

				; iu4 WMMA with %A_sign = 1, %B_sign = 1 and %clamp = 1 (signed A, signed B).
				; The checked instructions carry neg_lo:[1,1,0] together with clamp.
				; A and B are <2 x i32> here; the second call passes %B as both matrix
				; inputs, and each result is stored through its own output pointer.
				define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
				; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
				; W64: ; %bb.0: ; %bb
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
				; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
				; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
				; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
				; W64-NEXT: s_endpgm
				bb:
				%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
				store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
				store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
				ret void
				}

llvm/test/MC/AMDGPU/gfx11_wmma.s

This file was added.

				// Assembler test for the gfx1100 WMMA instructions. The file is assembled
				// four times: twice with -show-encoding to check the accepted forms and
				// their encodings (wavefrontsize32 and wavefrontsize64), and twice with
				// stderr merged into stdout to check the diagnostics for rejected forms.
				// Every invocation is wrapped in "not" because the file intentionally
				// contains instructions that fail to assemble.
				// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s
				// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s
				// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
				// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s

				//
				// Test v_wmma_f32_16x16x16_f16
				//
				// What the checks in this section pin down:
				//  - Forms whose destination and accumulator span eight registers
				//    (v[16:23]) assemble in the wavefrontsize32 run and are rejected in the
				//    wavefrontsize64 run; four-register forms (v[16:19]) behave the other
				//    way round.
				//  - The literal operand 1.0 is rejected in both runs as the first or
				//    second source, but is accepted as the third (accumulator) source.
				//  - op_sel and clamp are rejected in both runs.
				//  - neg_lo/neg_hi on the first and/or second source are accepted.
				//

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0
				// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0
				// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				//
				// Test v_wmma_f32_16x16x16_bf16
				//
				// What the checks in this section pin down:
				//  - Forms whose destination and accumulator span eight registers
				//    (v[16:23]) assemble in the wavefrontsize32 run and are rejected in the
				//    wavefrontsize64 run; four-register forms (v[16:19]) behave the other
				//    way round.
				//  - The literal operand 1.0 is rejected in both runs as the first or
				//    second source, but is accepted as the third (accumulator) source.
				//  - op_sel and clamp are rejected in both runs.
				//  - neg_lo/neg_hi on the first and/or second source are accepted.
				//

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0
				// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0
				// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				//
				// Test v_wmma_f16_16x16x16_f16
				//
				// What the checks in this section pin down:
				//  - Forms whose destination and accumulator span eight registers
				//    (v[16:23]) assemble in the wavefrontsize32 run and are rejected in the
				//    wavefrontsize64 run; four-register forms (v[16:19]) behave the other
				//    way round.
				//  - The literal operand 1.0 is rejected in both runs as the first or
				//    second source, but is accepted as the third (accumulator) source.
				//  - op_sel:[0,0,1] is accepted for this opcode and shows up in the
				//    second encoding byte (0x60 instead of 0x40).
				//  - neg_lo/neg_hi on the first and/or second source are accepted.
				//  - clamp is rejected in both runs.
				//

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f16_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				//
				// Test v_wmma_bf16_16x16x16_bf16
				//
				// What the checks in this section pin down:
				//  - Forms whose destination and accumulator span eight registers
				//    (v[16:23]) assemble in the wavefrontsize32 run and are rejected in the
				//    wavefrontsize64 run; four-register forms (v[16:19]) behave the other
				//    way round.
				//  - The literal operand 1.0 is rejected in both runs as the first or
				//    second source, but is accepted as the third (accumulator) source.
				//  - op_sel:[0,0,1] is accepted for this opcode and shows up in the
				//    second encoding byte (0x60 instead of 0x40).
				//  - neg_lo/neg_hi on the first and/or second source are accepted.
				//  - clamp is rejected in both runs.
				//

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				//
				// Test v_wmma_i32_16x16x16_iu8
				//

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:15], 1, v[4:7], v[8:15]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[8:11], 1, v[4:7], v[8:11]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], 1, v[8:15]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], 1, v[8:11]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
				// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
				// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				//
				// Test v_wmma_i32_16x16x16_iu4
				//

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:11], 1, v[2:3], v[4:11]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[4:7], 1, v[2:3], v[4:7]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], 1, v[4:11]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], 1, v[4:7]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
				// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0]
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
				// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
				// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

				v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
				// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
				// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt

This file was added.

				# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W32 %s
				# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W64 %s


				# Test v_wmma_f32_16x16x16_f16

				# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
				# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
				0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b

				# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
				# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
				0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c

				# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
				# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
				0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c

				# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
				# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
				0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c


				# Test v_wmma_f32_16x16x16_bf16

				# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
				# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
				0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b

				# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
				# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
				0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c

				# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
				# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
				0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c

				# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
				# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
				0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c


				# Test v_wmma_f16_16x16x16_f16

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
				0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
				0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
				0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c

				# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
				# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
				0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c


				# Test v_wmma_bf16_16x16x16_bf16

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
				0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
				0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
				0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
				0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c

				# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
				# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
				0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c


				# Test v_wmma_i32_16x16x16_iu8

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
				0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
				0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
				0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
				0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
				0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c

				# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
				# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
				0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c


				# Test v_wmma_i32_16x16x16_iu4

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
				0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
				0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
				0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
				0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
				0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c

				# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
				# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
				0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] gfx11 WMMA instruction support
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 441415

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

llvm/lib/Target/AMDGPU/SIDefines.h

llvm/lib/Target/AMDGPU/SIInstrFormats.td

llvm/lib/Target/AMDGPU/SIInstrInfo.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.td

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

llvm/lib/Target/AMDGPU/VOPInstructions.td

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir

llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir

llvm/test/CodeGen/AMDGPU/wmma-hazards.mir

llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll

llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll

llvm/test/MC/AMDGPU/gfx11_wmma.s

llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] gfx11 WMMA instruction support
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 441415

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

llvm/lib/Target/AMDGPU/SIDefines.h

llvm/lib/Target/AMDGPU/SIInstrFormats.td

llvm/lib/Target/AMDGPU/SIInstrInfo.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.td

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

llvm/lib/Target/AMDGPU/VOPInstructions.td

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/threeaddr-wmma.mir

llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir

llvm/test/CodeGen/AMDGPU/wmma-hazards.mir

llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll

llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll

llvm/test/MC/AMDGPU/gfx11_wmma.s

llvm/test/MC/Disassembler/AMDGPU/gfx11_wmma.txt

[AMDGPU] gfx11 WMMA instruction support
ClosedPublic