This is an archive of the discontinued LLVM Phabricator instance.

Show First 20 Lines • Show All 2,017 Lines • ▼ Show 20 Lines	[
llvm_i1_ty, // %B_sign		llvm_i1_ty, // %B_sign
AB, // %B		AB, // %B
LLVMMatchType<0>, // %C		LLVMMatchType<0>, // %C
llvm_i1_ty, // %clamp		llvm_i1_ty, // %clamp
],		],
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]		[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;		>;

def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v8f32_ty, llvm_anyfloat_ty>;		def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v8i32_ty, llvm_anyfloat_ty>;		def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8f32_ty, llvm_anyfloat_ty>;		def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8i32_ty, llvm_anyint_ty>;		def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;		def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;		def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;


//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Deep learning intrinsics.		// Deep learning intrinsics.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

▲ Show 20 Lines • Show All 325 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Show First 20 Lines • Show All 323 Lines • ▼ Show 20 Lines	def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD",
SDTFPRoundOp		SDTFPRoundOp
>;		>;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// ValueType helpers		// ValueType helpers
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

// Returns 1 if the source arguments have modifiers, 0 if they do not.		// Returns 1 if the source arguments have modifiers, 0 if they do not.
// XXX - do f16 instructions?
class isFloatType<ValueType SrcVT> {		class isFloatType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, f16.Value),		bit ret = !or(!eq(SrcVT.Value, f16.Value),
!eq(SrcVT.Value, f32.Value),		!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),		!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),		!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),		!eq(SrcVT.Value, v4f16.Value),
		!eq(SrcVT.Value, v8f16.Value),
		!eq(SrcVT.Value, v16f16.Value),
!eq(SrcVT.Value, v2f32.Value),		!eq(SrcVT.Value, v2f32.Value),
!eq(SrcVT.Value, v4f32.Value),		!eq(SrcVT.Value, v4f32.Value),
!eq(SrcVT.Value, v8f32.Value),		!eq(SrcVT.Value, v8f32.Value),
!eq(SrcVT.Value, v2f64.Value),		!eq(SrcVT.Value, v2f64.Value),
!eq(SrcVT.Value, v4f64.Value));		!eq(SrcVT.Value, v4f64.Value));
}		}

		// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {		class isIntType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, i16.Value),		bit ret = !or(!eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),		!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value),		!eq(SrcVT.Value, i64.Value),
		!eq(SrcVT.Value, v4i16.Value),
		!eq(SrcVT.Value, v8i16.Value),
		!eq(SrcVT.Value, v16i16.Value),
!eq(SrcVT.Value, v2i32.Value),		!eq(SrcVT.Value, v2i32.Value),
!eq(SrcVT.Value, v4i32.Value),		!eq(SrcVT.Value, v4i32.Value),
!eq(SrcVT.Value, v8i32.Value));		!eq(SrcVT.Value, v8i32.Value));
}		}

class isPackedType<ValueType SrcVT> {		class isPackedType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, v2i16.Value),		bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f16.Value),		!eq(SrcVT.Value, v2f16.Value),
▲ Show 20 Lines • Show All 1,370 Lines • ▼ Show 20 Lines
class isModifierType<ValueType SrcVT> {		class isModifierType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, f16.Value),		bit ret = !or(!eq(SrcVT.Value, f16.Value),
!eq(SrcVT.Value, f32.Value),		!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),		!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),		!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v2i16.Value),		!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f32.Value),		!eq(SrcVT.Value, v2f32.Value),
!eq(SrcVT.Value, v2i32.Value),		!eq(SrcVT.Value, v2i32.Value),
		!eq(SrcVT.Value, v4f16.Value),
		!eq(SrcVT.Value, v4i16.Value),
!eq(SrcVT.Value, v4f32.Value),		!eq(SrcVT.Value, v4f32.Value),
!eq(SrcVT.Value, v4i32.Value),		!eq(SrcVT.Value, v4i32.Value),
		!eq(SrcVT.Value, v8f16.Value),
		!eq(SrcVT.Value, v8i16.Value),
!eq(SrcVT.Value, v8f32.Value),		!eq(SrcVT.Value, v8f32.Value),
!eq(SrcVT.Value, v8i32.Value));		!eq(SrcVT.Value, v8i32.Value),
		!eq(SrcVT.Value, v16f16.Value),
		!eq(SrcVT.Value, v16i16.Value));
}		}

// Return type of input modifiers operand for specified input operand		// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT, bit EnableF32SrcMods> {		class getSrcMod <ValueType VT, bit EnableF32SrcMods> {
bit isFP = isFloatType<VT>.ret;		bit isFP = isFloatType<VT>.ret;
bit isPacked = isPackedType<VT>.ret;		bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),		Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),		!if(isFP, FP64InputMods, Int64InputMods),
▲ Show 20 Lines • Show All 1,209 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Show First 20 Lines • Show All 678 Lines • ▼ Show 20 Lines

class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {		class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {
let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);		let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
let Src0RC64 = _Src01RC64;		let Src0RC64 = _Src01RC64;
let Src1RC64 = _Src01RC64;		let Src1RC64 = _Src01RC64;
let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);		let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);
let HasClamp = _HasClamp;		let HasClamp = _HasClamp;
let HasOpSel = _HasOpSel;		let HasOpSel = _HasOpSel;
		let IsPacked = 1;
let IsWMMA = 1;		let IsWMMA = 1;
}		}

def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile <[v8f32, v8f32, v8f32, v8f32]>;		def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>;
def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile <[v8f32, v8i32, v8i32, v8f32]>;		def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>;
		def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>;
		def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>;
def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;		def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;		def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;
def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile <[v8i32, v8i32, v8i32, v8i32]>;

def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile <[v4f32, v8f32, v8f32, v4f32]>;		def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;		def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
		def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
		def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;		def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;		def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile <[v4i32, v8i32, v8i32, v4i32]>;

class WMMAType <bits<2> val> {		class WMMAType <bits<2> val> {
bit hasClamp = val{0};		bit hasClamp = val{0};
bit hasOpsel = val{1};		bit hasOpsel = val{1};
}		}

def WMMARegular : WMMAType<0b00>;		def WMMARegular : WMMAType<0b00>;
def WMMAUIClamp : WMMAType<0b01>;		def WMMAUIClamp : WMMAType<0b01>;
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines	if !eq(Type, WMMAOpSel) then {
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;		def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else if !eq(Type, WMMAUIClamp) then {		} else if !eq(Type, WMMAUIClamp) then {
def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;		def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else {		} else {
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;		def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
}		}
}		}


let WaveSizePredicate = isWave32 in {		let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;		defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;		defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;		defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;		defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;		defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;		defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
}		}

let WaveSizePredicate = isWave64 in {		let WaveSizePredicate = isWave64 in {
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;		defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;		defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;		defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;		defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;		defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;		defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;

}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Begin Real Encodings		// Begin Real Encodings
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,		class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
string opName = ps.OpName>		string opName = ps.OpName>
▲ Show 20 Lines • Show All 364 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32			; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32

	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)			declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)			declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f32_16x16x16_f16:			; W32-LABEL: test_wmma_f32_16x16x16_f16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f32_16x16x16_bf16:			; W32-LABEL: test_wmma_f32_16x16x16_bf16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 240 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W64			; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W64

	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)			declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)			declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f32_16x16x16_f16:			; W64-LABEL: test_wmma_f32_16x16x16_f16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f32_16x16x16_bf16:			; W64-LABEL: test_wmma_f32_16x16x16_bf16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 208 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32			; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32

	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)			declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)			declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f32_16x16x16_f16:			; W32-LABEL: test_wmma_f32_16x16x16_f16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f32_16x16x16_bf16:			; W32-LABEL: test_wmma_f32_16x16x16_bf16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[16:19], off			; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
	; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 240 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W64			; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W64

	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)			declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)			declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f32_16x16x16_f16:			; W64-LABEL: test_wmma_f32_16x16x16_f16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f32_16x16x16_bf16:			; W64-LABEL: test_wmma_f32_16x16x16_bf16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[16:19], off			; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
	; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 208 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32			; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s \| FileCheck %s --check-prefix=W32

	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)			declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
	declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)			declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)			declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
	declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)			declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

	; The tests demonstrate that the following WMMA register constraints are satisfied.			; The tests demonstrate that the following WMMA register constraints are satisfied.
	;			;
	; v_wmma D, A, B, C			; v_wmma D, A, B, C
	; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).			; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
	;			;
	; In each test,			; In each test,
	; - first wmma instruction: the dest register D is different than all the sources			; - first wmma instruction: the dest register D is different than all the sources
	; - second wmma instruction: the dest register D and src2 (C) are the same			; - second wmma instruction: the dest register D and src2 (C) are the same


	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_f32_16x16x16_f16:			; W32-LABEL: test_wmma_f32_16x16x16_f16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
	%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C)			%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32			store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_f32_16x16x16_bf16:			; W32-LABEL: test_wmma_f32_16x16x16_bf16:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)			%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
	%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C)			%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
	store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32			store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
	%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0)			%res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32			store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)			%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
	%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1)			%res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
	store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32			store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
	store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32			store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
	%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0)			%res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32			store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]			; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16			; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
	; W32-NEXT: global_store_b128 v[24:25], v[28:31], off			; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
	; W32-NEXT: s_clause 0x1			; W32-NEXT: s_clause 0x1
	; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16			; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
	; W32-NEXT: global_store_b128 v[26:27], v[16:19], off			; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
	; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W32-NEXT: s_endpgm			; W32-NEXT: s_endpgm
	bb:			bb:
	%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)			%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
	%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1)			%res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
	store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32			store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
	store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32			store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
	; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W32: ; %bb.0: ; %bb			; W32: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 336 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64			; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)			declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
	declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)			declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)			declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
	declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)			declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

	; The tests demonstrate that the following WMMA register constraints are satisfied.			; The tests demonstrate that the following WMMA register constraints are satisfied.
	;			;
	; v_wmma D, A, B, C			; v_wmma D, A, B, C
	; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).			; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
	;			;
	; In each test,			; In each test,
	; - first wmma instruction: the dest register D is different than all the sources			; - first wmma instruction: the dest register D is different than all the sources
	; - second wmma instruction: the dest register D and src2 (C) are the same			; - second wmma instruction: the dest register D and src2 (C) are the same


	; @llvm.amdgcn.wmma.f32.16x16x16.f16			; @llvm.amdgcn.wmma.f32.16x16x16.f16

	define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_f32_16x16x16_f16:			; W64-LABEL: test_wmma_f32_16x16x16_f16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
	%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C)			%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16			store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f32.16x16x16.bf16			; @llvm.amdgcn.wmma.f32.16x16x16.bf16

	define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_f32_16x16x16_bf16:			; W64-LABEL: test_wmma_f32_16x16x16_bf16:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)			%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
	%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C)			%res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
	store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16			store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.f16.16x16x16.f16			; @llvm.amdgcn.wmma.f16.16x16x16.f16

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:			; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
	%res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0)			%res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16			store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:			; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)			%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
	%res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1)			%res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
	store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16			store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
	store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16			store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.bf16.16x16x16.bf16			; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
	%res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0)			%res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16			store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:			; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]			; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
	; W64-NEXT: global_store_b128 v[20:21], v[24:27], off			; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
	; W64-NEXT: global_store_b128 v[22:23], v[16:19], off			; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
	; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; W64-NEXT: s_endpgm			; W64-NEXT: s_endpgm
	bb:			bb:
	%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)			%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
	%res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1)			%res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
	store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16			store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
	store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16			store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
	ret void			ret void
	}			}

	; @llvm.amdgcn.wmma.i32.16x16x16.iu8			; @llvm.amdgcn.wmma.i32.16x16x16.iu8

	define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {			define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
	; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:			; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
	; W64: ; %bb.0: ; %bb			; W64: ; %bb.0: ; %bb
	▲ Show 20 Lines • Show All 272 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Update WMMA intrinsics with explicit f16 types (Closed • Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 441607

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

llvm/lib/Target/AMDGPU/SIInstrInfo.td

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll

llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll

llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll

[AMDGPU] Update WMMA intrinsics with explicit f16 types
Closed • Public