Diff 402618

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 354 Lines • ▼ Show 20 Lines	AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);		setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
▲ Show 20 Lines • Show All 1,032 Lines • ▼ Show 20 Lines	SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
EVT SrcVT = Op.getOperand(0).getValueType();		EVT SrcVT = Op.getOperand(0).getValueType();

// For these types, we have some TableGen patterns except if the index is 1		// For these types, we have some TableGen patterns except if the index is 1
if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||		if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
(SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&		(SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
Start != 1)		Start != 1)
return Op;		return Op;

		if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
		(SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
		(Start == 0 || Start == 4))
		return Op;

DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,		DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());		VT.getVectorNumElements());

return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);		return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}		}

/// Generate Min/Max node		/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,		SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
▲ Show 20 Lines • Show All 3,494 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);		addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);		addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);

// Unless there are also VOP3P operations, no operations are really legal.		// Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);		addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);		addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);		addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);		addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
		addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
		addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
}		}

addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);		addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));		addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));

computeRegisterProperties(Subtarget->getRegisterInfo());		computeRegisterProperties(Subtarget->getRegisterInfo());

// The boolean content concept here is too inflexible. Compares only ever		// The boolean content concept here is too inflexible. Compares only ever
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
#endif		#endif

// We only support LOAD/STORE and vector manipulation ops for vectors		// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.		// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,		for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,		MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,		MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,		MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {		MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
		MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {		for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {		switch (Op) {
case ISD::LOAD:		case ISD::LOAD:
case ISD::STORE:		case ISD::STORE:
case ISD::BUILD_VECTOR:		case ISD::BUILD_VECTOR:
case ISD::BITCAST:		case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:		case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:		case ISD::INSERT_VECTOR_ELT:
▲ Show 20 Lines • Show All 325 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {

setOperationAction(ISD::FDIV, MVT::f16, Custom);		setOperationAction(ISD::FDIV, MVT::f16, Custom);

// F16 - VOP3 Actions.		// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);		setOperationAction(ISD::FMA, MVT::f16, Legal);
if (STI.hasMadF16())		if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);		setOperationAction(ISD::FMAD, MVT::f16, Legal);

for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {		for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
		MVT::v8f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {		for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {		switch (Op) {
case ISD::LOAD:		case ISD::LOAD:
case ISD::STORE:		case ISD::STORE:
case ISD::BUILD_VECTOR:		case ISD::BUILD_VECTOR:
case ISD::BITCAST:		case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:		case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:		case ISD::INSERT_VECTOR_ELT:
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::LOAD, MVT::v4f16, Promote);		setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);		AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

setOperationAction(ISD::STORE, MVT::v4i16, Promote);		setOperationAction(ISD::STORE, MVT::v4i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);		AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4f16, Promote);		setOperationAction(ISD::STORE, MVT::v4f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);		AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

		setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
		AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
		setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
		AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);

		setOperationAction(ISD::STORE, MVT::v4i16, Promote);
		AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
		setOperationAction(ISD::STORE, MVT::v4f16, Promote);
		AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

		setOperationAction(ISD::STORE, MVT::v8i16, Promote);
		AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
		setOperationAction(ISD::STORE, MVT::v8f16, Promote);
		AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);

setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);		setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);		setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);		setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);		setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);		setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);		setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);		setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

		setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
		setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
		setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);

if (!Subtarget->hasVOP3PInsts()) {		if (!Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
}		}

setOperationAction(ISD::FNEG, MVT::v2f16, Legal);		setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and		// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)		// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);		setOperationAction(ISD::FABS, MVT::v2f16, Legal);

setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);		setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Custom);		setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);		setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);		setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);		setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);		setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
		setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
		setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);

setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);		setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);		setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
		setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
		setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);

		for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
		setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
		setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
		setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
		setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
		}
}		}

if (Subtarget->hasVOP3PInsts()) {		if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);		setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);		setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);		setOperationAction(ISD::MUL, MVT::v2i16, Legal);
setOperationAction(ISD::SHL, MVT::v2i16, Legal);		setOperationAction(ISD::SHL, MVT::v2i16, Legal);
setOperationAction(ISD::SRL, MVT::v2i16, Legal);		setOperationAction(ISD::SRL, MVT::v2i16, Legal);
Show All 17 Lines	if (Subtarget->hasVOP3PInsts()) {

setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);		setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);

setOperationAction(ISD::SHL, MVT::v4i16, Custom);		for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
setOperationAction(ISD::SRA, MVT::v4i16, Custom);		// Split vector operations.
setOperationAction(ISD::SRL, MVT::v4i16, Custom);		setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::ADD, MVT::v4i16, Custom);		setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SUB, MVT::v4i16, Custom);		setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::MUL, MVT::v4i16, Custom);		setOperationAction(ISD::ADD, VT, Custom);
		setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::SMIN, MVT::v4i16, Custom);		setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
setOperationAction(ISD::UMIN, MVT::v4i16, Custom);		setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::UMAX, MVT::v4i16, Custom);		setOperationAction(ISD::SMAX, VT, Custom);
		setOperationAction(ISD::UMIN, VT, Custom);
setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);		setOperationAction(ISD::UMAX, VT, Custom);
setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);		setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);		setOperationAction(ISD::SADDSAT, VT, Custom);
		setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::FADD, MVT::v4f16, Custom);		setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);		}
setOperationAction(ISD::FMA, MVT::v4f16, Custom);
		for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
		// Split vector operations.
		setOperationAction(ISD::FADD, VT, Custom);
		setOperationAction(ISD::FMUL, VT, Custom);
		setOperationAction(ISD::FMA, VT, Custom);
		setOperationAction(ISD::FCANONICALIZE, VT, Custom);
		}

setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);		setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);		setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);		setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);		setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

setOperationAction(ISD::FEXP, MVT::v2f16, Custom);		setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);		setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);		setOperationAction(ISD::SELECT, MVT::v4f16, Custom);

if (Subtarget->hasPackedFP32Ops()) {		if (Subtarget->hasPackedFP32Ops()) {
setOperationAction(ISD::FADD, MVT::v2f32, Legal);		setOperationAction(ISD::FADD, MVT::v2f32, Legal);
setOperationAction(ISD::FMUL, MVT::v2f32, Legal);		setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
Show All 20 Lines	if (Subtarget->has16BitInsts()) {
// Legalization hack.		// Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);		setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);		setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

setOperationAction(ISD::FNEG, MVT::v2f16, Custom);		setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
setOperationAction(ISD::FABS, MVT::v2f16, Custom);		setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}		}

for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {		for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
		MVT::v8i16, MVT::v8f16 }) {
setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
}		}

setOperationAction(ISD::SMULO, MVT::i64, Custom);		setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);		setOperationAction(ISD::UMULO, MVT::i64, Custom);

if (Subtarget->hasMad64_32()) {		if (Subtarget->hasMad64_32()) {
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);		setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
▲ Show 20 Lines • Show All 3,794 Lines • ▼ Show 20 Lines

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the		// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.		// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,		SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();		unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||		assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);		VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
		VT == MVT::v16f32 || VT == MVT::v32f32);

SDValue Lo0, Hi0;		SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);		std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
SDValue Lo1, Hi1;		SDValue Lo1, Hi1;
std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);		std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

SDLoc SL(Op);		SDLoc SL(Op);

SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,		SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());		Op->getFlags());
SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,		SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());		Op->getFlags());

return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);		return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}		}

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,		SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();		unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||		assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);		VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
		VT == MVT::v16f32 || VT == MVT::v32f32);

SDValue Lo0, Hi0;		SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);		SDValue Op0 = Op.getOperand(0);
		std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
		? DAG.SplitVectorOperand(Op.getNode(), 0)
		: std::make_pair(Op0, Op0);
SDValue Lo1, Hi1;		SDValue Lo1, Hi1;
std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);		std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
SDValue Lo2, Hi2;		SDValue Lo2, Hi2;
std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);		std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);

SDLoc SL(Op);		SDLoc SL(Op);
		auto ResVT = DAG.GetSplitDestVTs(VT);

SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,		SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());		Op->getFlags());
SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,		SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());		Op->getFlags());

return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);		return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}		}


SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {		switch (Op.getOpcode()) {
▲ Show 20 Lines • Show All 645 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,

// FIXME: Assert during selection that this is only selected for		// FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee		// ieee_mode. Currently a combine can produce the ieee version for non-ieee
// mode functions, but this happens to be OK since it's only done in cases		// mode functions, but this happens to be OK since it's only done in cases
// where there is known no sNaN.		// where there is known no sNaN.
if (IsIEEEMode)		if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);		return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

if (VT == MVT::v4f16)		if (VT == MVT::v4f16 || VT == MVT::v8f16)
return splitBinaryVectorOp(Op, DAG);		return splitBinaryVectorOp(Op, DAG);
return Op;		return Op;
}		}

SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
SDLoc SL(Op);		SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);		SDValue LHS = Op.getOperand(0);
▲ Show 20 Lines • Show All 385 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDLoc SL(Op);		SDLoc SL(Op);

EVT ResultVT = Op.getValueType();		EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);		SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);		SDValue Idx = Op.getOperand(1);
EVT VecVT = Vec.getValueType();		EVT VecVT = Vec.getValueType();
unsigned VecSize = VecVT.getSizeInBits();		unsigned VecSize = VecVT.getSizeInBits();
EVT EltVT = VecVT.getVectorElementType();		EVT EltVT = VecVT.getVectorElementType();
assert(VecSize <= 64);

DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);		DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

// Make sure we do any optimizations that will make it easier to fold		// Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.		// source modifiers before obscuring it with bit operations.

// XXX - Why doesn't this get called when vector_shuffle is expanded?		// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))		if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;		return Combined;

		if (VecSize == 128) {
		SDValue Lo, Hi;
		EVT LoVT, HiVT;
		SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
		std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
		Lo =
		DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
		V2, DAG.getConstant(0, SL, MVT::i32)));
		Hi =
		DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
		V2, DAG.getConstant(1, SL, MVT::i32)));
		EVT IdxVT = Idx.getValueType();
		unsigned NElem = VecVT.getVectorNumElements();
		assert(isPowerOf2_32(NElem));
		SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
		SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
		SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
		}

		assert(VecSize <= 64);

unsigned EltSize = EltVT.getSizeInBits();		unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));		assert(isPowerOf2_32(EltSize));

MVT IntVT = MVT::getIntegerVT(VecSize);		MVT IntVT = MVT::getIntegerVT(VecSize);
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);		SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

// Convert vector index to bit-index (* EltSize)		// Convert vector index to bit-index (* EltSize)
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);		SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);		return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}		}

SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,		SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();

if (VT == MVT::v4i16 || VT == MVT::v4f16) {		if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);		VT == MVT::v8i16 || VT == MVT::v8f16) {
		EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
		VT.getVectorNumElements() / 2);
		MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());

// Turn into pair of packed build_vectors.		// Turn into pair of packed build_vectors.
// TODO: Special case for constants that can be materialized with s_mov_b64.		// TODO: Special case for constants that can be materialized with s_mov_b64.
SDValue Lo = DAG.getBuildVector(HalfVT, SL,		SmallVector<SDValue, 4> LoOps, HiOps;
{ Op.getOperand(0), Op.getOperand(1) });		for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
SDValue Hi = DAG.getBuildVector(HalfVT, SL,		LoOps.push_back(Op.getOperand(I));
{ Op.getOperand(2), Op.getOperand(3) });		HiOps.push_back(Op.getOperand(I + E));
		}
		SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
		SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);

SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);		SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);		SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);

SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });		SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
		{ CastLo, CastHi });
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);		return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}		}

assert(VT == MVT::v2f16 || VT == MVT::v2i16);		assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");		assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

SDValue Lo = Op.getOperand(0);		SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);		SDValue Hi = Op.getOperand(1);
▲ Show 20 Lines • Show All 2,595 Lines • ▼ Show 20 Lines	if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
return DAG.getMergeValues(Ops, DL);		return DAG.getMergeValues(Ops, DL);
}		}

return SDValue();		return SDValue();
}		}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
		if (VT.getSizeInBits() == 128)
		return splitTernaryVectorOp(Op, DAG);

assert(VT.getSizeInBits() == 64);		assert(VT.getSizeInBits() == 64);

SDLoc DL(Op);		SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);		SDValue Cond = Op.getOperand(0);

SDValue Zero = DAG.getConstant(0, DL, MVT::i32);		SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);		SDValue One = DAG.getConstant(1, DL, MVT::i32);

▲ Show 20 Lines • Show All 4,063 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 1,186 Lines • ▼ Show 20 Lines	def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))		(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
>;		>;

def : Pat <		def : Pat <
(extract_subvector v4f16:$vec, (i32 2)),		(extract_subvector v4f16:$vec, (i32 2)),
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))		(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;		>;

		def : Pat <
		(extract_subvector v8i16:$vec, (i32 0)),
		(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
		>;

		def : Pat <
		(extract_subvector v8i16:$vec, (i32 4)),
		(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
		>;

		def : Pat <
		(extract_subvector v8f16:$vec, (i32 0)),
		(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
		>;

		def : Pat <
		(extract_subvector v8f16:$vec, (i32 4)),
		(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
		>;

foreach Index = 0-31 in {		foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <		def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)		i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
>;		>;

def Insert_Element_v32i32_#Index : Insert_Element <		def Insert_Element_v32i32_#Index : Insert_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)		i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
>;		>;
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines
def : BitConvert <v2f64, v4f32, VReg_128>;		def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;		def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;		def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;		def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;		def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;		def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;		def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;		def : BitConvert <v2i64, v4f32, VReg_128>;
		def : BitConvert <v8i16, v4i32, SReg_128>;
		def : BitConvert <v4i32, v8i16, SReg_128>;
		def : BitConvert <v8f16, v4f32, VReg_128>;
		def : BitConvert <v8f16, v4i32, VReg_128>;
		def : BitConvert <v4f32, v8f16, VReg_128>;
		def : BitConvert <v4i32, v8f16, VReg_128>;
		def : BitConvert <v8i16, v8f16, VReg_128>;
		def : BitConvert <v8f16, v8i16, VReg_128>;
		def : BitConvert <v4f32, v8i16, VReg_128>;
		def : BitConvert <v8i16, v4f32, VReg_128>;
		def : BitConvert <v8i16, v8f16, SReg_128>;
		def : BitConvert <v8i16, v2i64, SReg_128>;
		def : BitConvert <v8i16, v2f64, SReg_128>;
		def : BitConvert <v8f16, v2i64, SReg_128>;
		def : BitConvert <v8f16, v2f64, SReg_128>;
		def : BitConvert <v8f16, v8i16, SReg_128>;
		def : BitConvert <v2i64, v8i16, SReg_128>;
		def : BitConvert <v2f64, v8i16, SReg_128>;
		def : BitConvert <v2i64, v8f16, SReg_128>;
		def : BitConvert <v2f64, v8f16, SReg_128>;

// 160-bit bitcast		// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SReg_160>;		def : BitConvert <v5i32, v5f32, SReg_160>;
def : BitConvert <v5f32, v5i32, SReg_160>;		def : BitConvert <v5f32, v5i32, SReg_160>;
def : BitConvert <v5i32, v5f32, VReg_160>;		def : BitConvert <v5i32, v5f32, VReg_160>;
def : BitConvert <v5f32, v5i32, VReg_160>;		def : BitConvert <v5f32, v5i32, VReg_160>;

// 192-bit bitcast		// 192-bit bitcast
▲ Show 20 Lines • Show All 1,795 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Show First 20 Lines • Show All 611 Lines • ▼ Show 20 Lines

def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,		def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add FP_REG, SP_REG)> {		(add FP_REG, SP_REG)> {
let isAllocatable = 0;		let isAllocatable = 0;
let CopyCost = -1;		let CopyCost = -1;
let HasSGPR = 1;		let HasSGPR = 1;
}		}

def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,		def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
(add PRIVATE_RSRC_REG)> {		(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;		let isAllocatable = 0;
let CopyCost = -1;		let CopyCost = -1;
let HasSGPR = 1;		let HasSGPR = 1;
}		}

def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,		def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
(add LDS_DIRECT)> {		(add LDS_DIRECT)> {
▲ Show 20 Lines • Show All 150 Lines • ▼ Show 20 Lines	def SReg_ # suffix :
!dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]),		!dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]),
(add)))> {		(add)))> {
let isAllocatable = 0;		let isAllocatable = 0;
}		}
}		}
}		}

defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;		defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;		defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;		defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;		defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;		defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>;		defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>;
defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;		defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;		defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;

def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,		def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
Show All 23 Lines	let HasVGPR = 1 in {
// Define 2-aligned variant		// Define 2-aligned variant
def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;		def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
}		}
}		}

defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],		defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
(add VGPR_64)>;		(add VGPR_64)>;
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;		defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;		defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;		defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;

defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;		defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;		defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;		defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;		defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;		defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;

multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {		multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {		let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
// Define the regular class.		// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList>;		def "" : VRegClassBase<numRegs, regTypes, regList>;

// Define 2-aligned variant		// Define 2-aligned variant
def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;		def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
}		}
}		}

defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],		defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;		(add AGPR_64)>;
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;		defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;		defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;		defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;		defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;		defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;		defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;		defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;		defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;

} // End GeneratePressureSet = 0		} // End GeneratePressureSet = 0
▲ Show 20 Lines • Show All 312 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll

	Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines
	; FAST16-LABEL: 'add_i16'			; FAST16-LABEL: 'add_i16'
	; FAST16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = add <2 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = add <2 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = add <3 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = add <3 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = add <4 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = add <4 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17i16 = add <17 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17i16 = add <17 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOW16-LABEL: 'add_i16'			; SLOW16-LABEL: 'add_i16'
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = add <2 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = add <2 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = add <3 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = add <3 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = add <4 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = add <4 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = add <5 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = add <5 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v6i16 = add <6 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v6i16 = add <6 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = add <16 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = add <16 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17i16 = add <17 x i16> undef, undef			; SLOW16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17i16 = add <17 x i16> undef, undef
	; SLOW16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOW16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; FAST16-SIZE-LABEL: 'add_i16'			; FAST16-SIZE-LABEL: 'add_i16'
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = add <2 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = add <2 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = add <3 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = add <3 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = add <4 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = add <4 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17i16 = add <17 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17i16 = add <17 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOW16-SIZE-LABEL: 'add_i16'			; SLOW16-SIZE-LABEL: 'add_i16'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = add i16 undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = add <2 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = add <2 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = add <3 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = add <3 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = add <4 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = add <4 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = add <5 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = add <5 x i16> undef, undef
	▲ Show 20 Lines • Show All 102 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll

	Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 38 Lines
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 38 Lines
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll

	Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 38 Lines
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 38 Lines
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/cast.ll

Show First 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	;
%C1 = sitofp <4 x i16> %c to <4 x float>		%C1 = sitofp <4 x i16> %c to <4 x float>
%C2 = sitofp <4 x i16> %c to <4 x double>		%C2 = sitofp <4 x i16> %c to <4 x double>
%D1 = sitofp <4 x i32> %d to <4 x float>		%D1 = sitofp <4 x i32> %d to <4 x float>
%D2 = sitofp <4 x i32> %d to <4 x double>		%D2 = sitofp <4 x i32> %d to <4 x double>
ret void		ret void
}		}

define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {		define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
; FAST-LABEL: 'sitofp8'		; ALL-LABEL: 'sitofp8'
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void		; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOW-LABEL: 'sitofp8'
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; FAST-SIZE-LABEL: 'sitofp8'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;		;
; SLOW-SIZE-LABEL: 'sitofp8'		; ALL-SIZE-LABEL: 'sitofp8'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;		;
%A1 = sitofp <8 x i1> %a to <8 x float>		%A1 = sitofp <8 x i1> %a to <8 x float>
%B1 = sitofp <8 x i8> %b to <8 x float>		%B1 = sitofp <8 x i8> %b to <8 x float>
%C1 = sitofp <8 x i16> %c to <8 x float>		%C1 = sitofp <8 x i16> %c to <8 x float>
%D1 = sitofp <8 x i32> %d to <8 x float>		%D1 = sitofp <8 x i32> %d to <8 x float>
ret void		ret void
}		}

▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	;
%C1 = uitofp <4 x i16> %c to <4 x float>		%C1 = uitofp <4 x i16> %c to <4 x float>
%C2 = uitofp <4 x i16> %c to <4 x double>		%C2 = uitofp <4 x i16> %c to <4 x double>
%D1 = uitofp <4 x i32> %d to <4 x float>		%D1 = uitofp <4 x i32> %d to <4 x float>
%D2 = uitofp <4 x i32> %d to <4 x double>		%D2 = uitofp <4 x i32> %d to <4 x double>
ret void		ret void
}		}

define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {		define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
; FAST-LABEL: 'uitofp8'		; ALL-LABEL: 'uitofp8'
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>		; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void		; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOW-LABEL: 'uitofp8'
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; FAST-SIZE-LABEL: 'uitofp8'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;		;
; SLOW-SIZE-LABEL: 'uitofp8'		; ALL-SIZE-LABEL: 'uitofp8'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void		; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;		;
%A1 = uitofp <8 x i1> %a to <8 x float>		%A1 = uitofp <8 x i1> %a to <8 x float>
%B1 = uitofp <8 x i8> %b to <8 x float>		%B1 = uitofp <8 x i8> %b to <8 x float>
%C1 = uitofp <8 x i16> %c to <8 x float>		%C1 = uitofp <8 x i16> %c to <8 x float>
%D1 = uitofp <8 x i32> %d to <8 x float>		%D1 = uitofp <8 x i32> %d to <8 x float>
ret void		ret void
}		}

Show All 21 Lines

llvm/test/Analysis/CostModel/AMDGPU/fadd.ll

	Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @fadd_f16() #0 {			define amdgpu_kernel void @fadd_f16() #0 {
	; FASTF16-LABEL: 'fadd_f16'			; FASTF16-LABEL: 'fadd_f16'
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fadd <17 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOWF64-LABEL: 'fadd_f16'			; SLOWF64-LABEL: 'fadd_f16'
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; FASTF16-SIZE-LABEL: 'fadd_f16'			; FASTF16-SIZE-LABEL: 'fadd_f16'
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fadd <17 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOWF64-SIZE-LABEL: 'fadd_f16'			; SLOWF64-SIZE-LABEL: 'fadd_f16'
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef
	Show All 15 Lines

llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll

	Show First 20 Lines • Show All 159 Lines • ▼ Show 20 Lines
	;			;
	; FP16-LABEL: 'fdiv_f16_f32ieee'			; FP16-LABEL: 'fdiv_f16_f32ieee'
	; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ieee'			; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ieee'
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; FP16-SIZE-LABEL: 'fdiv_f16_f32ieee'			; FP16-SIZE-LABEL: 'fdiv_f16_f32ieee'
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = fdiv half undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = fdiv half undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	%f16 = fdiv half undef, undef			%f16 = fdiv half undef, undef
	%v2f16 = fdiv <2 x half> undef, undef			%v2f16 = fdiv <2 x half> undef, undef
	%v3f16 = fdiv <3 x half> undef, undef			%v3f16 = fdiv <3 x half> undef, undef
	%v4f16 = fdiv <4 x half> undef, undef			%v4f16 = fdiv <4 x half> undef, undef
	%v5f16 = fdiv <5 x half> undef, undef			%v5f16 = fdiv <5 x half> undef, undef
	%v16f16 = fdiv <16 x half> undef, undef			%v16f16 = fdiv <16 x half> undef, undef
	Show All 14 Lines
	;			;
	; FP16-LABEL: 'fdiv_f16_f32ftzdaz'			; FP16-LABEL: 'fdiv_f16_f32ftzdaz'
	; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16 = fdiv half undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz'			; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz'
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16 = fdiv half undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16 = fdiv half undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 476 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 476 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; NOFP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; FP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz'			; FP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz'
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = fdiv half undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = fdiv half undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16 = fdiv <2 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16 = fdiv <2 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = fdiv <3 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = fdiv <3 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v17f16 = fdiv <17 x half> undef, undef			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef
	; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	%f16 = fdiv half undef, undef			%f16 = fdiv half undef, undef
	%v2f16 = fdiv <2 x half> undef, undef			%v2f16 = fdiv <2 x half> undef, undef
	%v3f16 = fdiv <3 x half> undef, undef			%v3f16 = fdiv <3 x half> undef, undef
	%v4f16 = fdiv <4 x half> undef, undef			%v4f16 = fdiv <4 x half> undef, undef
	%v5f16 = fdiv <5 x half> undef, undef			%v5f16 = fdiv <5 x half> undef, undef
	%v16f16 = fdiv <16 x half> undef, undef			%v16f16 = fdiv <16 x half> undef, undef
	▲ Show 20 Lines • Show All 434 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/fma.ll

	; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
	; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,SLOWF64,NOPACKEDF32 %s			; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,SLOWF64 %s
	; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,FASTF64,PACKEDF32 %s			; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,FASTF64 %s
	; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,SLOWF64,NOPACKEDF32 %s			; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST,SLOWF64 %s
	; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s \| FileCheck -check-prefixes=SLOW %s			; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s \| FileCheck -check-prefixes=SLOW %s
	; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE,NOPACKEDF32-SIZE %s			; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
	; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE,PACKEDF32-SIZE %s			; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE %s
	; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE,NOPACKEDF32-SIZE %s			; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s \| FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
	; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s \| FileCheck -check-prefixes=SLOW-SIZE %s			; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s \| FileCheck -check-prefixes=SLOW-SIZE %s
	; END.			; END.

	define amdgpu_kernel void @fma_f32() #0 {			define amdgpu_kernel void @fma_f32() #0 {
	; NOPACKEDF32-LABEL: 'fma_f32'			; SLOWF64-LABEL: 'fma_f32'
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
	; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; PACKEDF32-LABEL: 'fma_f32'			; FASTF64-LABEL: 'fma_f32'
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2			; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
	; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOW-LABEL: 'fma_f32'			; SLOW-LABEL: 'fma_f32'
	; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; NOPACKEDF32-SIZE-LABEL: 'fma_f32'			; SLOWF64-SIZE-LABEL: 'fma_f32'
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
	; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; PACKEDF32-SIZE-LABEL: 'fma_f32'			; FASTF64-SIZE-LABEL: 'fma_f32'
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
	; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOW-SIZE-LABEL: 'fma_f32'			; SLOW-SIZE-LABEL: 'fma_f32'
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
	▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @fma_f16() #0 {			define amdgpu_kernel void @fma_f16() #0 {
	; FAST-LABEL: 'fma_f16'			; FAST-LABEL: 'fma_f16'
	; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2			; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
	; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOW-LABEL: 'fma_f16'			; SLOW-LABEL: 'fma_f16'
	; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2			; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
	; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; FAST-SIZE-LABEL: 'fma_f16'			; FAST-SIZE-LABEL: 'fma_f16'
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
	; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOW-SIZE-LABEL: 'fma_f16'			; SLOW-SIZE-LABEL: 'fma_f16'
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
	Show All 38 Lines

llvm/test/Analysis/CostModel/AMDGPU/fmul.ll

	Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @fmul_f16() #0 {			define amdgpu_kernel void @fmul_f16() #0 {
	; GFX9-LABEL: 'fmul_f16'			; GFX9-LABEL: 'fmul_f16'
	; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fmul <17 x half> undef, undef			; GFX9-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef
	; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOW-LABEL: 'fmul_f16'			; SLOW-LABEL: 'fmul_f16'
	; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef			; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef
	; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; GFX9-SIZE-LABEL: 'fmul_f16'			; GFX9-SIZE-LABEL: 'fmul_f16'
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fmul <17 x half> undef, undef			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef
	; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOW-SIZE-LABEL: 'fmul_f16'			; SLOW-SIZE-LABEL: 'fmul_f16'
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef
	; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef			; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef
	Show All 15 Lines

llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
ret i32 undef		ret i32 undef
}		}

define i32 @fptosi_double_i16(i32 %arg) {		define i32 @fptosi_double_i16(i32 %arg) {
; FAST-LABEL: 'fptosi_double_i16'		; FAST-LABEL: 'fptosi_double_i16'
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16		; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; SLOW-LABEL: 'fptosi_double_i16'		; SLOW-LABEL: 'fptosi_double_i16'
; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16		; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; FAST-SIZE-LABEL: 'fptosi_double_i16'		; FAST-SIZE-LABEL: 'fptosi_double_i16'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;		;
; SLOW-SIZE-LABEL: 'fptosi_double_i16'		; SLOW-SIZE-LABEL: 'fptosi_double_i16'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
▲ Show 20 Lines • Show All 91 Lines • ▼ Show 20 Lines	;
ret i32 undef		ret i32 undef
}		}

define i32 @fptosi_float_i16(i32 %arg) {		define i32 @fptosi_float_i16(i32 %arg) {
; FAST-LABEL: 'fptosi_float_i16'		; FAST-LABEL: 'fptosi_float_i16'
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16		; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; SLOW-LABEL: 'fptosi_float_i16'		; SLOW-LABEL: 'fptosi_float_i16'
; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16		; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; FAST-SIZE-LABEL: 'fptosi_float_i16'		; FAST-SIZE-LABEL: 'fptosi_float_i16'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;		;
; SLOW-SIZE-LABEL: 'fptosi_float_i16'		; SLOW-SIZE-LABEL: 'fptosi_float_i16'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
ret i32 undef		ret i32 undef
}		}

define i32 @fptoui_double_i16(i32 %arg) {		define i32 @fptoui_double_i16(i32 %arg) {
; FAST-LABEL: 'fptoui_double_i16'		; FAST-LABEL: 'fptoui_double_i16'
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16		; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; SLOW-LABEL: 'fptoui_double_i16'		; SLOW-LABEL: 'fptoui_double_i16'
; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16		; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; FAST-SIZE-LABEL: 'fptoui_double_i16'		; FAST-SIZE-LABEL: 'fptoui_double_i16'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;		;
; SLOW-SIZE-LABEL: 'fptoui_double_i16'		; SLOW-SIZE-LABEL: 'fptoui_double_i16'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
▲ Show 20 Lines • Show All 91 Lines • ▼ Show 20 Lines	;
ret i32 undef		ret i32 undef
}		}

define i32 @fptoui_float_i16(i32 %arg) {		define i32 @fptoui_float_i16(i32 %arg) {
; FAST-LABEL: 'fptoui_float_i16'		; FAST-LABEL: 'fptoui_float_i16'
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16		; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>		; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; SLOW-LABEL: 'fptoui_float_i16'		; SLOW-LABEL: 'fptoui_float_i16'
; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16		; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>		; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef		; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;		;
; FAST-SIZE-LABEL: 'fptoui_float_i16'		; FAST-SIZE-LABEL: 'fptoui_float_i16'
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef		; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;		;
; SLOW-SIZE-LABEL: 'fptoui_float_i16'		; SLOW-SIZE-LABEL: 'fptoui_float_i16'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>		; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AMDGPU/fsub.ll

	Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @fsub_f16() #0 {			define amdgpu_kernel void @fsub_f16() #0 {
	; FASTF16-LABEL: 'fsub_f16'			; FASTF16-LABEL: 'fsub_f16'
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fsub <17 x half> undef, undef			; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef
	; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOWF64-LABEL: 'fsub_f16'			; SLOWF64-LABEL: 'fsub_f16'
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef
	; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; FASTF16-SIZE-LABEL: 'fsub_f16'			; FASTF16-SIZE-LABEL: 'fsub_f16'
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fsub <17 x half> undef, undef			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef
	; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; SLOWF64-SIZE-LABEL: 'fsub_f16'			; SLOWF64-SIZE-LABEL: 'fsub_f16'
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef
	; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef			; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef
	Show All 13 Lines

llvm/test/Analysis/CostModel/AMDGPU/mul.ll

	Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines
	;			;
	; FAST16-LABEL: 'mul_i16'			; FAST16-LABEL: 'mul_i16'
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = mul i16 undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = mul i16 undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i16 = mul <2 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i16 = mul <2 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3i16 = mul <3 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3i16 = mul <3 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v17i16 = mul <17 x i16> undef, undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17i16 = mul <17 x i16> undef, undef
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
	;			;
	; SLOW16-SIZE-LABEL: 'mul_i16'			; SLOW16-SIZE-LABEL: 'mul_i16'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = mul i16 undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = mul i16 undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i16 = mul <2 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i16 = mul <2 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3i16 = mul <3 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3i16 = mul <3 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17i16 = mul <17 x i16> undef, undef			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17i16 = mul <17 x i16> undef, undef
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	; FAST16-SIZE-LABEL: 'mul_i16'			; FAST16-SIZE-LABEL: 'mul_i16'
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = mul i16 undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = mul i16 undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = mul <2 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = mul <2 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = mul <3 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = mul <3 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = mul <4 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = mul <4 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = mul <5 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = mul <5 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = mul <16 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = mul <16 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17i16 = mul <17 x i16> undef, undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = mul <17 x i16> undef, undef
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
	;			;
	%i16 = mul i16 undef, undef			%i16 = mul i16 undef, undef
	%v2i16 = mul <2 x i16> undef, undef			%v2i16 = mul <2 x i16> undef, undef
	%v3i16 = mul <3 x i16> undef, undef			%v3i16 = mul <3 x i16> undef, undef
	%v4i16 = mul <4 x i16> undef, undef			%v4i16 = mul <4 x i16> undef, undef
	%v5i16 = mul <5 x i16> undef, undef			%v5i16 = mul <5 x i16> undef, undef
	%v16i16 = mul <16 x i16> undef, undef			%v16i16 = mul <16 x i16> undef, undef
	Show All 28 Lines
	; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
	;			;
	; SLOW16-SIZE-LABEL: 'mul_constpow2'			; SLOW16-SIZE-LABEL: 'mul_constpow2'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, 16			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, 16
	Show All 21 Lines
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
	;			;
	%I64 = mul i64 undef, 16			%I64 = mul i64 undef, 16
	%V2i64 = mul <2 x i64> undef, <i64 8, i64 16>			%V2i64 = mul <2 x i64> undef, <i64 8, i64 16>
	▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
	; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
	;			;
	; SLOW16-SIZE-LABEL: 'mul_uniformconstpow2'			; SLOW16-SIZE-LABEL: 'mul_uniformconstpow2'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, 16			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, 16
	Show All 21 Lines
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
	;			;
	%I64 = mul i64 undef, 16			%I64 = mul i64 undef, 16
	%V2i64 = mul <2 x i64> undef, <i64 16, i64 16>			%V2i64 = mul <2 x i64> undef, <i64 16, i64 16>
	▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
	; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
	;			;
	; SLOW16-SIZE-LABEL: 'mul_constnegpow2'			; SLOW16-SIZE-LABEL: 'mul_constnegpow2'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, -16			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, -16
	Show All 21 Lines
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
	;			;
	%I64 = mul i64 undef, -16			%I64 = mul i64 undef, -16
	%V2i64 = mul <2 x i64> undef, <i64 -8, i64 -16>			%V2i64 = mul <2 x i64> undef, <i64 -8, i64 -16>
	▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
	; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = mul i32 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16			; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
	; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef			; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
	;			;
	; SLOW16-SIZE-LABEL: 'mul_uniformconstnegpow2'			; SLOW16-SIZE-LABEL: 'mul_uniformconstnegpow2'
	; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, -16			; SLOW16-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I64 = mul i64 undef, -16
	Show All 21 Lines
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = mul i32 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
	; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef			; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
	;			;
	%I64 = mul i64 undef, -16			%I64 = mul i64 undef, -16
	%V2i64 = mul <2 x i64> undef, <i64 -16, i64 -16>			%V2i64 = mul <2 x i64> undef, <i64 -16, i64 -16>
	Show All 22 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

	Show First 20 Lines • Show All 709 Lines • ▼ Show 20 Lines

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1

	; GFX6-LABEL: name: load_constant_v8s16			; GFX6-LABEL: name: load_constant_v8s16
	; GFX6: liveins: $sgpr0_sgpr1			; GFX6: liveins: $sgpr0_sgpr1
	; GFX6-NEXT: {{ $}}			; GFX6-NEXT: {{ $}}
	; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1			; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
	; GFX6-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)			; GFX6-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
	; GFX6-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX6-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
	; GFX7-LABEL: name: load_constant_v8s16			; GFX7-LABEL: name: load_constant_v8s16
	; GFX7: liveins: $sgpr0_sgpr1			; GFX7: liveins: $sgpr0_sgpr1
	; GFX7-NEXT: {{ $}}			; GFX7-NEXT: {{ $}}
	; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1			; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
	; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)			; GFX7-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
	; GFX7-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX7-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
	; GFX8-LABEL: name: load_constant_v8s16			; GFX8-LABEL: name: load_constant_v8s16
	; GFX8: liveins: $sgpr0_sgpr1			; GFX8: liveins: $sgpr0_sgpr1
	; GFX8-NEXT: {{ $}}			; GFX8-NEXT: {{ $}}
	; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1			; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
	; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)			; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
	; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
	; GFX10-LABEL: name: load_constant_v8s16			; GFX10-LABEL: name: load_constant_v8s16
	; GFX10: liveins: $sgpr0_sgpr1			; GFX10: liveins: $sgpr0_sgpr1
	; GFX10-NEXT: {{ $}}			; GFX10-NEXT: {{ $}}
	; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1			; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
	; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)			; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
	; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
	%0:sgpr(p4) = COPY $sgpr0_sgpr1			%0:sgpr(p4) = COPY $sgpr0_sgpr1
	%1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)			%1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
	$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1			$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1

	...			...

	---			---

	▲ Show 20 Lines • Show All 490 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir

	Show First 20 Lines • Show All 735 Lines • ▼ Show 20 Lines

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $vgpr0_vgpr1			liveins: $vgpr0_vgpr1

	; GFX7-LABEL: name: load_flat_v8s16			; GFX7-LABEL: name: load_flat_v8s16
	; GFX7: liveins: $vgpr0_vgpr1			; GFX7: liveins: $vgpr0_vgpr1
	; GFX7-NEXT: {{ $}}			; GFX7-NEXT: {{ $}}
	; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)			; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
	; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
	; GFX8-LABEL: name: load_flat_v8s16			; GFX8-LABEL: name: load_flat_v8s16
	; GFX8: liveins: $vgpr0_vgpr1			; GFX8: liveins: $vgpr0_vgpr1
	; GFX8-NEXT: {{ $}}			; GFX8-NEXT: {{ $}}
	; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)			; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
	; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
	; GFX9-LABEL: name: load_flat_v8s16			; GFX9-LABEL: name: load_flat_v8s16
	; GFX9: liveins: $vgpr0_vgpr1			; GFX9: liveins: $vgpr0_vgpr1
	; GFX9-NEXT: {{ $}}			; GFX9-NEXT: {{ $}}
	; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)			; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
	; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
	; GFX10-LABEL: name: load_flat_v8s16			; GFX10-LABEL: name: load_flat_v8s16
	; GFX10: liveins: $vgpr0_vgpr1			; GFX10: liveins: $vgpr0_vgpr1
	; GFX10-NEXT: {{ $}}			; GFX10-NEXT: {{ $}}
	; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)			; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
	; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)			; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
	%0:vgpr(p1) = COPY $vgpr0_vgpr1			%0:vgpr(p1) = COPY $vgpr0_vgpr1
	%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 0)			%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 0)
	$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1			$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

	...			...

	################################################################################			################################################################################
	### Stress addressing modes			### Stress addressing modes
	▲ Show 20 Lines • Show All 967 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir

Show First 20 Lines • Show All 867 Lines • ▼ Show 20 Lines	bb.0:
; GFX6: liveins: $vgpr0_vgpr1		; GFX6: liveins: $vgpr0_vgpr1
; GFX6-NEXT: {{ $}}		; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
; GFX7-LABEL: name: load_global_v8s16		; GFX7-LABEL: name: load_global_v8s16
; GFX7: liveins: $vgpr0_vgpr1		; GFX7: liveins: $vgpr0_vgpr1
; GFX7-NEXT: {{ $}}		; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
; GFX7-FLAT-LABEL: name: load_global_v8s16		; GFX7-FLAT-LABEL: name: load_global_v8s16
; GFX7-FLAT: liveins: $vgpr0_vgpr1		; GFX7-FLAT: liveins: $vgpr0_vgpr1
; GFX7-FLAT-NEXT: {{ $}}		; GFX7-FLAT-NEXT: {{ $}}
; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
; GFX8-LABEL: name: load_global_v8s16		; GFX8-LABEL: name: load_global_v8s16
; GFX8: liveins: $vgpr0_vgpr1		; GFX8: liveins: $vgpr0_vgpr1
; GFX8-NEXT: {{ $}}		; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
; GFX9-LABEL: name: load_global_v8s16		; GFX9-LABEL: name: load_global_v8s16
; GFX9: liveins: $vgpr0_vgpr1		; GFX9: liveins: $vgpr0_vgpr1
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
; GFX10-LABEL: name: load_global_v8s16		; GFX10-LABEL: name: load_global_v8s16
; GFX10: liveins: $vgpr0_vgpr1		; GFX10: liveins: $vgpr0_vgpr1
; GFX10-NEXT: {{ $}}		; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)		; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1		%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1)		%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

################################################################################		################################################################################
### Stress addressing modes		### Stress addressing modes
▲ Show 20 Lines • Show All 1,287 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py		# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX7 %s		# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX7 %s
# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX9 %s		# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX9 %s
# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX9 %s		# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX10 %s

---		---

name: load_local_v4s32_align16		name: load_local_v4s32_align16
legalized: true		legalized: true
regBankSelected: true		regBankSelected: true
tracksRegLiveness: true		tracksRegLiveness: true

Show All 9 Lines	bb.0:
; GFX7-NEXT: [[DS_READ_B128_:%[0-9]+]]:vreg_128 = DS_READ_B128 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), addrspace 3)		; GFX7-NEXT: [[DS_READ_B128_:%[0-9]+]]:vreg_128 = DS_READ_B128 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_]]		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_]]
; GFX9-LABEL: name: load_local_v4s32_align16		; GFX9-LABEL: name: load_local_v4s32_align16
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), addrspace 3)		; GFX9-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
		; GFX10-LABEL: name: load_local_v4s32_align16
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 16, addrspace 3)		%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 16, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

---		---

Show All 14 Lines	bb.0:
; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)		; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
; GFX9-LABEL: name: load_local_v4s32_align_8		; GFX9-LABEL: name: load_local_v4s32_align_8
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)		; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
		; GFX10-LABEL: name: load_local_v4s32_align_8
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 8, addrspace 3)		%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

---		---

Show All 14 Lines	bb.0:
; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 50, 51, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)		; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 50, 51, 0, implicit $m0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
; GFX9-LABEL: name: load_local_v4s32_align_8_offset_160		; GFX9-LABEL: name: load_local_v4s32_align_8_offset_160
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)		; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
		; GFX10-LABEL: name: load_local_v4s32_align_8_offset_160
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 400		%1:vgpr(s32) = G_CONSTANT i32 400
%2:vgpr(p3) = G_PTR_ADD %0, %1		%2:vgpr(p3) = G_PTR_ADD %0, %1
%3:vgpr(<4 x s32>) = G_LOAD %2 :: (load (<4 x s32>), align 8, addrspace 3)		%3:vgpr(<4 x s32>) = G_LOAD %2 :: (load (<4 x s32>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3

...		...

Show All 20 Lines	bb.0:
; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320		; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec		; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec		; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)		; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
		; GFX10-LABEL: name: load_local_v4s32_align_8_offset_320
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
		; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
		; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 4000		%1:vgpr(s32) = G_CONSTANT i32 4000
%2:vgpr(p3) = G_PTR_ADD %0, %1		%2:vgpr(p3) = G_PTR_ADD %0, %1
%3:vgpr(<4 x s32>) = G_LOAD %2 :: (load (<4 x s32>), align 8, addrspace 3)		%3:vgpr(<4 x s32>) = G_LOAD %2 :: (load (<4 x s32>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3

...		...

Show All 16 Lines	bb.0:
; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)		; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
; GFX9-LABEL: name: load_local_v2s64		; GFX9-LABEL: name: load_local_v2s64
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)		; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
		; GFX10-LABEL: name: load_local_v2s64
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 8, addrspace 3)		%1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

---		---

Show All 14 Lines	bb.0:
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)		; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
; GFX9-LABEL: name: load_local_v2p1		; GFX9-LABEL: name: load_local_v2p1
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)		; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
		; GFX10-LABEL: name: load_local_v2p1
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
		; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 8, addrspace 3)		%1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

---		---

Show All 14 Lines	bb.0:
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)		; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
; GFX9-LABEL: name: load_local_s128		; GFX9-LABEL: name: load_local_s128
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)		; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
		; GFX10-LABEL: name: load_local_s128
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
		; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 8, addrspace 3)		%1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

---		---

name: load_local_v8s16		name: load_local_v8s16
legalized: true		legalized: true
regBankSelected: true		regBankSelected: true
tracksRegLiveness: true		tracksRegLiveness: true

body: \|		body: \|
bb.0:		bb.0:
liveins: $vgpr0		liveins: $vgpr0

; GFX7-LABEL: name: load_local_v8s16		; GFX7-LABEL: name: load_local_v8s16
; GFX7: liveins: $vgpr0		; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}		; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0		; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX7-NEXT: $m0 = S_MOV_B32 -1		; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)		; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
; GFX9-LABEL: name: load_local_v8s16		; GFX9-LABEL: name: load_local_v8s16
; GFX9: liveins: $vgpr0		; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0		; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)		; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)		; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
		; GFX10-LABEL: name: load_local_v8s16
		; GFX10: liveins: $vgpr0
		; GFX10-NEXT: {{ $}}
		; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
		; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0		%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 8, addrspace 3)		%1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1		$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

...		...

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir

	Show First 20 Lines • Show All 497 Lines • ▼ Show 20 Lines

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5			liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5

	; GFX7-LABEL: name: store_flat_v8s16			; GFX7-LABEL: name: store_flat_v8s16
	; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5			; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX7-NEXT: {{ $}}			; GFX7-NEXT: {{ $}}
	; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5			; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX7-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))			; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
	; GFX8-LABEL: name: store_flat_v8s16			; GFX8-LABEL: name: store_flat_v8s16
	; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5			; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX8-NEXT: {{ $}}			; GFX8-NEXT: {{ $}}
	; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5			; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX8-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))			; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
	; GFX9-LABEL: name: store_flat_v8s16			; GFX9-LABEL: name: store_flat_v8s16
	; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5			; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX9-NEXT: {{ $}}			; GFX9-NEXT: {{ $}}
	; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5			; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX9-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))			; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
	; GFX10-LABEL: name: store_flat_v8s16			; GFX10-LABEL: name: store_flat_v8s16
	; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5			; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX10-NEXT: {{ $}}			; GFX10-NEXT: {{ $}}
	; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1			; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
	; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5			; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
	; GFX10-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))			; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
	%0:vgpr(p1) = COPY $vgpr0_vgpr1			%0:vgpr(p1) = COPY $vgpr0_vgpr1
	%1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5			%1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
	G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 0)			G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 0)

	...			...

	---			---

	▲ Show 20 Lines • Show All 356 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir

Show First 20 Lines • Show All 543 Lines • ▼ Show 20 Lines	bb.0:
; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX6-NEXT: {{ $}}		; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX6-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX6-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
; GFX7-LABEL: name: store_global_v8s16		; GFX7-LABEL: name: store_global_v8s16
; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX7-NEXT: {{ $}}		; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX7-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
; GFX7-FLAT-LABEL: name: store_global_v8s16		; GFX7-FLAT-LABEL: name: store_global_v8s16
; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX7-FLAT-NEXT: {{ $}}		; GFX7-FLAT-NEXT: {{ $}}
; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
; GFX8-LABEL: name: store_global_v8s16		; GFX8-LABEL: name: store_global_v8s16
; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX8-NEXT: {{ $}}		; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX8-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
; GFX9-LABEL: name: store_global_v8s16		; GFX9-LABEL: name: store_global_v8s16
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: {{ $}}		; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
; GFX10-LABEL: name: store_global_v8s16		; GFX10-LABEL: name: store_global_v8s16
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5		; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: {{ $}}		; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1		; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)		; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
%0:vgpr(p1) = COPY $vgpr0_vgpr1		%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5		%1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 1)		G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 1)

...		...

---		---

▲ Show 20 Lines • Show All 452 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/add.v2i16.ll

Show First 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%add = add <2 x i16> %a, %b		%add = add <2 x i16> %a, %b
%ext = zext <2 x i16> %add to <2 x i32>		%ext = zext <2 x i16> %add to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out		store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FIXME: Need to handle non-uniform case for function below (load without gep).		; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:		; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
		; GFX9PLUS: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX9PLUS: global_load_dword [[A:v[0-9]+]]		; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]		; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]		; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]		; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX9PLUS-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]		; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9PLUS: buffer_store_dwordx4		; GFX9PLUS: buffer_store_dwordx4

; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}		; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; VI-DAG: flat_load_dword v[[A:[0-9]+]]		; VI-DAG: flat_load_dword v[[A:[0-9]+]]
; VI-DAG: flat_load_dword v[[B:[0-9]+]]		; VI-DAG: flat_load_dword v[[B:[0-9]+]]

; VI-DAG: v_add_u16_e32		; VI-DAG: v_add_u16_e32
; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1		; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll

Show All 17 Lines	bb:
%gep2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg1, i32 %lid		%gep2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg1, i32 %lid
store <2 x i32> %shuffle, <2 x i32> addrspace(1)* %gep2, align 8		store <2 x i32> %shuffle, <2 x i32> addrspace(1)* %gep2, align 8
ret void		ret void
}		}

; GCN-LABEL: {{^}}test_vector_creation:		; GCN-LABEL: {{^}}test_vector_creation:
; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}],		; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}],
; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}		; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}}
; GCN: global_store_dwordx4 v[{{[0-9][02468]:[0-9][13579]}}], v[{{[0-9][02468]:[0-9][13579]}}]		; GCN: global_store_dwordx4 v[{{[0-9][02468]:[0-9][13579]}}], v[{{[0-9][02468]:[0-9][13579]}}]
define amdgpu_kernel void @test_vector_creation() {		define amdgpu_kernel void @test_vector_creation() {
entry:		entry:
%tmp231 = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 2		%tmp231 = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 2
%vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>		%vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%vecinit467 = shufflevector <8 x i16> undef, <8 x i16> %vext466, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>		%vecinit467 = shufflevector <8 x i16> undef, <8 x i16> %vext466, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
%vecinit471 = shufflevector <8 x i16> %vecinit467, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>		%vecinit471 = shufflevector <8 x i16> %vecinit467, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
store <8 x i16> %vecinit471, <8 x i16> addrspace(1)* undef, align 16		store <8 x i16> %vecinit471, <8 x i16> addrspace(1)* undef, align 16
ret void		ret void
}		}

declare i32 @llvm.amdgcn.workitem.id.x()		declare i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s \| FileCheck -check-prefix=SI %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX9 %s

				define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
				; SI-LABEL: extract_4xi16:
				; SI: ; %bb.0:
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: s_cbranch_scc0 .LBB0_2
				; SI-NEXT: ; %bb.1: ; %F
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
				; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
				; SI-NEXT: v_or_b32_e32 v2, v6, v2
				; SI-NEXT: v_or_b32_e32 v3, v4, v3
				; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
				; SI-NEXT: s_mov_b64 vcc, exec
				; SI-NEXT: s_cbranch_execz .LBB0_3
				; SI-NEXT: s_branch .LBB0_4
				; SI-NEXT: .LBB0_2:
				; SI-NEXT: ; implicit-def: $vgpr3
				; SI-NEXT: ; implicit-def: $vgpr4
				; SI-NEXT: ; implicit-def: $vgpr2
				; SI-NEXT: s_mov_b64 vcc, 0
				; SI-NEXT: .LBB0_3: ; %T
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
				; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
				; SI-NEXT: v_or_b32_e32 v2, v4, v0
				; SI-NEXT: v_or_b32_e32 v3, v3, v1
				; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
				; SI-NEXT: .LBB0_4: ; %exit
				; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
				; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
				; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
				; SI-NEXT: s_mov_b32 s4, 0xffff
				; SI-NEXT: v_mov_b32_e32 v3, 0x8000
				; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
				; SI-NEXT: v_bfrev_b32_e32 v5, 1
				; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000
				; SI-NEXT: v_mov_b32_e32 v7, s4
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
				; SI-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
				; SI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
				; SI-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
				; SI-NEXT: v_or_b32_e32 v0, v0, v1
				; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
				; SI-NEXT: v_and_b32_e32 v2, s4, v2
				; SI-NEXT: v_or_b32_e32 v2, v2, v3
				; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
				; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: extract_4xi16:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: s_cbranch_scc0 .LBB0_2
				; GFX9-NEXT: ; %bb.1: ; %F
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: s_cbranch_execz .LBB0_3
				; GFX9-NEXT: s_branch .LBB0_4
				; GFX9-NEXT: .LBB0_2:
				; GFX9-NEXT: s_mov_b32 s8, 0
				; GFX9-NEXT: s_mov_b32 s9, s8
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s11, s8
				; GFX9-NEXT: v_mov_b32_e32 v2, s8
				; GFX9-NEXT: v_mov_b32_e32 v3, s9
				; GFX9-NEXT: v_mov_b32_e32 v4, s10
				; GFX9-NEXT: v_mov_b32_e32 v5, s11
				; GFX9-NEXT: .LBB0_3: ; %T
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: .LBB0_4: ; %exit
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
				; GFX9-NEXT: s_movk_i32 s4, 0x8000
				; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_or_b32_e32 v3, s4, v0
				; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
				; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
				; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
				; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
				; GFX9-NEXT: v_and_b32_e32 v2, v4, v3
				; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				br i1 undef, label %T, label %F

				T:
				%t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
				br label %exit

				F:
				%f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
				br label %exit

				exit:
				%m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
				%v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
				%b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
				%r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
				ret <4 x i16> %r2
				}

				; Codegen test for taking the upper four elements (indices 4-7) of an
				; <8 x i16> value. The value arrives through a phi that merges two
				; volatile global loads, one from %p0 (block T) and one from %p1
				; (block F); the branch condition is undef. The extracted <4 x i16> is
				; compared against -1 and used to select between -32768 and -1 per
				; lane, and that result is returned. The assertions below cover the
				; SI and GFX9 check prefixes.
				define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
				; SI-LABEL: extract_4xi16_2:
				; SI: ; %bb.0:
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: s_cbranch_scc0 .LBB1_2
				; SI-NEXT: ; %bb.1: ; %F
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
				; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
				; SI-NEXT: v_or_b32_e32 v2, v6, v2
				; SI-NEXT: v_or_b32_e32 v3, v4, v3
				; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
				; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
				; SI-NEXT: s_mov_b64 vcc, exec
				; SI-NEXT: s_cbranch_execz .LBB1_3
				; SI-NEXT: s_branch .LBB1_4
				; SI-NEXT: .LBB1_2:
				; SI-NEXT: ; implicit-def: $vgpr3
				; SI-NEXT: ; implicit-def: $vgpr5
				; SI-NEXT: ; implicit-def: $vgpr2
				; SI-NEXT: ; implicit-def: $vgpr4
				; SI-NEXT: s_mov_b64 vcc, 0
				; SI-NEXT: .LBB1_3: ; %T
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
				; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
				; SI-NEXT: v_or_b32_e32 v2, v4, v0
				; SI-NEXT: v_or_b32_e32 v3, v3, v1
				; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
				; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
				; SI-NEXT: .LBB1_4: ; %exit
				; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
				; SI-NEXT: v_bfe_i32 v1, v5, 0, 16
				; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
				; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
				; SI-NEXT: v_mov_b32_e32 v4, 0xffff
				; SI-NEXT: v_mov_b32_e32 v5, 0x8000
				; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
				; SI-NEXT: v_bfrev_b32_e32 v7, 1
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
				; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
				; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
				; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
				; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
				; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
				; SI-NEXT: v_or_b32_e32 v0, v0, v1
				; SI-NEXT: v_or_b32_e32 v2, v2, v3
				; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
				; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: extract_4xi16_2:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
				; GFX9-NEXT: ; %bb.1: ; %F
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: s_cbranch_execz .LBB1_3
				; GFX9-NEXT: s_branch .LBB1_4
				; GFX9-NEXT: .LBB1_2:
				; GFX9-NEXT: s_mov_b32 s8, 0
				; GFX9-NEXT: s_mov_b32 s9, s8
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s11, s8
				; GFX9-NEXT: v_mov_b32_e32 v2, s8
				; GFX9-NEXT: v_mov_b32_e32 v3, s9
				; GFX9-NEXT: v_mov_b32_e32 v4, s10
				; GFX9-NEXT: v_mov_b32_e32 v5, s11
				; GFX9-NEXT: .LBB1_3: ; %T
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: .LBB1_4: ; %exit
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
				; GFX9-NEXT: s_movk_i32 s4, 0x8000
				; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_or_b32_e32 v2, s4, v0
				; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
				; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
				; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
				; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
				; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
				; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				br i1 undef, label %T, label %F

				T:
				%t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
				br label %exit

				F:
				%f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
				br label %exit

				exit:
				%m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
				; Mask 4,5,6,7 selects the contiguous upper half of %m.
				%v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
				%b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
				%r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
				ret <4 x i16> %r2
				}

				; Codegen test for building a <4 x half> from the low elements of an
				; <8 x half> value with shuffle mask 0,1,2,2 (element 2 is repeated in
				; the last lane). The source value arrives through a phi that merges
				; two volatile global loads, one from %p0 (block T) and one from %p1
				; (block F); the branch condition is undef. Each lane is compared
				; (unordered greater-than) with 0xH3800, which the GFX9 assertions
				; below print as 0.5, and the result selects between 0xH3900 and
				; 0xH3D00. The assertions cover the SI and GFX9 check prefixes.
				define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
				; SI-LABEL: extract_4xf16:
				; SI: ; %bb.0:
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: s_cbranch_scc0 .LBB2_2
				; SI-NEXT: ; %bb.1: ; %F
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
				; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
				; SI-NEXT: v_or_b32_e32 v2, v6, v2
				; SI-NEXT: v_or_b32_e32 v4, v4, v3
				; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
				; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
				; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
				; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
				; SI-NEXT: s_mov_b64 vcc, exec
				; SI-NEXT: s_cbranch_execz .LBB2_3
				; SI-NEXT: s_branch .LBB2_4
				; SI-NEXT: .LBB2_2:
				; SI-NEXT: ; implicit-def: $vgpr3
				; SI-NEXT: ; implicit-def: $vgpr4
				; SI-NEXT: ; implicit-def: $vgpr2
				; SI-NEXT: s_mov_b64 vcc, 0
				; SI-NEXT: .LBB2_3: ; %T
				; SI-NEXT: s_mov_b32 s6, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_mov_b32 s4, s6
				; SI-NEXT: s_mov_b32 s5, s6
				; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
				; SI-NEXT: s_waitcnt vmcnt(0)
				; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
				; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
				; SI-NEXT: v_or_b32_e32 v0, v4, v0
				; SI-NEXT: v_or_b32_e32 v1, v2, v1
				; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
				; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
				; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
				; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
				; SI-NEXT: .LBB2_4: ; %exit
				; SI-NEXT: v_cvt_f16_f32_e32 v0, v3
				; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
				; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
				; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
				; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
				; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
				; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
				; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
				; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0
				; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
				; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
				; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
				; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2
				; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
				; SI-NEXT: v_mov_b32_e32 v3, v2
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: extract_4xf16:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
				; GFX9-NEXT: ; %bb.1: ; %F
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: s_cbranch_execz .LBB2_3
				; GFX9-NEXT: s_branch .LBB2_4
				; GFX9-NEXT: .LBB2_2:
				; GFX9-NEXT: s_mov_b32 s8, 0
				; GFX9-NEXT: s_mov_b32 s9, s8
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s11, s8
				; GFX9-NEXT: v_mov_b32_e32 v2, s8
				; GFX9-NEXT: v_mov_b32_e32 v3, s9
				; GFX9-NEXT: v_mov_b32_e32 v4, s10
				; GFX9-NEXT: v_mov_b32_e32 v5, s11
				; GFX9-NEXT: .LBB2_3: ; %T
				; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: .LBB2_4: ; %exit
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
				; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
				; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800
				; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900
				; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00
				; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0
				; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc
				; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc
				; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
				; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
				; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
				; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
				; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
				; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				br i1 undef, label %T, label %F

				T:
				%t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
				br label %exit

				F:
				%f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
				br label %exit

				exit:
				%m = phi <8 x half> [ %t, %T ], [ %f, %F ]
				; Mask 0,1,2,2 takes elements 0-2 and duplicates element 2 into lane 3.
				%v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
				%b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
				%r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
				ret <4 x half> %r2
				}

llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Show First 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
%load = load <16 x half>, <16 x half> addrspace(4)* %ptr		%load = load <16 x half>, <16 x half> addrspace(4)* %ptr
%elt2 = extractelement <16 x half> %load, i32 2		%elt2 = extractelement <16 x half> %load, i32 2
%elt3 = extractelement <16 x half> %load, i32 3		%elt3 = extractelement <16 x half> %load, i32 3
store volatile half %elt2, half addrspace(1)* undef, align 2		store volatile half %elt2, half addrspace(1)* undef, align 2
store volatile half %elt3, half addrspace(1)* undef, align 2		store volatile half %elt3, half addrspace(1)* undef, align 2
ret void		ret void
}		}

		; Extracts one half from an <8 x half> vector at a run-time index.
		; Each work item loads the vector at %in[workitem id], picks element %n
		; (a kernel argument, so the index is not a compile-time constant) and
		; stores it to %out[workitem id]. The assertion below expects the
		; selection to be emitted as exactly seven v_cndmask_b32_e32
		; instructions.
		; NOTE(review): the "sgpr" in the name presumably refers to %n being a
		; uniform kernel argument held in a scalar register; confirm.
		; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr:
		; GCN-COUNT-7: v_cndmask_b32_e32
		define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(half addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %n) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
		%vec.extract = extractelement <8 x half> %vec, i32 %n
		store half %vec.extract, half addrspace(1)* %out.gep
		ret void
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

Show First 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr		%load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%elt2 = extractelement <16 x i16> %load, i32 2		%elt2 = extractelement <16 x i16> %load, i32 2
%elt3 = extractelement <16 x i16> %load, i32 3		%elt3 = extractelement <16 x i16> %load, i32 3
store volatile i16 %elt2, i16 addrspace(1)* undef, align 2		store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
store volatile i16 %elt3, i16 addrspace(1)* undef, align 2		store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
ret void		ret void
}		}

		; Extracts element 2 (constant index) from an <8 x i16> vector loaded
		; at %in[workitem id] and stores it to %out[workitem id].
		; The assertions below expect a single dword load rather than a load of
		; the whole vector; on the SI and GFX9 prefixes the load carries byte
		; offset 4 (element 2 times 2 bytes), and the loaded register is fed
		; directly to a short store.
		; GCN-LABEL: {{^}}v_extractelement_v8i16_2:
		; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4
		; SI: buffer_store_short [[RES]]
		; VI: flat_load_dword [[RES:v[0-9]+]]
		; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
		; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
		; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
		define amdgpu_kernel void @v_extractelement_v8i16_2(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
		%vec.extract = extractelement <8 x i16> %vec, i32 2
		store i16 %vec.extract, i16 addrspace(1)* %out.gep
		ret void
		}

		; Extracts element 6 (constant index) from an <8 x i16> vector loaded
		; at %in[workitem id] and stores it to %out[workitem id].
		; The assertions below expect a single dword load rather than a load of
		; the whole vector; on the SI and GFX9 prefixes the load carries byte
		; offset 12 (element 6 times 2 bytes), and the loaded register is fed
		; directly to a short store.
		; GCN-LABEL: {{^}}v_extractelement_v8i16_6:
		; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12
		; SI: buffer_store_short [[RES]]
		; VI: flat_load_dword [[RES:v[0-9]+]]
		; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
		; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
		; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
		define amdgpu_kernel void @v_extractelement_v8i16_6(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
		%vec.extract = extractelement <8 x i16> %vec, i32 6
		store i16 %vec.extract, i16 addrspace(1)* %out.gep
		ret void
		}

		; Extracts one i16 from an <8 x i16> vector at a run-time index.
		; Each work item loads the vector at %in[workitem id], picks element %n
		; (a kernel argument, so the index is not a compile-time constant) and
		; stores it to %out[workitem id]. The assertion below expects the
		; selection to be emitted as exactly seven v_cndmask_b32_e32
		; instructions.
		; NOTE(review): the "sgpr" in the name presumably refers to %n being a
		; uniform kernel argument held in a scalar register; confirm.
		; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr:
		; GCN-COUNT-7: v_cndmask_b32_e32
		define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %n) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
		%vec.extract = extractelement <8 x i16> %vec, i32 %n
		store i16 %vec.extract, i16 addrspace(1)* %out.gep
		ret void
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/function-returns.ll

	Show First 20 Lines • Show All 382 Lines • ▼ Show 20 Lines
	define <4 x half> @v4f16_func_void() #0 {			define <4 x half> @v4f16_func_void() #0 {
	%val = load <4 x half>, <4 x half> addrspace(1)* undef			%val = load <4 x half>, <4 x half> addrspace(1)* undef
	ret <4 x half> %val			ret <4 x half> %val
	}			}

	; FIXME: Mixing buffer and global			; FIXME: Mixing buffer and global
	; FIXME: Should not scalarize			; FIXME: Should not scalarize
	; GCN-LABEL: {{^}}v5i16_func_void:			; GCN-LABEL: {{^}}v5i16_func_void:
	; GFX9: buffer_load_dwordx2 v[0:1]			; GFX9: buffer_load_dwordx4 v[0:3]
	; GFX9-NEXT: s_nop 0
	; GFX9-NEXT: global_load_short_d16 v2
	; GFX9-NEXT: s_waitcnt			; GFX9-NEXT: s_waitcnt
	; GFX9-NEXT: s_setpc_b64			; GFX9-NEXT: s_setpc_b64
	define <5 x i16> @v5i16_func_void() #0 {			define <5 x i16> @v5i16_func_void() #0 {
	%ptr = load volatile <5 x i16> addrspace(1), <5 x i16> addrspace(1) addrspace(4)* undef			%ptr = load volatile <5 x i16> addrspace(1), <5 x i16> addrspace(1) addrspace(4)* undef
	%val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr			%val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
	ret <5 x i16> %val			ret <5 x i16> %val
	}			}

	▲ Show 20 Lines • Show All 261 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/idot8s.ll

	Show First 20 Lines • Show All 2,283 Lines • ▼ Show 20 Lines
	; GFX8-NEXT: v_mov_b32_e32 v1, s1			; GFX8-NEXT: v_mov_b32_e32 v1, s1
	; GFX8-NEXT: flat_load_ushort v4, v[0:1]			; GFX8-NEXT: flat_load_ushort v4, v[0:1]
	; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX8-NEXT: s_mov_b32 s10, -1			; GFX8-NEXT: s_mov_b32 s10, -1
	; GFX8-NEXT: s_mov_b32 s11, 0xe80000			; GFX8-NEXT: s_mov_b32 s11, 0xe80000
	; GFX8-NEXT: s_add_u32 s8, s8, s3			; GFX8-NEXT: s_add_u32 s8, s8, s3
	; GFX8-NEXT: s_addc_u32 s9, s9, 0			; GFX8-NEXT: s_addc_u32 s9, s9, 0
	; GFX8-NEXT: s_waitcnt vmcnt(2)			; GFX8-NEXT: s_waitcnt vmcnt(2)
	; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
	; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3			; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
	; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3
	; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3			; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3
	; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3			; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3
	; GFX8-NEXT: s_waitcnt vmcnt(1)			; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3
	; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2
	; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2
	; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
	; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
	; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2
	; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3			; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3
	; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3			; GFX8-NEXT: s_waitcnt vmcnt(1)
				; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2
				; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
				; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2
	; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1			; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
				; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
				; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
	; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2			; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2
	; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6			; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
	; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3			; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
	; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11			; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18
	; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2			; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
	; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7			; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
	; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12			; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17
	; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6			; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
	; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11			; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
	; GFX8-NEXT: s_waitcnt vmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0)
	; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4			; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
	; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
	; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
	; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
	; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
	; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
	; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
	; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
	; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2
	; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
	; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
	; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
	; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
	; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2
	; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
	; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
	; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2
	; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10			; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
				; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16
				; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
				; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
				; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2
				; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
	; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16			; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
				; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2
				; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
				; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
	; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15			; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
	; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18			; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
	; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2			; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2
	; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10			; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
	; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15			; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
	; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2			; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2
	; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2			; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
				; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
				; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
				; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
				; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
				; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
				; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
				; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
				; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
	; GFX8-NEXT: flat_store_short v[0:1], v2			; GFX8-NEXT: flat_store_short v[0:1], v2
	; GFX8-NEXT: s_endpgm			; GFX8-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: idot8_acc16_vecMul:			; GFX9-LABEL: idot8_acc16_vecMul:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-NEXT: s_mov_b32 s10, -1			; GFX9-NEXT: s_mov_b32 s10, -1
	; GFX9-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-NEXT: s_add_u32 s8, s8, s3			; GFX9-NEXT: s_add_u32 s8, s8, s3
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX9-NEXT: v_mov_b32_e32 v4, 12
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-NEXT: v_mov_b32_e32 v0, 0			; GFX9-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]			; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
	; GFX9-NEXT: s_addc_u32 s9, s9, 0			; GFX9-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-NEXT: s_waitcnt vmcnt(2)			; GFX9-NEXT: s_waitcnt vmcnt(2)
	; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1			; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1
	; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1
	; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
	; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
	; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1
	; GFX9-NEXT: v_bfe_u32 v10, v1, 8, 4			; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_bfe_u32 v11, v1, 4, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1
	; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX9-NEXT: s_waitcnt vmcnt(1)			; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2			; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2
	; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4			; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
	; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4			; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2
	; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2
	; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2
	; GFX9-NEXT: v_bfe_u32 v17, v2, 8, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2
	; GFX9-NEXT: v_bfe_u32 v18, v2, 4, 4			; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_and_b32_e32 v2, 15, v2			; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2
	; GFX9-NEXT: v_and_b32_e32 v1, v4, v1			; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
	; GFX9-NEXT: v_and_b32_e32 v2, v4, v2			; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5
	; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1			; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6
	; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2			; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7
	; GFX9-NEXT: v_and_b32_e32 v10, v4, v10			; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v8
	; GFX9-NEXT: v_and_b32_e32 v6, v4, v6			; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9
	; GFX9-NEXT: v_and_b32_e32 v17, v4, v17			; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10
	; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]			; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11
	; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]			; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12
	; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10			; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13
	; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v6			; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
	; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17			; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
	; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
	; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
	; GFX9-NEXT: v_and_b32_e32 v8, v4, v8			; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14
	; GFX9-NEXT: v_and_b32_e32 v15, v4, v15			; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15
	; GFX9-NEXT: v_and_b32_e32 v4, v4, v13			; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7
	; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]			; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12
	; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]			; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5
	; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2			; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
	; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8			; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
	; GFX9-NEXT: v_lshl_or_b32 v8, v14, 16, v15			; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
	; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4			; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
	; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]			; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16
	; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
				; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18
				; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9
				; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_add_u16_e32 v2, v1, v3			; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
	; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
	; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
	; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]			; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
	; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v6			; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17
				; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
				; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
				; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1
				; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16
				; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
				; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
				; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
				; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
				; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1			; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
	; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
	; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v4
	; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v8
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v5
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v4
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: global_store_short v0, v1, s[2:3]			; GFX9-NEXT: global_store_short v0, v1, s[2:3]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX9-DL-LABEL: idot8_acc16_vecMul:			; GFX9-DL-LABEL: idot8_acc16_vecMul:
	; GFX9-DL: ; %bb.0: ; %entry			; GFX9-DL: ; %bb.0: ; %entry
	; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-DL-NEXT: s_mov_b32 s10, -1			; GFX9-DL-NEXT: s_mov_b32 s10, -1
	; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-DL-NEXT: s_add_u32 s8, s8, s3			; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
	; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]			; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
	; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-DL-NEXT: s_waitcnt vmcnt(2)			; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1
	; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1
	; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 20, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
	; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
	; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1
	; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 8, 4			; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 4, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1
	; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX9-DL-NEXT: s_waitcnt vmcnt(1)			; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2
	; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4			; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
	; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 20, 4			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2
	; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2
	; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2
	; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 8, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2
	; GFX9-DL-NEXT: v_bfe_u32 v18, v2, 4, 4			; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2
	; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1			; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
	; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5
	; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6
	; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7
	; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v8
	; GFX9-DL-NEXT: v_and_b32_e32 v6, v4, v6			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9
	; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12
	; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13
	; GFX9-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
	; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
	; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14
	; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15
	; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v13			; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
	; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
	; GFX9-DL-NEXT: v_lshl_or_b32 v8, v14, 16, v15			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
	; GFX9-DL-NEXT: v_lshl_or_b32 v4, v12, 16, v4			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
				; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18
				; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
				; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14
	; GFX9-DL-NEXT: s_waitcnt vmcnt(0)			; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3			; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
	; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v6			; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
				; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2
				; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1
				; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16
				; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
				; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
				; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1			; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
	; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v8
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]			; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
	; GFX9-DL-NEXT: s_endpgm			; GFX9-DL-NEXT: s_endpgm
	;			;
	; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:			; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
	; GFX10-DL-XNACK: ; %bb.0: ; %entry			; GFX10-DL-XNACK: ; %bb.0: ; %entry
	; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1			; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
	; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000			; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
	; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3			; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
	; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0			; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-DL-XNACK-NEXT: s_clause 0x1			; GFX10-DL-XNACK-NEXT: s_clause 0x1
	; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]			; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]			; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0			; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]			; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
	; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)			; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v1
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 20, 4
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 16, 4
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 12, 4
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 8, 4
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v11, v1, 4, 4
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)			; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v2
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v2			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v14, v2, 24, 4			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v2
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13			; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 20, 4			; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 16, 4			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 12, 4			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v2
	; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 8, 4			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
				; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13
				; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v14
				; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v15
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8			; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2			; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v2
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
				; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v17
				; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8			; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v18, 16, v2			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v17
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
	; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)			; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v6			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
	; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v14			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v16
	; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10			; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, v4, v14
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3			; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6
	; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2			; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v18, 12, v18
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
	; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
				; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5
				; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10
				; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6
				; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
				; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v18
				; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2
				; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1
	; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5			; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v6, 16, v2
	; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]			; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4
	; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3			; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5
	; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2			; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
	; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3			; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
	; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]			; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
	; GFX10-DL-XNACK-NEXT: s_endpgm			; GFX10-DL-XNACK-NEXT: s_endpgm
	;			;
	; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:			; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
	; GFX10-DL-NOXNACK: ; %bb.0: ; %entry			; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
	; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0			; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
	; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1			; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
	; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000			; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
	; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3			; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
	; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0			; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-DL-NOXNACK-NEXT: s_clause 0x1			; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
	; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]			; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]			; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
	; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]			; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
	; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)			; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 20, 4
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 16, 4
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 12, 4
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 8, 4
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v11, v1, 4, 4
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)			; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v0
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v0			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v14, v0, 24, 4			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v0
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13			; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 20, 4			; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 16, 4			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 12, 4			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v0
	; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 8, 4			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
				; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13
				; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v14
				; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v15
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8			; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0			; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v0
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
				; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v17
				; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8			; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v18, 16, v0			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v17
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
	; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)			; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v6			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
	; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v14			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v16
	; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10			; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, v4, v14
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3			; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6
	; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0			; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v18, 12, v18
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
	; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
				; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5
				; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10
				; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6
				; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
				; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v18
				; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0
				; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1
	; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5			; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v6, 16, v0
	; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]			; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4
	; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3			; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5
	; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1			; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
	; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3			; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
	; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]			; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
	; GFX10-DL-NOXNACK-NEXT: s_endpgm			; GFX10-DL-NOXNACK-NEXT: s_endpgm
	; GFX10-DL-LABEL: idot8_acc16_vecMul:			; GFX10-DL-LABEL: idot8_acc16_vecMul:
	; GFX10-DL: ; %bb.0: ; %entry			; GFX10-DL: ; %bb.0: ; %entry
	; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34			; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
	; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0			; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
	▲ Show 20 Lines • Show All 830 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/idot8u.ll

	Show First 20 Lines • Show All 2,181 Lines • ▼ Show 20 Lines
	; GFX8-NEXT: v_mov_b32_e32 v0, s0			; GFX8-NEXT: v_mov_b32_e32 v0, s0
	; GFX8-NEXT: v_mov_b32_e32 v1, s1			; GFX8-NEXT: v_mov_b32_e32 v1, s1
	; GFX8-NEXT: flat_load_ushort v4, v[0:1]			; GFX8-NEXT: flat_load_ushort v4, v[0:1]
	; GFX8-NEXT: s_mov_b32 s10, -1			; GFX8-NEXT: s_mov_b32 s10, -1
	; GFX8-NEXT: s_mov_b32 s11, 0xe80000			; GFX8-NEXT: s_mov_b32 s11, 0xe80000
	; GFX8-NEXT: s_add_u32 s8, s8, s3			; GFX8-NEXT: s_add_u32 s8, s8, s3
	; GFX8-NEXT: s_addc_u32 s9, s9, 0			; GFX8-NEXT: s_addc_u32 s9, s9, 0
	; GFX8-NEXT: s_waitcnt vmcnt(2)			; GFX8-NEXT: s_waitcnt vmcnt(2)
	; GFX8-NEXT: v_and_b32_e32 v5, 15, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
	; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4			; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
	; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4			; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
	; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4			; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
	; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4			; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
	; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4			; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
				; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
				; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
	; GFX8-NEXT: s_waitcnt vmcnt(1)			; GFX8-NEXT: s_waitcnt vmcnt(1)
	; GFX8-NEXT: v_and_b32_e32 v12, 15, v2			; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
	; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4			; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
	; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4			; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
				; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
				; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
				; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
				; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
				; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
	; GFX8-NEXT: s_waitcnt vmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0)
	; GFX8-NEXT: v_mad_u16 v4, v5, v12, v4
	; GFX8-NEXT: v_mad_u16 v4, v6, v13, v4
	; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4
	; GFX8-NEXT: v_mad_u16 v4, v7, v14, v4
	; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4
	; GFX8-NEXT: v_mad_u16 v4, v8, v15, v4
	; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4
	; GFX8-NEXT: v_mad_u16 v4, v9, v16, v4
	; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4
	; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4
	; GFX8-NEXT: v_mad_u16 v4, v10, v17, v4
	; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
	; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2
	; GFX8-NEXT: v_mad_u16 v4, v11, v18, v4
	; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4			; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
				; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
				; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
				; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
				; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
				; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
				; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
				; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
	; GFX8-NEXT: flat_store_short v[0:1], v2			; GFX8-NEXT: flat_store_short v[0:1], v2
	; GFX8-NEXT: s_endpgm			; GFX8-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: udot8_acc16_vecMul:			; GFX9-LABEL: udot8_acc16_vecMul:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-NEXT: s_mov_b32 s10, -1			; GFX9-NEXT: s_mov_b32 s10, -1
	; GFX9-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-NEXT: s_add_u32 s8, s8, s3			; GFX9-NEXT: s_add_u32 s8, s8, s3
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-NEXT: v_mov_b32_e32 v0, 0			; GFX9-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]			; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
	; GFX9-NEXT: s_addc_u32 s9, s9, 0			; GFX9-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-NEXT: s_waitcnt vmcnt(2)			; GFX9-NEXT: s_waitcnt vmcnt(2)
	; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4			; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
	; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4			; GFX9-NEXT: v_and_b32_e32 v6, 15, v1
	; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4			; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
	; GFX9-NEXT: v_and_b32_e32 v11, 15, v1			; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
				; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4
				; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4
				; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1
				; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
	; GFX9-NEXT: s_waitcnt vmcnt(1)			; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4			; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4
	; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4			; GFX9-NEXT: v_and_b32_e32 v13, 15, v2
	; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4			; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
	; GFX9-NEXT: v_and_b32_e32 v18, 15, v2			; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1			; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
	; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4			; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4
	; GFX9-NEXT: v_bfe_u32 v10, v1, 12, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2
	; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4			; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v13, 28, v2			; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
	; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4			; GFX9-NEXT: v_and_b32_e32 v1, v4, v1
	; GFX9-NEXT: v_bfe_u32 v17, v2, 12, 4			; GFX9-NEXT: v_and_b32_e32 v17, v4, v17
	; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4			; GFX9-NEXT: v_and_b32_e32 v10, v4, v10
	; GFX9-NEXT: v_and_b32_e32 v12, v4, v12			; GFX9-NEXT: v_and_b32_e32 v15, v4, v15
	; GFX9-NEXT: v_and_b32_e32 v5, v4, v5			; GFX9-NEXT: v_and_b32_e32 v8, v4, v8
	; GFX9-NEXT: v_and_b32_e32 v14, v4, v14			; GFX9-NEXT: v_and_b32_e32 v13, v4, v13
	; GFX9-NEXT: v_and_b32_e32 v7, v4, v7			; GFX9-NEXT: v_and_b32_e32 v4, v4, v6
	; GFX9-NEXT: v_and_b32_e32 v16, v4, v16			; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8
	; GFX9-NEXT: v_and_b32_e32 v9, v4, v9			; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13
	; GFX9-NEXT: v_and_b32_e32 v18, v4, v18			; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4
	; GFX9-NEXT: v_and_b32_e32 v4, v4, v11			; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
	; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v18			; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10
	; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4			; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15
	; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX9-NEXT: v_lshl_or_b32 v11, v13, 16, v12
	; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5
	; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v14
	; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7
	; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v16
	; GFX9-NEXT: v_lshl_or_b32 v9, v10, 16, v9
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_add_u16_e32 v2, v1, v3			; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v11			; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10
	; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6			; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v8			; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2
				; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
				; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17
				; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
				; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
				; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6
				; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
				; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1			; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v5
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v4
	; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: global_store_short v0, v1, s[2:3]			; GFX9-NEXT: global_store_short v0, v1, s[2:3]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX9-DL-LABEL: udot8_acc16_vecMul:			; GFX9-DL-LABEL: udot8_acc16_vecMul:
	; GFX9-DL: ; %bb.0: ; %entry			; GFX9-DL: ; %bb.0: ; %entry
	; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-DL-NEXT: s_mov_b32 s10, -1			; GFX9-DL-NEXT: s_mov_b32 s10, -1
	; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-DL-NEXT: s_add_u32 s8, s8, s3			; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff			; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]			; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
	; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-DL-NEXT: s_waitcnt vmcnt(2)			; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4			; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
	; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4			; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1
	; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4			; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
	; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v1			; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
				; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4
				; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4
				; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1
				; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
	; GFX9-DL-NEXT: s_waitcnt vmcnt(1)			; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4			; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4
	; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4			; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2
	; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4			; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4
	; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2			; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1			; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4
	; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4			; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4
	; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 12, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2
	; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4			; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v2			; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2
	; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4			; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1
	; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 12, 4			; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17
	; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4			; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10
	; GFX9-DL-NEXT: v_and_b32_e32 v12, v4, v12			; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15
	; GFX9-DL-NEXT: v_and_b32_e32 v5, v4, v5			; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8
	; GFX9-DL-NEXT: v_and_b32_e32 v14, v4, v14			; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13
	; GFX9-DL-NEXT: v_and_b32_e32 v7, v4, v7			; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6
	; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v16			; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
	; GFX9-DL-NEXT: v_and_b32_e32 v9, v4, v9			; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13
	; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v18			; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4
	; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v11			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
	; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v18			; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10
	; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4			; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX9-DL-NEXT: v_lshl_or_b32 v11, v13, 16, v12
	; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
	; GFX9-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v14
	; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7
	; GFX9-DL-NEXT: v_lshl_or_b32 v8, v17, 16, v16
	; GFX9-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v9
	; GFX9-DL-NEXT: s_waitcnt vmcnt(0)			; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3			; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v11			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6			; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v8			; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2
				; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1
				; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17
				; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6
				; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
				; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
				; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1			; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4
	; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]			; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
	; GFX9-DL-NEXT: s_endpgm			; GFX9-DL-NEXT: s_endpgm
	;			;
	; GFX10-DL-LABEL: udot8_acc16_vecMul:			; GFX10-DL-LABEL: udot8_acc16_vecMul:
	; GFX10-DL: ; %bb.0: ; %entry			; GFX10-DL: ; %bb.0: ; %entry
	; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	Show All 9 Lines
	; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]			; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
	; GFX10-DL-NEXT: s_waitcnt vmcnt(2)			; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1			; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1
	; GFX10-DL-NEXT: s_waitcnt vmcnt(1)			; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2			; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
	; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
	; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
	; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4			; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
	; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7			; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
	; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6			; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4			; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
	; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4			; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
	; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12			; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9
	; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7			; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7
	; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6			; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
	; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
	; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4			; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
	; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13			; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
	; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4			; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
	; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6			; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
	; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
	; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12			; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
	; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13			; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1			; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6			; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
				; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5
	; GFX10-DL-NEXT: s_waitcnt vmcnt(0)			; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3			; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
	; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4			; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
	; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4			; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10
	; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
	; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12
	; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4
	; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
	; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
				; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
				; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4
				; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
				; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12
				; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8
				; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
	; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10			; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9
	; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5			; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1
	; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10
	; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9			; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8
	; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4			; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
	; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
	; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5			; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
	; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2			; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
	; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3			; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
	; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]			; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
	; GFX10-DL-NEXT: s_endpgm			; GFX10-DL-NEXT: s_endpgm
	<8 x i4> addrspace(1)* %src2,			<8 x i4> addrspace(1)* %src2,
	i16 addrspace(1)* nocapture %dst) {			i16 addrspace(1)* nocapture %dst) {
	entry:			entry:
	%idx = call i32 @llvm.amdgcn.workitem.id.x()			%idx = call i32 @llvm.amdgcn.workitem.id.x()
	%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx			%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
	▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines
	; GFX8-NEXT: v_mov_b32_e32 v0, s0			; GFX8-NEXT: v_mov_b32_e32 v0, s0
	; GFX8-NEXT: v_mov_b32_e32 v1, s1			; GFX8-NEXT: v_mov_b32_e32 v1, s1
	; GFX8-NEXT: flat_load_ubyte v4, v[0:1]			; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
	; GFX8-NEXT: s_mov_b32 s10, -1			; GFX8-NEXT: s_mov_b32 s10, -1
	; GFX8-NEXT: s_mov_b32 s11, 0xe80000			; GFX8-NEXT: s_mov_b32 s11, 0xe80000
	; GFX8-NEXT: s_add_u32 s8, s8, s3			; GFX8-NEXT: s_add_u32 s8, s8, s3
	; GFX8-NEXT: s_addc_u32 s9, s9, 0			; GFX8-NEXT: s_addc_u32 s9, s9, 0
	; GFX8-NEXT: s_waitcnt vmcnt(2)			; GFX8-NEXT: s_waitcnt vmcnt(2)
	; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4			; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
	; GFX8-NEXT: v_bfe_u32 v6, v3, 20, 4			; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
	; GFX8-NEXT: v_bfe_u32 v7, v3, 24, 4			; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4
	; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v3			; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4
	; GFX8-NEXT: v_bfe_u32 v9, v3, 8, 4			; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
	; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4			; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4
	; GFX8-NEXT: s_waitcnt vmcnt(1)			; GFX8-NEXT: s_waitcnt vmcnt(1)
	; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4			; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2
	; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4			; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4
	; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4			; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4
	; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2			; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4
	; GFX8-NEXT: v_and_b32_e32 v11, 15, v3			; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4
	; GFX8-NEXT: v_bfe_u32 v3, v3, 4, 4			; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4
	; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4			; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4			; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17
	; GFX8-NEXT: v_and_b32_e32 v18, 15, v2			; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4			; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4
	; GFX8-NEXT: v_mul_lo_u16_e32 v19, v5, v12			; GFX8-NEXT: v_and_b32_e32 v6, 15, v3
	; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4
	; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14			; GFX8-NEXT: v_and_b32_e32 v13, 15, v2
	; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19
	; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16			; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15
	; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX8-NEXT: v_mul_lo_u16_sdwa v15, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX8-NEXT: v_or_b32_e32 v9, v18, v9
	; GFX8-NEXT: v_or_b32_e32 v3, v19, v6			; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX8-NEXT: v_or_b32_e32 v6, v13, v8			; GFX8-NEXT: v_or_b32_e32 v3, v2, v11
	; GFX8-NEXT: v_or_b32_e32 v8, v9, v10			; GFX8-NEXT: v_or_b32_e32 v7, v8, v7
	; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v6			; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9
	; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18			; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13
	; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8			; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
	; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX8-NEXT: v_or_b32_e32 v9, v11, v15			; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
	; GFX8-NEXT: v_or_b32_e32 v10, v15, v2			; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
	; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
	; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]			; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
	; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v10			; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
	; GFX8-NEXT: s_waitcnt vmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0)
	; GFX8-NEXT: v_add_u16_e32 v3, v9, v4			; GFX8-NEXT: v_add_u16_e32 v3, v6, v4
	; GFX8-NEXT: v_add_u16_e32 v3, v3, v10			; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
	; GFX8-NEXT: v_add_u16_e32 v3, v3, v8			; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
	; GFX8-NEXT: v_add_u16_e32 v2, v3, v2			; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
	; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2			; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2
	; GFX8-NEXT: v_add_u16_e32 v2, v2, v11			; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
	; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6			; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9
	; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2			; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
	; GFX8-NEXT: v_add_u16_e32 v2, v2, v6			; GFX8-NEXT: v_add_u16_e32 v2, v2, v9
	; GFX8-NEXT: flat_store_byte v[0:1], v2			; GFX8-NEXT: flat_store_byte v[0:1], v2
	; GFX8-NEXT: s_endpgm			; GFX8-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: udot8_acc8_vecMul:			; GFX9-LABEL: udot8_acc8_vecMul:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-NEXT: s_mov_b32 s10, -1			; GFX9-NEXT: s_mov_b32 s10, -1
	; GFX9-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-NEXT: s_add_u32 s8, s8, s3			; GFX9-NEXT: s_add_u32 s8, s8, s3
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-NEXT: v_mov_b32_e32 v3, 0			; GFX9-NEXT: v_mov_b32_e32 v3, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]			; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
	; GFX9-NEXT: s_addc_u32 s9, s9, 0			; GFX9-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-NEXT: s_waitcnt vmcnt(2)			; GFX9-NEXT: s_waitcnt vmcnt(2)
	; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1
	; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1			; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4
	; GFX9-NEXT: s_waitcnt vmcnt(1)			; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4			; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
	; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4			; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2			; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4
	; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4			; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4
	; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4			; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
	; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4			; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
	; GFX9-NEXT: v_and_b32_e32 v10, 15, v1			; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
	; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4			; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
	; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4			; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4
	; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4			; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
	; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4			; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
	; GFX9-NEXT: v_and_b32_e32 v17, 15, v2			; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
	; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4			; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4
	; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13			; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16
	; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11			; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2
	; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15			; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14
	; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17			; GFX9-NEXT: v_or_b32_e32 v8, v17, v8
	; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
	; GFX9-NEXT: v_or_b32_e32 v7, v12, v7			; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-NEXT: v_or_b32_e32 v1, v18, v0			; GFX9-NEXT: v_or_b32_e32 v1, v18, v10
	; GFX9-NEXT: v_or_b32_e32 v8, v8, v9			; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
	; GFX9-NEXT: v_or_b32_e32 v9, v10, v2			; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8
	; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8			; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX9-NEXT: v_or_b32_e32 v5, v5, v12
	; GFX9-NEXT: v_or_b32_e32 v2, v2, v0			; GFX9-NEXT: v_or_b32_e32 v7, v12, v0
	; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1			; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1
	; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]			; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
	; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2			; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_add_u16_e32 v1, v9, v4			; GFX9-NEXT: v_add_u16_e32 v1, v5, v4
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v2			; GFX9-NEXT: v_add_u16_e32 v1, v1, v7
	; GFX9-NEXT: v_add_u16_e32 v1, v1, v8			; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-NEXT: v_add_u16_e32 v0, v1, v0			; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
	; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0			; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
	; GFX9-NEXT: v_add_u16_e32 v0, v0, v10			; GFX9-NEXT: v_add_u16_e32 v0, v0, v10
	; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7			; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8
	; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v13, v0			; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
	; GFX9-NEXT: v_add_u16_e32 v0, v0, v7			; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
	; GFX9-NEXT: global_store_byte v3, v0, s[2:3]			; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX9-DL-LABEL: udot8_acc8_vecMul:			; GFX9-DL-LABEL: udot8_acc8_vecMul:
	; GFX9-DL: ; %bb.0: ; %entry			; GFX9-DL: ; %bb.0: ; %entry
	; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-DL-NEXT: s_mov_b32 s10, -1			; GFX9-DL-NEXT: s_mov_b32 s10, -1
	; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-DL-NEXT: s_add_u32 s8, s8, s3			; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0			; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
	; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]			; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
	; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-DL-NEXT: s_waitcnt vmcnt(2)			; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
	; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1			; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
	; GFX9-DL-NEXT: s_waitcnt vmcnt(1)			; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
	; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4			; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2			; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4
	; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4			; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
	; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4			; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
	; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4			; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
	; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1			; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
	; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4			; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
	; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4			; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
	; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4			; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
	; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4			; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
	; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2			; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
	; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4			; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4
	; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13			; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16
	; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11			; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2
	; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15			; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14
	; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17			; GFX9-DL-NEXT: v_or_b32_e32 v8, v17, v8
	; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
	; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7			; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0			; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v10
	; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9			; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
	; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v7, 16, v8
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8			; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12
	; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0			; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v0
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1
	; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]			; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
	; GFX9-DL-NEXT: s_waitcnt vmcnt(0)			; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v9, v4			; GFX9-DL-NEXT: v_add_u16_e32 v1, v5, v4
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2			; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8			; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
	; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0			; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
	; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0			; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
	; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10			; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7			; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v8
	; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v13, v0			; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
	; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7			; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
	; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]			; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
	; GFX9-DL-NEXT: s_endpgm			; GFX9-DL-NEXT: s_endpgm
	;			;
	; GFX10-DL-LABEL: udot8_acc8_vecMul:			; GFX10-DL-LABEL: udot8_acc8_vecMul:
	; GFX10-DL: ; %bb.0: ; %entry			; GFX10-DL: ; %bb.0: ; %entry
	; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0			; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
	; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX10-DL-NEXT: s_mov_b32 s10, -1			; GFX10-DL-NEXT: s_mov_b32 s10, -1
	; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000			; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
	; GFX10-DL-NEXT: s_add_u32 s8, s8, s3			; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-DL-NEXT: s_clause 0x1			; GFX10-DL-NEXT: s_clause 0x1
	; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]			; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
	; GFX10-DL-NEXT: s_waitcnt vmcnt(2)			; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4			; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
	; GFX10-DL-NEXT: s_waitcnt vmcnt(1)			; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4			; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
	; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4			; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
	; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4			; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
	; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10			; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v10
	; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4			; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
	; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4			; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1
	; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4			; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
	; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1			; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
	; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
	; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
	; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13			; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v13
	; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9			; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
	; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4			; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2
	; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4			; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 24, 4
	; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14			; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
	; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4			; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 16, 4
	; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2			; GFX10-DL-NEXT: v_mul_lo_u16 v2, v8, v14
	; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15			; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v1
	; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9			; GFX10-DL-NEXT: v_or_b32_e32 v6, v7, v6
	; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10			; GFX10-DL-NEXT: v_mul_lo_u16 v1, v11, v13
	; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13			; GFX10-DL-NEXT: v_mul_lo_u16 v7, v9, v15
	; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7			; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2
				; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v0
				; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
				; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10
				; GFX10-DL-NEXT: v_mul_lo_u16 v10, v12, v16
	; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1			; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
	; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8			; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v2
	; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2			; GFX10-DL-NEXT: v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12			; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v8
	; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9			; GFX10-DL-NEXT: v_or_b32_e32 v1, v10, v1
	; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7			; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v7
	; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
	; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
	; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9
	; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10
	; GFX10-DL-NEXT: s_waitcnt vmcnt(0)			; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3			; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
	; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX10-DL-NEXT: v_add_nc_u16 v9, v3, v10			; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2
	; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]			; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
	; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8			; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6
	; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2			; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
	; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0			; GFX10-DL-NEXT: v_mad_u16 v0, v12, v16, v0
	; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1			; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
	; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0			; GFX10-DL-NEXT: v_mad_u16 v0, v9, v15, v0
	; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1			; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
	; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]			; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
	; GFX10-DL-NEXT: s_endpgm			; GFX10-DL-NEXT: s_endpgm
	<8 x i4> addrspace(1)* %src2,			<8 x i4> addrspace(1)* %src2,
	i8 addrspace(1)* nocapture %dst) {			i8 addrspace(1)* nocapture %dst) {
	entry:			entry:
	%idx = call i32 @llvm.amdgcn.workitem.id.x()			%idx = call i32 @llvm.amdgcn.workitem.id.x()
	%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx			%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
	▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
	; GFX8-NEXT: v_mov_b32_e32 v0, s0			; GFX8-NEXT: v_mov_b32_e32 v0, s0
	; GFX8-NEXT: v_mov_b32_e32 v1, s1			; GFX8-NEXT: v_mov_b32_e32 v1, s1
	; GFX8-NEXT: flat_load_ubyte v4, v[0:1]			; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
	; GFX8-NEXT: s_mov_b32 s10, -1			; GFX8-NEXT: s_mov_b32 s10, -1
	; GFX8-NEXT: s_mov_b32 s11, 0xe80000			; GFX8-NEXT: s_mov_b32 s11, 0xe80000
	; GFX8-NEXT: s_add_u32 s8, s8, s3			; GFX8-NEXT: s_add_u32 s8, s8, s3
	; GFX8-NEXT: s_addc_u32 s9, s9, 0			; GFX8-NEXT: s_addc_u32 s9, s9, 0
	; GFX8-NEXT: s_waitcnt vmcnt(2)			; GFX8-NEXT: s_waitcnt vmcnt(2)
	; GFX8-NEXT: v_and_b32_e32 v5, 15, v3			; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
	; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4			; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
	; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4			; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
	; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4			; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
	; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4			; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
	; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4			; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
				; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
				; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
	; GFX8-NEXT: s_waitcnt vmcnt(1)			; GFX8-NEXT: s_waitcnt vmcnt(1)
	; GFX8-NEXT: v_and_b32_e32 v12, 15, v2			; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
	; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4			; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12			; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
	; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4			; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
	; GFX8-NEXT: v_mul_u32_u24_e32 v6, v6, v13			; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
				; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
				; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
				; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
	; GFX8-NEXT: s_waitcnt vmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0)
	; GFX8-NEXT: v_add_u16_e32 v4, v5, v4			; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
	; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4			; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v7, v7, v14			; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
	; GFX8-NEXT: v_add_u16_e32 v4, v4, v6			; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
	; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4			; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v8, v8, v15			; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
	; GFX8-NEXT: v_add_u16_e32 v4, v4, v7			; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
	; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4			; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v9, v9, v16
	; GFX8-NEXT: v_add_u16_e32 v4, v4, v8
	; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4
	; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
	; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4
	; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v10, v10, v17
	; GFX8-NEXT: v_add_u16_e32 v4, v4, v9
	; GFX8-NEXT: v_mul_u32_u24_e32 v2, v3, v2
	; GFX8-NEXT: v_mul_u32_u24_e32 v3, v11, v18
	; GFX8-NEXT: v_add_u16_e32 v4, v4, v10
	; GFX8-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
	; GFX8-NEXT: v_and_b32_e32 v2, 15, v2			; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
	; GFX8-NEXT: flat_store_byte v[0:1], v2			; GFX8-NEXT: flat_store_byte v[0:1], v2
	; GFX8-NEXT: s_endpgm			; GFX8-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: udot8_acc4_vecMul:			; GFX9-LABEL: udot8_acc4_vecMul:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-NEXT: s_mov_b32 s10, -1			; GFX9-NEXT: s_mov_b32 s10, -1
	; GFX9-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-NEXT: s_add_u32 s8, s8, s3			; GFX9-NEXT: s_add_u32 s8, s8, s3
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-NEXT: s_addc_u32 s9, s9, 0			; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-NEXT: v_mov_b32_e32 v0, 0			; GFX9-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]			; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
				; GFX9-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-NEXT: s_waitcnt vmcnt(2)			; GFX9-NEXT: s_waitcnt vmcnt(2)
	; GFX9-NEXT: v_and_b32_e32 v4, 15, v1
	; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_and_b32_e32 v11, 15, v2
	; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4			; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
				; GFX9-NEXT: v_and_b32_e32 v6, 15, v1
				; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
				; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
				; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4
				; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4
				; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1
				; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
				; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4			; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4
	; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11			; GFX9-NEXT: v_and_b32_e32 v13, 15, v2
	; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4			; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
	; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4			; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
	; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12			; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
				; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4
				; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2
				; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
				; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
				; GFX9-NEXT: v_and_b32_e32 v1, v4, v1
				; GFX9-NEXT: v_and_b32_e32 v17, v4, v17
				; GFX9-NEXT: v_and_b32_e32 v10, v4, v10
				; GFX9-NEXT: v_and_b32_e32 v15, v4, v15
				; GFX9-NEXT: v_and_b32_e32 v8, v4, v8
				; GFX9-NEXT: v_and_b32_e32 v13, v4, v13
				; GFX9-NEXT: v_and_b32_e32 v4, v4, v6
				; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8
				; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13
				; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4
				; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
				; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10
				; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_add_u16_e32 v3, v4, v3			; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4			; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10
	; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4			; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13			; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2
				; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
				; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17
	; GFX9-NEXT: v_add_u16_e32 v3, v3, v5			; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
	; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4			; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4			; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6
	; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14			; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v3, v3, v6			; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
	; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4			; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4			; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15			; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_add_u16_e32 v3, v3, v7
	; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
	; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4
	; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
	; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16
	; GFX9-NEXT: v_add_u16_e32 v3, v3, v8
	; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
	; GFX9-NEXT: v_mul_u32_u24_e32 v2, v10, v17
	; GFX9-NEXT: v_add_u16_e32 v3, v3, v9
	; GFX9-NEXT: v_add_u16_e32 v2, v3, v2
	; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
	; GFX9-NEXT: v_and_b32_e32 v1, 15, v1			; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX9-NEXT: global_store_byte v0, v1, s[2:3]			; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX9-DL-LABEL: udot8_acc4_vecMul:			; GFX9-DL-LABEL: udot8_acc4_vecMul:
	; GFX9-DL: ; %bb.0: ; %entry			; GFX9-DL: ; %bb.0: ; %entry
	; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX9-DL-NEXT: s_mov_b32 s10, -1			; GFX9-DL-NEXT: s_mov_b32 s10, -1
	; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000			; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
	; GFX9-DL-NEXT: s_add_u32 s8, s8, s3			; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]			; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
				; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX9-DL-NEXT: s_waitcnt vmcnt(2)			; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1
	; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2
	; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4			; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
				; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1
				; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
				; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
				; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4
				; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4
				; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1
				; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
				; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4			; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11			; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2
	; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4			; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4
	; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4			; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12			; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4
				; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4
				; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2
				; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
				; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2
				; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1
				; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17
				; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10
				; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15
				; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8
				; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13
				; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6
				; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
				; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13
				; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4
				; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
				; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10
				; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15
	; GFX9-DL-NEXT: s_waitcnt vmcnt(0)			; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3			; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
	; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10
	; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4			; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13			; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2
				; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1
				; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5			; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
	; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4			; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14			; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v6			; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
	; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4			; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4			; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15			; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v7
	; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
	; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4
	; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v8
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
	; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v10, v17
	; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v9
	; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2
	; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1
	; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1			; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]			; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
	; GFX9-DL-NEXT: s_endpgm			; GFX9-DL-NEXT: s_endpgm
	;			;
	; GFX10-DL-LABEL: udot8_acc4_vecMul:			; GFX10-DL-LABEL: udot8_acc4_vecMul:
	; GFX10-DL: ; %bb.0: ; %entry			; GFX10-DL: ; %bb.0: ; %entry
				; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
				; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
				; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
				; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0			; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
	; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1			; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
	; GFX10-DL-NEXT: s_mov_b32 s10, -1			; GFX10-DL-NEXT: s_mov_b32 s10, -1
	; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000			; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
	; GFX10-DL-NEXT: s_add_u32 s8, s8, s3			; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
	; GFX10-DL-NEXT: s_clause 0x1
	; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0			; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-DL-NEXT: s_clause 0x1			; GFX10-DL-NEXT: s_clause 0x1
	; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]			; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
	; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]			; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
	; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0			; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]			; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
	; GFX10-DL-NEXT: s_waitcnt vmcnt(2)			; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
	; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1			; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1
	; GFX10-DL-NEXT: s_waitcnt vmcnt(1)			; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
	; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2			; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
	; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
	; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4			; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
	; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4			; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5			; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
	; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4			; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7			; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
	; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 12, 4			; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
				; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9
				; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7
				; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
				; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
				; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
				; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
				; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
				; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
				; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
				; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
				; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
				; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
				; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5
	; GFX10-DL-NEXT: s_waitcnt vmcnt(0)			; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3			; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
	; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 12, 4			; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v8			; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
	; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 16, 4			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
	; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v7			; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
	; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 20, 4			; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5			; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
	; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4			; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8
	; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
				; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9
				; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1
				; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5
				; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10
				; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8
				; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1
				; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4			; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
	; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4			; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7			; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
	; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1			; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8
	; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
	; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
	; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
	; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1			; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
				; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
	; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1			; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
	; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]			; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
	; GFX10-DL-NEXT: s_endpgm			; GFX10-DL-NEXT: s_endpgm
	<8 x i4> addrspace(1)* %src2,			<8 x i4> addrspace(1)* %src2,
	i4 addrspace(1)* nocapture %dst) {			i4 addrspace(1)* nocapture %dst) {
	entry:			entry:
	%idx = call i32 @llvm.amdgcn.workitem.id.x()			%idx = call i32 @llvm.amdgcn.workitem.id.x()
	%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx			%gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
	%vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1			%vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
	%gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx			%gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
	▲ Show 20 Lines • Show All 260 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll

	Show All 14 Lines
	define amdgpu_kernel void @v_input_output_i8() {			define amdgpu_kernel void @v_input_output_i8() {
	%v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"()			%v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"()
	tail call void asm sideeffect "; use $0", "v"(i8 %v)			tail call void asm sideeffect "; use $0", "v"(i8 %v)
	ret void			ret void
	}			}

	; GCN: error: couldn't allocate output register for constraint 's'			; GCN: error: couldn't allocate output register for constraint 's'
	; GCN: error: couldn't allocate input reg for constraint 's'			; GCN: error: couldn't allocate input reg for constraint 's'
	define amdgpu_kernel void @s_input_output_v8f16() {			define amdgpu_kernel void @s_input_output_v16f16() {
	%v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()			%v = tail call <16 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
	tail call void asm sideeffect "; use $0", "s"(<8 x half> %v)			tail call void asm sideeffect "; use $0", "s"(<16 x half> %v)
	ret void			ret void
	}			}

	; SICI: error: couldn't allocate output register for constraint 's'			; SICI: error: couldn't allocate output register for constraint 's'
	; SICI: error: couldn't allocate input reg for constraint 's'			; SICI: error: couldn't allocate input reg for constraint 's'
	; VI-NOT: error			; VI-NOT: error
	define amdgpu_kernel void @s_input_output_v2f16() {			define amdgpu_kernel void @s_input_output_v2f16() {
	%v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()			%v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
	Show All 28 Lines

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

Show First 20 Lines • Show All 1,739 Lines • ▼ Show 20 Lines	; CI-NEXT: s_endpgm
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
%val.trunc = trunc i32 %val to i16		%val.trunc = trunc i32 %val to i16
%val.cvt = bitcast i16 %val.trunc to half		%val.cvt = bitcast i16 %val.trunc to half
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
ret void		ret void
}		}

		; Inserts a half at constant element 3 of an <8 x half>.
		; Each workitem loads the vector at index workitem.id.x from %in, truncates
		; %val to i16, reinterprets those 16 bits as half, writes them into element 3,
		; and stores the updated vector at the same index in %out.
		define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
		; GFX9-LABEL: v_insertelement_v8f16_3:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
		; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
		; GFX9-NEXT: s_add_u32 s0, s0, s7
		; GFX9-NEXT: s_addc_u32 s1, s1, 0
		; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
		; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11]
		; GFX9-NEXT: v_mov_b32_e32 v5, s6
		; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
		; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16
		; GFX9-NEXT: s_waitcnt vmcnt(2)
		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1
		; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
		; GFX9-NEXT: s_endpgm
		;
		; VI-LABEL: v_insertelement_v8f16_3:
		; VI: ; %bb.0:
		; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
		; VI-NEXT: s_load_dword s4, s[4:5], 0x10
		; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; VI-NEXT: s_add_u32 s0, s0, s7
		; VI-NEXT: s_addc_u32 s1, s1, 0
		; VI-NEXT: s_waitcnt lgkmcnt(0)
		; VI-NEXT: v_mov_b32_e32 v1, s11
		; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4
		; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; VI-NEXT: v_mov_b32_e32 v5, s4
		; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
		; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16
		; VI-NEXT: v_mov_b32_e32 v5, s9
		; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4
		; VI-NEXT: s_mov_b32 s4, 0xffff
		; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; VI-NEXT: s_waitcnt vmcnt(2)
		; VI-NEXT: v_bfi_b32 v3, s4, v3, v3
		; VI-NEXT: s_waitcnt vmcnt(0)
		; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
		; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
		; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; VI-NEXT: s_endpgm
		;
		; CI-LABEL: v_insertelement_v8f16_3:
		; CI: ; %bb.0:
		; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
		; CI-NEXT: s_load_dword s4, s[4:5], 0x4
		; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; CI-NEXT: s_waitcnt lgkmcnt(0)
		; CI-NEXT: v_mov_b32_e32 v1, s3
		; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
		; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; CI-NEXT: v_mov_b32_e32 v5, s1
		; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
		; CI-NEXT: s_lshl_b32 s0, s4, 16
		; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; CI-NEXT: s_waitcnt vmcnt(0)
		; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
		; CI-NEXT: v_or_b32_e32 v1, s0, v1
		; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; CI-NEXT: s_endpgm
		; Test input. The workitem id is sign-extended to i64 and used to index
		; both the input and the output vector arrays.
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
		; Only the low 16 bits of %val are used; they are reinterpreted as half
		; by the bitcast, not numerically converted.
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
		store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
		ret void
		}

		; Inserts an i16 at constant element 6 of an <8 x i16>.
		; Each workitem loads the vector at index workitem.id.x from %in, truncates
		; %val to i16, writes it into element 6, and stores the updated vector at the
		; same index in %out.
		define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
		; GFX9-LABEL: v_insertelement_v8i16_6:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
		; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
		; GFX9-NEXT: s_add_u32 s0, s0, s7
		; GFX9-NEXT: s_addc_u32 s1, s1, 0
		; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
		; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11]
		; GFX9-NEXT: v_mov_b32_e32 v5, s6
		; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
		; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16
		; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: v_bfi_b32 v3, v6, v5, v3
		; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
		; GFX9-NEXT: s_endpgm
		;
		; VI-LABEL: v_insertelement_v8i16_6:
		; VI: ; %bb.0:
		; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
		; VI-NEXT: s_load_dword s4, s[4:5], 0x10
		; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; VI-NEXT: s_add_u32 s0, s0, s7
		; VI-NEXT: s_addc_u32 s1, s1, 0
		; VI-NEXT: s_waitcnt lgkmcnt(0)
		; VI-NEXT: v_mov_b32_e32 v1, s11
		; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4
		; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; VI-NEXT: v_mov_b32_e32 v5, s4
		; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
		; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16
		; VI-NEXT: s_mov_b32 s4, 0xffff
		; VI-NEXT: v_mov_b32_e32 v5, s9
		; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4
		; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; VI-NEXT: s_waitcnt vmcnt(2)
		; VI-NEXT: v_bfi_b32 v1, s4, v1, v1
		; VI-NEXT: s_waitcnt vmcnt(0)
		; VI-NEXT: v_bfi_b32 v3, s4, v6, v3
		; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; VI-NEXT: s_endpgm
		;
		; CI-LABEL: v_insertelement_v8i16_6:
		; CI: ; %bb.0:
		; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
		; CI-NEXT: s_load_dword s4, s[4:5], 0x4
		; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; CI-NEXT: s_waitcnt lgkmcnt(0)
		; CI-NEXT: v_mov_b32_e32 v1, s3
		; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
		; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; CI-NEXT: v_mov_b32_e32 v5, s1
		; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
		; CI-NEXT: s_mov_b32 s0, 0xffff
		; CI-NEXT: v_mov_b32_e32 v6, s4
		; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; CI-NEXT: s_waitcnt vmcnt(0)
		; CI-NEXT: v_bfi_b32 v3, s0, v6, v3
		; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; CI-NEXT: s_endpgm
		; Test input. The workitem id is sign-extended to i64 and used to index
		; both the input and the output vector arrays.
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		; NOTE(review) - the bitcast below is i16 to i16, so %val.cvt has the same
		; value as %val.trunc; presumably kept to mirror the half variants of this
		; test - confirm before simplifying.
		%val.cvt = bitcast i16 %val.trunc to i16
		%vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
		store <8 x i16> %vecins, <8 x i16> addrspace(1)* %out.gep
		ret void
		}

		; Inserts a half into an <8 x half> at a run-time element index.
		; Each workitem loads the vector at index workitem.id.x from %in, truncates
		; %val to i16, reinterprets those 16 bits as half, writes them into element
		; %n, and stores the updated vector at the same index in %out.
		; In the expected output for all three targets the index is compared against
		; each of 0..7 (s_cmp_eq_u32) and the result drives a v_cndmask per element;
		; the CI sequence additionally round-trips the elements through
		; v_cvt_f32_f16 / v_cvt_f16_f32.
		define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
		; GFX9-LABEL: v_insertelement_v8f16_dynamic:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
		; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
		; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
		; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
		; GFX9-NEXT: s_cmp_eq_u32 s7, 7
		; GFX9-NEXT: v_mov_b32_e32 v6, s6
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 6
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
		; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 5
		; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
		; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 4
		; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 3
		; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
		; GFX9-NEXT: v_and_b32_e32 v3, v5, v3
		; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 2
		; GFX9-NEXT: v_lshl_or_b32 v3, v7, 16, v3
		; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 1
		; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
		; GFX9-NEXT: v_and_b32_e32 v2, v5, v2
		; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: s_cmp_eq_u32 s7, 0
		; GFX9-NEXT: v_lshl_or_b32 v2, v8, 16, v2
		; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v6, vcc
		; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
		; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
		; GFX9-NEXT: v_and_b32_e32 v1, v5, v1
		; GFX9-NEXT: v_and_b32_e32 v0, v5, v0
		; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v1
		; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v0
		; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
		; GFX9-NEXT: s_endpgm
		;
		; VI-LABEL: v_insertelement_v8f16_dynamic:
		; VI: ; %bb.0:
		; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
		; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
		; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; VI-NEXT: s_waitcnt lgkmcnt(0)
		; VI-NEXT: v_mov_b32_e32 v1, s3
		; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
		; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; VI-NEXT: v_mov_b32_e32 v5, s1
		; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
		; VI-NEXT: s_cmp_eq_u32 s5, 6
		; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; VI-NEXT: v_mov_b32_e32 v6, s4
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 7
		; VI-NEXT: s_waitcnt vmcnt(0)
		; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
		; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 4
		; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 5
		; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
		; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 2
		; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
		; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 3
		; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
		; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
		; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
		; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 0
		; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
		; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: s_cmp_eq_u32 s5, 1
		; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
		; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
		; VI-NEXT: s_cselect_b64 vcc, -1, 0
		; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
		; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
		; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
		; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
		; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
		; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; VI-NEXT: s_endpgm
		;
		; CI-LABEL: v_insertelement_v8f16_dynamic:
		; CI: ; %bb.0:
		; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
		; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
		; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; CI-NEXT: s_waitcnt lgkmcnt(0)
		; CI-NEXT: v_mov_b32_e32 v1, s3
		; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
		; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
		; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
		; CI-NEXT: v_mov_b32_e32 v5, s1
		; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
		; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
		; CI-NEXT: s_cmp_eq_u32 s5, 7
		; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
		; CI-NEXT: s_cselect_b64 vcc, -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 6
		; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 5
		; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 4
		; CI-NEXT: s_waitcnt vmcnt(0)
		; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
		; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
		; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
		; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
		; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
		; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
		; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
		; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
		; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
		; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
		; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
		; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 3
		; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
		; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
		; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
		; CI-NEXT: s_cselect_b64 vcc, -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 2
		; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
		; CI-NEXT: s_cselect_b64 vcc, -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 1
		; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
		; CI-NEXT: s_cselect_b64 vcc, -1, 0
		; CI-NEXT: s_cmp_eq_u32 s5, 0
		; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3]
		; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
		; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
		; CI-NEXT: s_cselect_b64 vcc, -1, 0
		; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
		; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
		; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
		; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
		; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
		; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
		; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
		; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
		; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
		; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
		; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
		; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
		; CI-NEXT: v_or_b32_e32 v3, v3, v6
		; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
		; CI-NEXT: v_or_b32_e32 v2, v2, v7
		; CI-NEXT: v_or_b32_e32 v1, v1, v8
		; CI-NEXT: v_or_b32_e32 v0, v0, v6
		; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
		; CI-NEXT: s_endpgm
		; Test input. The workitem id is sign-extended to i64 and used to index
		; both the input and the output vector arrays.
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
		; Only the low 16 bits of %val are used; they are reinterpreted as half
		; by the bitcast, not numerically converted.
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		; The element index is the kernel argument %n rather than a constant.
		%vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
		store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
		ret void
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/kernel-args.ll

	Show First 20 Lines • Show All 1,869 Lines • ▼ Show 20 Lines
	; VI-NEXT: v_mov_b32_e32 v3, s1			; VI-NEXT: v_mov_b32_e32 v3, s1
	; VI-NEXT: v_mov_b32_e32 v1, s3			; VI-NEXT: v_mov_b32_e32 v1, s3
	; VI-NEXT: v_mov_b32_e32 v2, s0			; VI-NEXT: v_mov_b32_e32 v2, s0
	; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]			; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: v5i16_arg:			; GFX9-LABEL: v5i16_arg:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_load_dword s6, s[4:5], 0x18			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
	; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10			; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
	; GFX9-NEXT: v_mov_b32_e32 v2, 0			; GFX9-NEXT: v_mov_b32_e32 v2, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: v_mov_b32_e32 v3, s6			; GFX9-NEXT: v_mov_b32_e32 v3, s2
	; GFX9-NEXT: v_mov_b32_e32 v0, s0			; GFX9-NEXT: v_mov_b32_e32 v0, s0
	; GFX9-NEXT: v_mov_b32_e32 v1, s1			; GFX9-NEXT: v_mov_b32_e32 v1, s1
	; GFX9-NEXT: global_store_short v2, v3, s[2:3] offset:8			; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8
	; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]			; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; EG-LABEL: v5i16_arg:			; EG-LABEL: v5i16_arg:
	; EG: ; %bb.0: ; %entry			; EG: ; %bb.0: ; %entry
	; EG-NEXT: ALU 0, @20, KC0[], KC1[]			; EG-NEXT: ALU 0, @20, KC0[], KC1[]
	; EG-NEXT: TEX 4 @10			; EG-NEXT: TEX 4 @10
	; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]			; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
	; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X			; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X
	▲ Show 20 Lines • Show All 4,481 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 4,721 Lines • ▼ Show 20 Lines
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000			; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1			; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
	; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1			; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
	; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0			; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0			; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1			; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
	; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff			; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
	; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16			; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
	; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0			; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
	; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1			; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
	; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0			; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
	; GCN-NOHSA-VI-NEXT: s_endpgm			; GCN-NOHSA-VI-NEXT: s_endpgm
	;			;
	; EG-LABEL: constant_zextload_v2i16_to_v2i64:			; EG-LABEL: constant_zextload_v2i16_to_v2i64:
	; EG: ; %bb.0:			; EG: ; %bb.0:
	; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]			; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
	; EG-NEXT: TEX 0 @6			; EG-NEXT: TEX 0 @6
	; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]			; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
	▲ Show 20 Lines • Show All 2,591 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 5,575 Lines • ▼ Show 20 Lines
	; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000			; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1			; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6			; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7			; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
	; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2			; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3			; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
	; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0			; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
	; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0			; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0			; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
	; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1			; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
	; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1			; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
	; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)			; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
	; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2			; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
	; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2			; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
	; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0			; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
	; GCN-NOHSA-VI-NEXT: s_endpgm			; GCN-NOHSA-VI-NEXT: s_endpgm
	;			;
	; EG-LABEL: global_zextload_v2i16_to_v2i64:			; EG-LABEL: global_zextload_v2i16_to_v2i64:
	; EG: ; %bb.0:			; EG: ; %bb.0:
	; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]			; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
	; EG-NEXT: TEX 0 @6			; EG-NEXT: TEX 0 @6
	; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]			; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
	▲ Show 20 Lines • Show All 3,188 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/sub.v2i16.ll

	Show First 20 Lines • Show All 565 Lines • ▼ Show 20 Lines

	; FIXME: Need to handle non-uniform case for function below (load without gep).			; FIXME: Need to handle non-uniform case for function below (load without gep).
	define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {			define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
	; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:			; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
				; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
	; GFX9-NEXT: v_mov_b32_e32 v1, 0			; GFX9-NEXT: v_mov_b32_e32 v1, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc			; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc			; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: s_mov_b32 s7, 0xf000			; GFX9-NEXT: s_mov_b32 s7, 0xf000
	; GFX9-NEXT: s_mov_b32 s6, -1			; GFX9-NEXT: s_mov_b32 s6, -1
	; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3			; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3
	; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2			; GFX9-NEXT: v_and_b32_e32 v0, v4, v2
	; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2			; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX9-NEXT: v_mov_b32_e32 v3, v1			; GFX9-NEXT: v_mov_b32_e32 v3, v1
	; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0			; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:			; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	Show All 19 Lines
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:			; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_clause 0x1			; GFX10-NEXT: s_clause 0x1
	; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
				; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc			; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
	; GFX10-NEXT: s_waitcnt vmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc			; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
	; GFX10-NEXT: s_waitcnt vmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3			; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_mov_b32 s7, 0x31016000			; GFX10-NEXT: s_mov_b32 s7, 0x31016000
	; GFX10-NEXT: s_mov_b32 s6, -1			; GFX10-NEXT: s_mov_b32 s6, -1
	; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2			; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2
	; GFX10-NEXT: v_mov_b32_e32 v1, 0			; GFX10-NEXT: v_mov_b32_e32 v1, 0
	; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2			; GFX10-NEXT: v_and_b32_e32 v0, v3, v2
	; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2			; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
	; GFX10-NEXT: v_mov_b32_e32 v3, v1			; GFX10-NEXT: v_mov_b32_e32 v3, v1
	; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0			; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid			%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
	%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid			%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
	%gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid			%gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
	%a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0			%a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
	▲ Show 20 Lines • Show All 169 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Make v8i16/v8f16 legal (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 402618

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll

llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll

llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll

llvm/test/Analysis/CostModel/AMDGPU/cast.ll

llvm/test/Analysis/CostModel/AMDGPU/fadd.ll

llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll

llvm/test/Analysis/CostModel/AMDGPU/fma.ll

llvm/test/Analysis/CostModel/AMDGPU/fmul.ll

llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll

llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll

llvm/test/Analysis/CostModel/AMDGPU/fsub.ll

llvm/test/Analysis/CostModel/AMDGPU/mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir

llvm/test/CodeGen/AMDGPU/add.v2i16.ll

llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

llvm/test/CodeGen/AMDGPU/function-returns.ll

llvm/test/CodeGen/AMDGPU/idot8s.ll

llvm/test/CodeGen/AMDGPU/idot8u.ll

llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

llvm/test/CodeGen/AMDGPU/kernel-args.ll

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

llvm/test/CodeGen/AMDGPU/sub.v2i16.ll

[AMDGPU] Make v8i16/v8f16 legal
Closed, Public