Diff 158091

lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 361 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
UMAX3,		UMAX3,
FMIN3,		FMIN3,
SMIN3,		SMIN3,
UMIN3,		UMIN3,
FMED3,		FMED3,
SMED3,		SMED3,
UMED3,		UMED3,
FDOT2,		FDOT2,
		SDOT2,
		UDOT2,
URECIP,		URECIP,
DIV_SCALE,		DIV_SCALE,
DIV_FMAS,		DIV_FMAS,
DIV_FIXUP,		DIV_FIXUP,
// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is		// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
// treated as an illegal operation.		// treated as an illegal operation.
FMAD_FTZ,		FMAD_FTZ,
TRIG_PREOP, // 1 ULP max error for f64		TRIG_PREOP, // 1 ULP max error for f64
▲ Show 20 Lines • Show All 128 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 4,021 Lines • ▼ Show 20 Lines	const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(UMAX3)		NODE_NAME_CASE(UMAX3)
NODE_NAME_CASE(FMIN3)		NODE_NAME_CASE(FMIN3)
NODE_NAME_CASE(SMIN3)		NODE_NAME_CASE(SMIN3)
NODE_NAME_CASE(UMIN3)		NODE_NAME_CASE(UMIN3)
NODE_NAME_CASE(FMED3)		NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)		NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)		NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(FDOT2)		NODE_NAME_CASE(FDOT2)
		NODE_NAME_CASE(SDOT2)
		NODE_NAME_CASE(UDOT2)
NODE_NAME_CASE(URECIP)		NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)		NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)		NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)		NODE_NAME_CASE(DIV_FIXUP)
NODE_NAME_CASE(FMAD_FTZ)		NODE_NAME_CASE(FMAD_FTZ)
NODE_NAME_CASE(TRIG_PREOP)		NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)		NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)		NODE_NAME_CASE(RSQ)
▲ Show 20 Lines • Show All 288 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUInstrInfo.td

	Show First 20 Lines • Show All 340 Lines • ▼ Show 20 Lines

	def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;			def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;

	def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",			def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
	SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,			SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
	SDTCisFP<0>, SDTCisVec<1>]>,			SDTCisFP<0>, SDTCisVec<1>]>,
	[]>;			[]>;

				def AMDGPUsdot2 : SDNode<"AMDGPUISD::SDOT2",
				SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
				SDTCisInt<0>, SDTCisVec<1>]>,
				[]>;

				def AMDGPUudot2 : SDNode<"AMDGPUISD::UDOT2",
				SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
				SDTCisInt<0>, SDTCisVec<1>]>,
				[]>;

	def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;			def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

	def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",			def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
	SDTypeProfile<0, 1, [SDTCisInt<0>]>,			SDTypeProfile<0, 1, [SDTCisInt<0>]>,
	[SDNPHasChain, SDNPInGlue]>;			[SDNPHasChain, SDNPInGlue]>;

	def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",			def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
	SDTypeProfile<0, 2,			SDTypeProfile<0, 2,
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,940 Lines • ▼ Show 20 Lines	return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
Op.getOperand(2), DAG.getCondCode(CCOpcode));		Op.getOperand(2), DAG.getCondCode(CCOpcode));
}		}
case Intrinsic::amdgcn_fmed3:		case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,		return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fdot2:		case Intrinsic::amdgcn_fdot2:
return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,		return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
		case Intrinsic::amdgcn_sdot2:
		return DAG.getNode(AMDGPUISD::SDOT2, DL, VT,
		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
		case Intrinsic::amdgcn_udot2:
		return DAG.getNode(AMDGPUISD::UDOT2, DL, VT,
		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
		arsenmUnsubmitted Not Done Reply Inline Actions I would split the intrinsics into a separate patch arsenm: I would split the intrinsics into a separate patch
		arsenmUnsubmitted Not Done Reply Inline Actions The intrinsic definition also seems to be missing from the patch arsenm: The intrinsic definition also seems to be missing from the patch
		kzhuravlUnsubmitted Not Done Reply Inline Actions Intrinsic definitions were already committed. kzhuravl: Intrinsic definitions were already committed.
case Intrinsic::amdgcn_fmul_legacy:		case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,		return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:		case Intrinsic::amdgcn_sffbh:
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));		return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_sbfe:		case Intrinsic::amdgcn_sbfe:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,		return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
▲ Show 20 Lines • Show All 2,301 Lines • ▼ Show 20 Lines	static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
SDValue N0, SDValue N1, SDValue N2,		SDValue N0, SDValue N1, SDValue N2,
bool Signed) {		bool Signed) {
unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;		unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);		SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);		SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);		return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}		}

		// Returns true if MUL multiplies corresponding elements of two different
		// vectors, e.g. S0.x * S1.x.
		//
		// Requirements checked here:
		//  * both multiplicands are the same kind of extension (both SIGN_EXTEND or
		//    both ZERO_EXTEND);
		//  * each extension wraps an EXTRACT_VECTOR_ELT;
		//  * the two extracts read from different source vectors;
		//  * the two extracts use the same lane index, and that index is a constant.
		//
		// On success Vec1 / Vec2 receive the source vectors of operand 0 / operand 1
		// of MUL and Indx receives the shared lane index. The output arguments are
		// left untouched when the function returns false.
		static bool isProductOfDot2(const SDValue &MUL,
		                            SDValue &Vec1,
		                            SDValue &Vec2,
		                            SDValue &Indx) {
		  SDValue MulOp1 = MUL.getOperand(0);
		  SDValue MulOp2 = MUL.getOperand(1);

		  // Both multiplicands must be extended, and extended the same way.
		  const unsigned ExtOpc = MulOp1.getOpcode();
		  if (ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND)
		    return false;

		  if (MulOp2.getOpcode() != ExtOpc)
		    return false;

		  // Look through the extensions at the values being extended.
		  MulOp1 = MulOp1.getOperand(0);
		  MulOp2 = MulOp2.getOperand(0);

		  if (MulOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
		      MulOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
		    return false;

		  // Operands should be coming from different vectors.
		  if (MulOp1.getOperand(0) == MulOp2.getOperand(0))
		    return false;

		  // ... but from the same lane of each vector.
		  if (MulOp1.getOperand(1) != MulOp2.getOperand(1))
		    return false;

		  // The lane index has to be a constant. The dot2 matcher tells the two
		  // lanes apart by comparing the index SDValues of the two products; two
		  // distinct non-constant index nodes can still evaluate to the same lane
		  // at run time, which would make the rewrite to a dot product incorrect.
		  if (MulOp1.getOperand(1).getOpcode() != ISD::Constant)
		    return false;

		  Vec1 = MulOp1.getOperand(0);
		  Vec2 = MulOp2.getOperand(0);
		  Indx = MulOp1.getOperand(1);

		  return true;
		}

		// Returns true if N = add(mul (S0.x, S1.x),
		// add(mul S0.y, S1.y), S3))
		static bool isMAD_OfMAD(const SDNode *N,
		SDValue &M1,
		SDValue &M2,
		SDValue &Acc) {
		if (N->getOpcode() != ISD::ADD \|\| N->getValueType(0) != MVT::i32)
		return false;

		SDValue ADD = N->getOperand(0);
		SDValue MUL1 = N->getOperand(1);

		if (ADD.getOpcode() != ISD::ADD && MUL1.getOpcode() != ISD::ADD)
		return false;

		if (ADD.getOpcode() != ISD::ADD)
		std::swap(ADD, MUL1);

		unsigned MUL1Opc = MUL1.getOpcode();
		if (MUL1Opc != AMDGPUISD::MUL_U24 &&
		MUL1Opc != AMDGPUISD::MUL_I24)
		return false;

		SDValue MUL2 = ADD.getOperand(0);
		SDValue Z = ADD.getOperand(1);
		unsigned MUL2Opc = MUL2.getOpcode();

		if (MUL2Opc != MUL1Opc) {
		std::swap(MUL2, Z);
		MUL2Opc = MUL2.getOpcode();

		if (MUL2Opc != MUL1Opc)
		return false;
		}

		M1 = MUL1;
		M2 = MUL2;
		Acc = Z;
		return true;
		}

		// Tries to recognise an i32 ADD rooted at N as a two-element integer dot
		// product with accumulate. On a match, creates an AMDGPUISD::SDOT2 or
		// AMDGPUISD::UDOT2 node in DAG, stores it in Dot2Inst and returns true.
		// Returns false (leaving Dot2Inst unchanged) when N does not match.
		static bool getIntDot2(const SDNode *N,
		SDValue &Dot2Inst,
		SelectionDAG &DAG) {
		// Only 32-bit integer adds are candidates.
		if (N->getOpcode() != ISD::ADD \|\| N->getValueType(0) != MVT::i32)
		return false;
		arsenmUnsubmitted Not Done Reply Inline Actions return the SDValue results rather than bool + out argument? arsenm: return the SDValue results rather than bool + out argument?

		SDLoc SL(N);
		// The two products and the accumulator operand of the candidate dot2.
		SDValue MUL1 = SDValue();
		arsenmUnsubmitted Not Done Reply Inline Actions Don't need explicit initializers arsenm: Don't need explicit initializers
		SDValue MUL2 = SDValue();
		SDValue OP3 = SDValue();

		// Look for the following two patterns.
		// Pattern1: add(mul (S0.x, S1.x),
		// add_32 (mul S0.y, S1.y), S3))
		// Pattern2: add(add_32(mul (S0.x, S1.x),
		// mul (S0.y, S1.y)),
		// S3)
		// Pattern1 is handled by isMAD_OfMAD; the body of this 'if' handles
		// Pattern2 when Pattern1 did not match.
		if (!isMAD_OfMAD(N, MUL1, MUL2, OP3)) {
		SDValue MULs = N->getOperand(0);
		OP3 = N->getOperand(1);
		// One operand of N must be the ADD that sums the two products.
		if (MULs.getOpcode() != ISD::ADD && OP3.getOpcode() != ISD::ADD)
		return false;
		// Normalise so MULs is the nested ADD and OP3 is the accumulator.
		// Operand 0 is used as the nested ADD when both operands are ADDs.
		if (MULs.getOpcode() != ISD::ADD)
		std::swap(MULs, OP3);

		MUL1 = MULs.getOperand(0);
		MUL2 = MULs.getOperand(1);

		// Both products must be the same 24-bit multiply opcode.
		if (MUL1.getOpcode() != MUL2.getOpcode())
		return false;

		if (MUL1.getOpcode() != AMDGPUISD::MUL_U24 &&
		MUL1.getOpcode() != AMDGPUISD::MUL_I24)
		return false;
		}
		// Decompose the first product into its two source vectors and lane index.
		SDValue Vec1 = SDValue();
		SDValue Vec2 = SDValue();
		SDValue Indx1 = SDValue();

		if (!isProductOfDot2(MUL1, Vec1, Vec2, Indx1))
		return false;

		// Same for the second product.
		SDValue Vec3 = SDValue();
		SDValue Vec4 = SDValue();
		SDValue Indx2 = SDValue();
		if (!isProductOfDot2(MUL2, Vec3, Vec4, Indx2))
		return false;

		// The two products must not use the same index value.
		// NOTE(review): this compares index SDValues, not run-time values; nothing
		// in this function requires the indices to be constants - confirm that
		// distinct index nodes always mean distinct lanes.
		if (Indx1 == Indx2)
		return false;

		// Only <2 x i16> source vectors are accepted.
		if (Vec1.getValueType() != MVT::v2i16 \|\| Vec2.getValueType() != MVT::v2i16)
		return false;

		// Both products must read the same pair of vectors; the operand order
		// within the second product may be swapped relative to the first.
		if ((Vec1 == Vec3 && Vec2 == Vec4) \|\|
		(Vec1 == Vec4 && Vec2 == Vec3)) {
		// MUL_I24 products select the signed node, anything else the unsigned one.
		Dot2Inst = (MUL1.getOpcode() == AMDGPUISD::MUL_I24) ?
		DAG.getNode(AMDGPUISD::SDOT2, SL, MVT::i32, Vec1, Vec2, OP3) :
		DAG.getNode(AMDGPUISD::UDOT2, SL, MVT::i32, Vec1, Vec2, OP3);
		return true;
		}
		return false;
		}

SDValue SITargetLowering::performAddCombine(SDNode *N,		SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDLoc SL(N);		SDLoc SL(N);
SDValue LHS = N->getOperand(0);		SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);		SDValue RHS = N->getOperand(1);

Show All 22 Lines	if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);		MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);		AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);		return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
}		}

return SDValue();		return SDValue();
}		}

		SDValue IDot2 = SDValue();
		if (Subtarget->hasDLInsts() && getIntDot2(N, IDot2, DAG))
		return IDot2;

if (VT != MVT::i32 \|\| !DCI.isAfterLegalizeDAG())		if (VT != MVT::i32 \|\| !DCI.isAfterLegalizeDAG())
return SDValue();		return SDValue();

// add x, zext (setcc) => addcarry x, 0, setcc		// add x, zext (setcc) => addcarry x, 0, setcc
// add x, sext (setcc) => subcarry x, 0, setcc		// add x, sext (setcc) => subcarry x, 0, setcc
unsigned Opc = LHS.getOpcode();		unsigned Opc = LHS.getOpcode();
if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|		if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|
Opc == ISD::ANY_EXTEND \|\| Opc == ISD::ADDCARRY)		Opc == ISD::ANY_EXTEND \|\| Opc == ISD::ADDCARRY)
▲ Show 20 Lines • Show All 1,126 Lines • Show Last 20 Lines

lib/Target/AMDGPU/VOP3PInstructions.td

	Show First 20 Lines • Show All 162 Lines • ▼ Show 20 Lines
	}			}

	defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;			defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
	}			}

	let SubtargetPredicate = HasDLInsts in {			let SubtargetPredicate = HasDLInsts in {

	def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;			def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
	def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;			def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, AMDGPUsdot2>;
	def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;			def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, AMDGPUudot2>;
	def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;			def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
	def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;			def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
	def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;			def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
	def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;			def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;

	} // End SubtargetPredicate = HasDLInsts			} // End SubtargetPredicate = HasDLInsts

	multiclass VOP3P_Real_vi<bits<10> op> {			multiclass VOP3P_Real_vi<bits<10> op> {
	def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,			def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
				arsenmUnsubmitted Not Done Reply Inline Actions Patterns matching sext_inreg directly is kind of unusual, especially for a specific size. I would expect this to be a number of known sign bits check? arsenm: Patterns matching sext_inreg directly is kind of unusual, especially for a specific size. I…
				FarhanaAleenAuthorUnsubmitted Not Done Reply Inline Actions It's not unusual, I see all the other targets doing this sext_inreg matching with a specific size, specially for vectors. For vectors lying in a 32bit register, we need to make sure that each element is lying on a specific location inside the register. I feel like doing the known bits check would be redundant since DAG combiner already performed this check before generating sign_extend_inreg. Also, performing this check does not provide any benefit in our vector case since we cannot allow other sizes being sign extended to 16 or higher. It has to be exactly coming from the lower/upper 16bit of a 32bit register unless we rearrange the data orientation inside the 32bit register. FarhanaAleen: It's not unusual, I see all the other targets doing this sext_inreg matching with a specific…
	VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {			VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
	let AssemblerPredicates = [HasVOP3PInsts];			let AssemblerPredicates = [HasVOP3PInsts];
	let DecoderNamespace = "VI";			let DecoderNamespace = "VI";
	}			}
	}			}

	defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;			defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;
	defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;			defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/idot2.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=GCN,GFX900
				; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=GCN,GFX906

				arsenmUnsubmitted Not Done Reply Inline Actions Should also include run lines with a CI and VI target to make sure those don't break arsenm: Should also include run lines with a CI and VI target to make sure those don't break
				; add(mul(S0.x, S1.x),
				;     add(mul(S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
				declare <2 x i16> @myload2(<2 x i16> addrspace(1)*)

				; GCN-LABEL: {{^}}udot2

				; GFX900: v_mad_u32_u24
				; GFX900: v_mad_u32_u24

				; GFX906: v_dot2_u32_u16
				arsenmUnsubmitted Not Done Reply Inline Actions These tests are a bit thin. Perhaps use update_llc_test_checks? arsenm: These tests are a bit thin. Perhaps use update_llc_test_checks?
				define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv
				arsenmUnsubmitted Not Done Reply Inline Actions What happens if everything is done in i16? Can this still be matched? arsenm: What happens if everything is done in i16? Can this still be matched?
				FarhanaAleenAuthorUnsubmitted Not Done Reply Inline Actions The pattern will not be matched if everything is done in 16. I will support it in a separate patch. FarhanaAleen: The pattern will not be matched if everything is done in 16. I will support it in a separate…

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; add(S3,
				;     add(mul(S0.x, S1.x), mul(S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
				; GCN-LABEL: {{^}}udot2_MulMul

				; GFX900: v_add_u32_e32

				; GFX906: v_dot2_u32_u16
				define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %mul1
				%add6 = add i32 %add, %s3
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}idot2

				; GFX900: v_mad_i32_i24
				; GFX900: v_mad_i32_i24

				; GFX906: v_dot2_i32_i16
				define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = sext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = sext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv

				arsenmUnsubmitted Not Done Reply Inline Actions Should include a test with the explicit sext_inreg patterns done on i32, and with different bit widths than i16 arsenm: Should include a test with the explicit sext_inreg patterns done on i32, and with different bit…
				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = sext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = sext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}idot2_MixedTypedMul

				; GFX900: v_mad_i32_i24

				; GFX906: v_mad_i32_i24
				define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = sext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = sext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}udot2_alt_AddOperands

				; GFX900: v_mad_u32_u24
				; GFX900: v_mad_u32_u24

				; GFX906: v_dot2_u32_u16
				define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %s3, %mul2
				%add6 = add i32 %mul1, %add
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}idot2_MixedExt

				; GFX900: v_mad_i32_i24
				; GFX900: v_mad_i32_i24

				; GFX906: v_mad_i32_i24
				define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = sext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul nuw i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = sext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = sext i16 %s2.elt2 to i32
				%mul2 = mul nuw i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}notudot2_SameVec

				; GFX900: v_mad_u32_u24

				; GFX906: v_mad_u32_u24
				define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}notudot2_LargerVec

				; GFX900: v_mad_u32_u24

				; GFX906: v_mad_u32_u24
				define amdgpu_kernel void @notudot2_LargerVec(<4 x i16> addrspace(1)* %src1,
				<4 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <4 x i16> @myload4(<4 x i16> addrspace(1)* %src1)
				%vec2 = tail call <4 x i16> @myload4(<4 x i16> addrspace(1)* %src2)

				%s1.elt1 = extractelement <4 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <4 x i16> %vec2, i64 0
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul i32 %conv2, %conv

				%s1.elt2 = extractelement <4 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <4 x i16> %vec2, i64 1
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				; GCN-LABEL: {{^}}notudot2_DiffIndex

				; GFX900: v_mad_u32_u24

				; GFX906: v_mad_u32_u24
				define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
				<2 x i16> addrspace(1)* %src2,
				i32 addrspace(1)* nocapture %dst) {
				entry:
				%vec1 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src1)
				%vec2 = tail call <2 x i16> @myload2(<2 x i16> addrspace(1)* %src2)
				arsenmUnsubmitted Not Done Reply Inline Actions Avoid using function calls in tests that aren't specifically testing calls arsenm: Avoid using function calls in tests that aren't specifically testing calls

				%s1.elt1 = extractelement <2 x i16> %vec1, i64 0
				%conv = zext i16 %s1.elt1 to i32
				%s2.elt1 = extractelement <2 x i16> %vec2, i64 1
				%conv2 = zext i16 %s2.elt1 to i32
				%mul1 = mul i32 %conv2, %conv

				%s1.elt2 = extractelement <2 x i16> %vec1, i64 1
				%conv3 = zext i16 %s1.elt2 to i32
				%s2.elt2 = extractelement <2 x i16> %vec2, i64 0
				%conv4 = zext i16 %s2.elt2 to i32
				%mul2 = mul i32 %conv4, %conv3

				%s3 = load i32, i32 addrspace(1)* %dst, align 4
				%add = add i32 %mul2, %s3
				%add6 = add i32 %add, %mul1
				store i32 %add6, i32 addrspace(1)* %dst, align 4
				ret void
				}

				declare <4 x i16> @myload4(<4 x i16> addrspace(1)*)

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support idot2 pattern.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 158091

lib/Target/AMDGPU/AMDGPUISelLowering.h

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

lib/Target/AMDGPU/AMDGPUInstrInfo.td

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/VOP3PInstructions.td

test/CodeGen/AMDGPU/idot2.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support idot2 pattern.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 158091

lib/Target/AMDGPU/AMDGPUISelLowering.h

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

lib/Target/AMDGPU/AMDGPUInstrInfo.td

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/VOP3PInstructions.td

test/CodeGen/AMDGPU/idot2.ll

[AMDGPU] Support idot2 pattern.
ClosedPublic