Diff 351875

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 234 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
// Vector rounding halving addition		// Vector rounding halving addition
SRHADD,		SRHADD,
URHADD,		URHADD,

// Absolute difference		// Absolute difference
UABD,		UABD,
SABD,		SABD,

		// Unsigned Add Long Pairwise
		UADDLP,

// udot/sdot instructions		// udot/sdot instructions
UDOT,		UDOT,
SDOT,		SDOT,

// Vector across-lanes min/max		// Vector across-lanes min/max
// Only the lower result lane is defined.		// Only the lower result lane is defined.
SMINV,		SMINV,
UMINV,		UMINV,
▲ Show 20 Lines • Show All 863 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,104 Lines • ▼ Show 20 Lines	case AArch64ISD::FIRST_NUMBER:
MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)		MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)		MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)		MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)		MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)		MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)		MAKE_CASE(AArch64ISD::INDEX_VECTOR)
MAKE_CASE(AArch64ISD::UABD)		MAKE_CASE(AArch64ISD::UABD)
MAKE_CASE(AArch64ISD::SABD)		MAKE_CASE(AArch64ISD::SABD)
		MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)		MAKE_CASE(AArch64ISD::CALL_RVMARKER)
}		}
#undef MAKE_CASE		#undef MAKE_CASE
return nullptr;		return nullptr;
}		}

MachineBasicBlock *		MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,		AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
▲ Show 20 Lines • Show All 1,952 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}		}
case Intrinsic::aarch64_neon_sabd:		case Intrinsic::aarch64_neon_sabd:
case Intrinsic::aarch64_neon_uabd: {		case Intrinsic::aarch64_neon_uabd: {
unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? AArch64ISD::UABD		unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? AArch64ISD::UABD
: AArch64ISD::SABD;		: AArch64ISD::SABD;
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),		return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));		Op.getOperand(2));
}		}
		case Intrinsic::aarch64_neon_uaddlp: {
		unsigned Opcode = AArch64ISD::UADDLP;
		return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
		}
case Intrinsic::aarch64_neon_sdot:		case Intrinsic::aarch64_neon_sdot:
case Intrinsic::aarch64_neon_udot:		case Intrinsic::aarch64_neon_udot:
case Intrinsic::aarch64_sve_sdot:		case Intrinsic::aarch64_sve_sdot:
case Intrinsic::aarch64_sve_udot: {		case Intrinsic::aarch64_sve_udot: {
unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot \|\|		unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot \|\|
IntNo == Intrinsic::aarch64_sve_udot)		IntNo == Intrinsic::aarch64_sve_udot)
? AArch64ISD::UDOT		? AArch64ISD::UDOT
: AArch64ISD::SDOT;		: AArch64ISD::SDOT;
▲ Show 20 Lines • Show All 7,887 Lines • ▼ Show 20 Lines	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));		auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();		EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
if (!ShiftAmt \|\| ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)		if (!ShiftAmt \|\| ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();		return SDValue();

return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));		return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}		}

		// Given a vecreduce_add node, detect the below pattern and convert it to the
		// node sequence with UABDL, [S\|U]ABD and UADDLP.
		//
		// i32 vecreduce_add(
		// v16i32 abs(
		// v16i32 sub(
		// v16i32 [sign\|zero]_extend(v16i8 a), v16i32 [sign\|zero]_extend(v16i8 b))))
		// =================>
		// i32 vecreduce_add(
		// v4i32 UADDLP(
		// v8i16 add(
		// v8i16 zext(
		// v8i8 [S\|U]ABD low8:v16i8 a, low8:v16i8 b
		// v8i16 zext(
		// v8i8 [S\|U]ABD high8:v16i8 a, high8:v16i8 b
		static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
		dmgreenUnsubmitted Not Done Reply Inline Actions This doesn't appear to use UADALP. Is that meant as a shorthand for UADA + UADDLP? dmgreen: This doesn't appear to use UADALP. Is that meant as a shorthand for UADA + UADDLP?
		jaykang10AuthorUnsubmitted Done Reply Inline Actions Sorry for typo. Let me update the name. jaykang10: Sorry for typo. Let me update the name.
		SelectionDAG &DAG) {
		// Assumed i32 vecreduce_add
		if (N->getValueType(0) != MVT::i32)
		return SDValue();

		SDValue VecReduceOp0 = N->getOperand(0);
		unsigned Opcode = VecReduceOp0.getOpcode();
		dmgreenUnsubmitted Not Done Reply Inline Actions This is mostly a style comment, so feel free to ignore if you like. The opcodes are only used once in the checks, so could be done inline without the variable. Having the "ABSOp0" and "SUB" nodes could just be one variable called "SUB" too, as we are checking the opcode on them straight away. dmgreen: This is mostly a style comment, so feel free to ignore if you like. The opcodes are only used…
		jaykang10AuthorUnsubmitted Done Reply Inline Actions Yep, let me update it. jaykang10: Yep, let me update it.
		// Assumed v16i32 abs
		if (Opcode != ISD::ABS \|\| VecReduceOp0->getValueType(0) != MVT::v16i32)
		return SDValue();

		SDValue ABS = VecReduceOp0;
		// Assumed v16i32 sub
		if (ABS->getOperand(0)->getOpcode() != ISD::SUB \|\|
		ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
		return SDValue();

		SDValue SUB = ABS->getOperand(0);
		unsigned Opcode0 = SUB->getOperand(0).getOpcode();
		unsigned Opcode1 = SUB->getOperand(1).getOpcode();
		// Assumed v16i32 type
		if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 \|\|
		SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
		return SDValue();

		// Assumed zext or sext
		bool IsZExt = false;
		if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
		IsZExt = true;
		} else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
		IsZExt = false;
		} else
		return SDValue();

		SDValue EXT0 = SUB->getOperand(0);
		SDValue EXT1 = SUB->getOperand(1);
		// Assumed zext's operand has v16i8 type
		if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 \|\|
		EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
		return SDValue();

		// Pattern is detected. Let's convert it to a sequence of nodes.
		SDLoc DL(N);

		// First, create the node pattern of UABD/SABD.
		SDValue UABDHigh8Op0 =
		DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
		DAG.getConstant(8, DL, MVT::i64));
		SDValue UABDHigh8Op1 =
		DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
		DAG.getConstant(8, DL, MVT::i64));
		SDValue UABDHigh8 = DAG.getNode(IsZExt ? AArch64ISD::UABD : AArch64ISD::SABD,
		DL, MVT::v8i8, UABDHigh8Op0, UABDHigh8Op1);
		SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);

		// Second, create the node pattern of UABAL.
		SDValue UABDLo8Op0 =
		DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
		DAG.getConstant(0, DL, MVT::i64));
		SDValue UABDLo8Op1 =
		DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
		DAG.getConstant(0, DL, MVT::i64));
		SDValue UABDLo8 = DAG.getNode(IsZExt ? AArch64ISD::UABD : AArch64ISD::SABD,
		DL, MVT::v8i8, UABDLo8Op0, UABDLo8Op1);
		SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
		SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);

		// Third, create the node of UADDLP.
		SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);

		dmgreenUnsubmitted Not Done Reply Inline Actions Just use UABAL directly? It would seem simpler that way. Same for the other SDValues in this function, which seem to be copied more than they need to be. dmgreen: Just use UABAL directly? It would seem simpler that way. Same for the other SDValues in this…
		jaykang10AuthorUnsubmitted Done Reply Inline Actions Yep, let me update it. jaykang10: Yep, let me update it.
		// Fourth, create the node of VECREDUCE_ADD.
		return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
		}

// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce		// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))		// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))		// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,		static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *ST) {		const AArch64Subtarget *ST) {
		if (!ST->hasDotProd())
		return performVecReduceAddCombineWithUADDLP(N, DAG);

SDValue Op0 = N->getOperand(0);		SDValue Op0 = N->getOperand(0);
if (!ST->hasDotProd() \|\| N->getValueType(0) != MVT::i32 \|\|		if (N->getValueType(0) != MVT::i32 \|\|
Op0.getValueType().getVectorElementType() != MVT::i32)		Op0.getValueType().getVectorElementType() != MVT::i32)
return SDValue();		return SDValue();

unsigned ExtOpcode = Op0.getOpcode();		unsigned ExtOpcode = Op0.getOpcode();
SDValue A = Op0;		SDValue A = Op0;
SDValue B;		SDValue B;
if (ExtOpcode == ISD::MUL) {		if (ExtOpcode == ISD::MUL) {
A = Op0.getOperand(0);		A = Op0.getOperand(0);
▲ Show 20 Lines • Show All 6,277 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 265 Lines • ▼ Show 20 Lines
	def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;			def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
	def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;			def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;

	def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;			def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;

	def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,			def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
	SDTCisPtrTy<1>]>;			SDTCisPtrTy<1>]>;

				def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;

	def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;			def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
	def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;			def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
	def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;			def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;

	// Generates the general dynamic sequences, i.e.			// Generates the general dynamic sequences, i.e.
	// adrp x0, :tlsdesc:var			// adrp x0, :tlsdesc:var
	// ldr x1, [x0, #:tlsdesc_lo12:var]			// ldr x1, [x0, #:tlsdesc_lo12:var]
	// add x0, x0, #:tlsdesc_lo12:var			// add x0, x0, #:tlsdesc_lo12:var
	▲ Show 20 Lines • Show All 300 Lines • ▼ Show 20 Lines

	def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),			def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
	[(AArch64uabd_n node:$lhs, node:$rhs),			[(AArch64uabd_n node:$lhs, node:$rhs),
	(int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;			(int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
	def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),			def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
	[(AArch64sabd_n node:$lhs, node:$rhs),			[(AArch64sabd_n node:$lhs, node:$rhs),
	(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;			(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;

				def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
				def AArch64uaddlp : PatFrags<(ops node:$src),
				[(AArch64uaddlp_n node:$src),
				(int_aarch64_neon_uaddlp node:$src)]>;

	def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;			def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
	def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

	def SDT_AArch64unpk : SDTypeProfile<1, 1, [			def SDT_AArch64unpk : SDTypeProfile<1, 1, [
	SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>			SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
	▲ Show 20 Lines • Show All 3,575 Lines • ▼ Show 20 Lines
	defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;			defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
	defm SHLL : SIMDVectorLShiftLongBySizeBHS;			defm SHLL : SIMDVectorLShiftLongBySizeBHS;
	defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;			defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
	defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;			defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
	defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;			defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
	defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;			defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
	defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;			defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
	defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",			defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
	BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;			BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >;
	defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",			defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>;
	int_aarch64_neon_uaddlp>;
	defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;			defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
	defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;			defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
	defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;			defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
	defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;			defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
	defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;			defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
	defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;			defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;

	def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;			def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
	▲ Show 20 Lines • Show All 3,869 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-vabs.ll

Show First 20 Lines • Show All 212 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff		%absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
%reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)		%reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
ret i16 %reduced_v		ret i16 %reduced_v
}		}

define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {		define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32:		; CHECK-LABEL: uabd16b_rdx_i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: uabd.16b v0, v0, v1		; CHECK-NEXT: uabdl.8h v2, v0, v1
; CHECK-NEXT: ushll2.8h v1, v0, #0		; CHECK-NEXT: uabal2.8h v2, v0, v1
; CHECK-NEXT: ushll.8h v0, v0, #0		; CHECK-NEXT: uaddlp.4s v0, v2
; CHECK-NEXT: uaddl2.4s v2, v0, v1
; CHECK-NEXT: uaddl.4s v0, v0, v1
; CHECK-NEXT: add.4s v0, v0, v2
; CHECK-NEXT: addv.4s s0, v0		; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0		; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%aext = zext <16 x i8> %a to <16 x i32>		%aext = zext <16 x i8> %a to <16 x i32>
%bext = zext <16 x i8> %b to <16 x i32>		%bext = zext <16 x i8> %b to <16 x i32>
%abdiff = sub nsw <16 x i32> %aext, %bext		%abdiff = sub nsw <16 x i32> %aext, %bext
%abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer		%abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
%ababs = sub nsw <16 x i32> zeroinitializer, %abdiff		%ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
%absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff		%absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
%reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)		%reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
ret i32 %reduced_v		ret i32 %reduced_v
}		}

define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {		define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32:		; CHECK-LABEL: sabd16b_rdx_i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: sabd.16b v0, v0, v1		; CHECK-NEXT: sabdl.8h v2, v0, v1
; CHECK-NEXT: ushll2.8h v1, v0, #0		; CHECK-NEXT: sabal2.8h v2, v0, v1
; CHECK-NEXT: ushll.8h v0, v0, #0		; CHECK-NEXT: uaddlp.4s v0, v2
; CHECK-NEXT: uaddl2.4s v2, v0, v1
; CHECK-NEXT: uaddl.4s v0, v0, v1
; CHECK-NEXT: add.4s v0, v0, v2
; CHECK-NEXT: addv.4s s0, v0		; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0		; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%aext = sext <16 x i8> %a to <16 x i32>		%aext = sext <16 x i8> %a to <16 x i32>
%bext = sext <16 x i8> %b to <16 x i32>		%bext = sext <16 x i8> %b to <16 x i32>
%abdiff = sub nsw <16 x i32> %aext, %bext		%abdiff = sub nsw <16 x i32> %aext, %bext
%abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer		%abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
%ababs = sub nsw <16 x i32> zeroinitializer, %abdiff		%ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
▲ Show 20 Lines • Show All 1,402 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/neon-sad.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple aarch64-none-linux-gnu < %s \| FileCheck %s			; RUN: llc -mtriple aarch64-none-linux-gnu < %s \| FileCheck %s

	declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1 immarg)			declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1 immarg)
	declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)			declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

	define i32 @test_sad_v16i8_zext(i8* nocapture readonly %a, i8* nocapture readonly %b) {			define i32 @test_sad_v16i8_zext(i8* nocapture readonly %a, i8* nocapture readonly %b) {
				dmgreenUnsubmitted Not Done Reply Inline Actions It can be good to pre-commit the tests, to just show the differences in the review. It makes it easier to see what the patch does. dmgreen: It can be good to pre-commit the tests, to just show the differences in the review. It makes it…
				jaykang10AuthorUnsubmitted Done Reply Inline Actions Yep, let me pre-commit this test. jaykang10: Yep, let me pre-commit this test.
	; CHECK-LABEL: test_sad_v16i8_zext:			; CHECK-LABEL: test_sad_v16i8_zext:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: ldr q0, [x0]			; CHECK-NEXT: ldr q0, [x0]
	; CHECK-NEXT: ldr q1, [x1]			; CHECK-NEXT: ldr q1, [x1]
	; CHECK-NEXT: uabd v0.16b, v1.16b, v0.16b			; CHECK-NEXT: uabdl v2.8h, v1.8b, v0.8b
	; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0			; CHECK-NEXT: uabal2 v2.8h, v1.16b, v0.16b
	; CHECK-NEXT: ushll v0.8h, v0.8b, #0			; CHECK-NEXT: uaddlp v0.4s, v2.8h
	; CHECK-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
	; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
	; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
	; CHECK-NEXT: addv s0, v0.4s			; CHECK-NEXT: addv s0, v0.4s
	; CHECK-NEXT: fmov w0, s0			; CHECK-NEXT: fmov w0, s0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast i8* %a to <16 x i8>*			%0 = bitcast i8* %a to <16 x i8>*
	%1 = load <16 x i8>, <16 x i8>* %0			%1 = load <16 x i8>, <16 x i8>* %0
	%2 = zext <16 x i8> %1 to <16 x i32>			%2 = zext <16 x i8> %1 to <16 x i32>
	%3 = bitcast i8* %b to <16 x i8>*			%3 = bitcast i8* %b to <16 x i8>*
	%4 = load <16 x i8>, <16 x i8>* %3			%4 = load <16 x i8>, <16 x i8>* %3
	%5 = zext <16 x i8> %4 to <16 x i32>			%5 = zext <16 x i8> %4 to <16 x i32>
	%6 = sub nsw <16 x i32> %5, %2			%6 = sub nsw <16 x i32> %5, %2
	%7 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %6, i1 true)			%7 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %6, i1 true)
	%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)			%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
	ret i32 %8			ret i32 %8
	}			}

	define i32 @test_sad_v16i8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b) {			define i32 @test_sad_v16i8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b) {
	; CHECK-LABEL: test_sad_v16i8_sext:			; CHECK-LABEL: test_sad_v16i8_sext:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: ldr q0, [x0]			; CHECK-NEXT: ldr q0, [x0]
	; CHECK-NEXT: ldr q1, [x1]			; CHECK-NEXT: ldr q1, [x1]
	; CHECK-NEXT: sabd v0.16b, v1.16b, v0.16b			; CHECK-NEXT: sabdl v2.8h, v1.8b, v0.8b
	; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0			; CHECK-NEXT: sabal2 v2.8h, v1.16b, v0.16b
	; CHECK-NEXT: ushll v0.8h, v0.8b, #0			; CHECK-NEXT: uaddlp v0.4s, v2.8h
	; CHECK-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
	; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
	; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
	; CHECK-NEXT: addv s0, v0.4s			; CHECK-NEXT: addv s0, v0.4s
	; CHECK-NEXT: fmov w0, s0			; CHECK-NEXT: fmov w0, s0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast i8* %a to <16 x i8>*			%0 = bitcast i8* %a to <16 x i8>*
	%1 = load <16 x i8>, <16 x i8>* %0			%1 = load <16 x i8>, <16 x i8>* %0
	%2 = sext <16 x i8> %1 to <16 x i32>			%2 = sext <16 x i8> %1 to <16 x i32>
	%3 = bitcast i8* %b to <16 x i8>*			%3 = bitcast i8* %b to <16 x i8>*
	%4 = load <16 x i8>, <16 x i8>* %3			%4 = load <16 x i8>, <16 x i8>* %3
	%5 = sext <16 x i8> %4 to <16 x i32>			%5 = sext <16 x i8> %4 to <16 x i32>
	%6 = sub nsw <16 x i32> %5, %2			%6 = sub nsw <16 x i32> %5, %2
	%7 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %6, i1 true)			%7 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %6, i1 true)
	%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)			%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
	ret i32 %8			ret i32 %8
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Improve SAD pattern
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 351875

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/arm64-vabs.ll

llvm/test/CodeGen/AArch64/neon-sad.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Improve SAD patternClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 351875

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/arm64-vabs.ll

llvm/test/CodeGen/AArch64/neon-sad.ll

[AArch64] Improve SAD pattern
ClosedPublic