Diff 275399

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 181 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
FCMLEz,		FCMLEz,
FCMLTz,		FCMLTz,

// Vector across-lanes addition		// Vector across-lanes addition
// Only the lower result lane is defined.		// Only the lower result lane is defined.
SADDV,		SADDV,
UADDV,		UADDV,

		// Vector rounding halving addition
		SRHADD,
		[Inline review comment — dmgreen (done)]: Is it possible to add srhadd at the same time? I guess there is also uhadd and shadd?
		URHADD,

// Vector across-lanes min/max		// Vector across-lanes min/max
// Only the lower result lane is defined.		// Only the lower result lane is defined.
SMINV,		SMINV,
UMINV,		UMINV,
SMAXV,		SMAXV,
UMAXV,		UMAXV,

SMAXV_PRED,		SMAXV_PRED,
▲ Show 20 Lines • Show All 660 Lines • ▼ Show 20 Lines	private:
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
		SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,		SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,		SDValue &Size,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;
SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,		SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 832 Lines • ▼ Show 20 Lines	for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);		setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);		setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);

// Saturates		// Saturates
setOperationAction(ISD::SADDSAT, VT, Legal);		setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);		setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);		setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);		setOperationAction(ISD::USUBSAT, VT, Legal);

		setOperationAction(ISD::TRUNCATE, VT, Custom);
}		}
for (MVT VT : { MVT::v4f16, MVT::v2f32,		for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {		MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);		setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);		setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
}		}

setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);		setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
▲ Show 20 Lines • Show All 578 Lines • ▼ Show 20 Lines	case AArch64ISD::FIRST_NUMBER:
MAKE_CASE(AArch64ISD::CMLTz)		MAKE_CASE(AArch64ISD::CMLTz)
MAKE_CASE(AArch64ISD::FCMEQz)		MAKE_CASE(AArch64ISD::FCMEQz)
MAKE_CASE(AArch64ISD::FCMGEz)		MAKE_CASE(AArch64ISD::FCMGEz)
MAKE_CASE(AArch64ISD::FCMGTz)		MAKE_CASE(AArch64ISD::FCMGTz)
MAKE_CASE(AArch64ISD::FCMLEz)		MAKE_CASE(AArch64ISD::FCMLEz)
MAKE_CASE(AArch64ISD::FCMLTz)		MAKE_CASE(AArch64ISD::FCMLTz)
MAKE_CASE(AArch64ISD::SADDV)		MAKE_CASE(AArch64ISD::SADDV)
MAKE_CASE(AArch64ISD::UADDV)		MAKE_CASE(AArch64ISD::UADDV)
		MAKE_CASE(AArch64ISD::SRHADD)
		MAKE_CASE(AArch64ISD::URHADD)
MAKE_CASE(AArch64ISD::SMINV)		MAKE_CASE(AArch64ISD::SMINV)
MAKE_CASE(AArch64ISD::UMINV)		MAKE_CASE(AArch64ISD::UMINV)
MAKE_CASE(AArch64ISD::SMAXV)		MAKE_CASE(AArch64ISD::SMAXV)
MAKE_CASE(AArch64ISD::UMAXV)		MAKE_CASE(AArch64ISD::UMAXV)
MAKE_CASE(AArch64ISD::SMAXV_PRED)		MAKE_CASE(AArch64ISD::SMAXV_PRED)
MAKE_CASE(AArch64ISD::UMAXV_PRED)		MAKE_CASE(AArch64ISD::UMAXV_PRED)
MAKE_CASE(AArch64ISD::SMINV_PRED)		MAKE_CASE(AArch64ISD::SMINV_PRED)
MAKE_CASE(AArch64ISD::UMINV_PRED)		MAKE_CASE(AArch64ISD::UMINV_PRED)
▲ Show 20 Lines • Show All 1,812 Lines • ▼ Show 20 Lines	case Intrinsic::aarch64_neon_vsli: {

assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());		assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;		bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;		unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),		return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3));		Op.getOperand(3));
}		}

		case Intrinsic::aarch64_neon_srhadd:
		case Intrinsic::aarch64_neon_urhadd: {
		bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
		unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
		return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
		Op.getOperand(2));
		}
}		}
}		}

// Returns true exactly when the value type of ExtVal is a scalable vector;
// every other value type yields false.
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {		bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();		return ExtVal.getValueType().isScalableVector();
}		}

// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.		// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
▲ Show 20 Lines • Show All 248 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_LOAD_SUB:		case ISD::ATOMIC_LOAD_SUB:
return LowerATOMIC_LOAD_SUB(Op, DAG);		return LowerATOMIC_LOAD_SUB(Op, DAG);
case ISD::ATOMIC_LOAD_AND:		case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);		return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:		case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);		return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VSCALE:		case ISD::VSCALE:
return LowerVSCALE(Op, DAG);		return LowerVSCALE(Op, DAG);
		case ISD::TRUNCATE:
		return LowerTRUNCATE(Op, DAG);
case ISD::LOAD:		case ISD::LOAD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))		if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerFixedLengthVectorLoadToSVE(Op, DAG);		return LowerFixedLengthVectorLoadToSVE(Op, DAG);
llvm_unreachable("Unexpected request to lower ISD::LOAD");		llvm_unreachable("Unexpected request to lower ISD::LOAD");
case ISD::ADD:		case ISD::ADD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))		if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);		return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
llvm_unreachable("Unexpected request to lower ISD::ADD");		llvm_unreachable("Unexpected request to lower ISD::ADD");
▲ Show 20 Lines • Show All 5,233 Lines • ▼ Show 20 Lines
// Checks whether Op supplies an in-range immediate for a vector right shift.
// VT must be a vector type (asserted). The immediate is obtained through
// getVShiftImm using VT's scalar size in bits and is stored in Cnt; if that
// helper fails the result is false. Otherwise the result is true only when
// 1 <= Cnt <= ElementBits, or 1 <= Cnt <= ElementBits / 2 when isNarrow is set.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {		static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");		assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();		int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))		if (!getVShiftImm(Op, ElementBits, Cnt))
return false;		return false;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));		return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}		}

		// Attempt to form urhadd(OpA, OpB) from
		// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ExtElemSizeInBits))), 1)),
		// where ExtElemSizeInBits is the element width of the *extended* type.
		// The original form of this expression is
		// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
		// is called the srl will have been lowered to AArch64ISD::VLSHR and the
		// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
		// This pass can also recognize a variant of this pattern that uses sign
		// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
		//
		// Op is the ISD::TRUNCATE node being custom-lowered. The function returns a
		// new AArch64ISD::URHADD / AArch64ISD::SRHADD node when the whole pattern
		// matches, and returns Op unchanged in every other case.
		SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
		                                             SelectionDAG &DAG) const {
		  EVT VT = Op.getValueType();

		  // Review note (dmgreen, addressed): it might be worth adding a check that
		  // this isn't a scalable vector.
		  if (!VT.isVector() || VT.isScalableVector())
		    return Op;

		  // Since we are looking for a right shift by a constant value of 1 and we are
		  // operating on types at least 16 bits in length (sign/zero extended OpA and
		  // OpB, which are at least 8 bits), it follows that the truncate will always
		  // discard the shifted-in bit and therefore the right shift will be logical
		  // regardless of the signedness of OpA and OpB.
		  // Review note (dmgreen, addressed): this is always a VLSHR, as opposed to a
		  // VASHR, because the type is large enough that the signed shift bits are
		  // never demanded? If so, add a comment about that.
		  SDValue Shift = Op.getOperand(0);
		  if (Shift.getOpcode() != AArch64ISD::VLSHR)
		    return Op;

		  // Is the right shift using an immediate value of 1?
		  // Review note (dmgreen, addressed): use getConstantOperandVal.
		  if (Shift.getConstantOperandVal(1) != 1)
		    return Op;

		  SDValue Sub = Shift.getOperand(0);
		  if (Sub.getOpcode() != ISD::SUB)
		    return Op;

		  SDValue Xor = Sub.getOperand(1);
		  if (Xor.getOpcode() != ISD::XOR)
		    return Op;

		  // Both inputs must be widened by the same kind of extension.
		  SDValue ExtendOpA = Xor.getOperand(0);
		  SDValue ExtendOpB = Sub.getOperand(0);
		  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
		  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
		  if (ExtendOpAOpc != ExtendOpBOpc)
		    return Op;
		  if (ExtendOpAOpc != ISD::ZERO_EXTEND && ExtendOpAOpc != ISD::SIGN_EXTEND)
		    return Op;

		  // Is the result of the right shift being truncated to the same value type as
		  // the original operands, OpA and OpB?
		  SDValue OpA = ExtendOpA.getOperand(0);
		  SDValue OpB = ExtendOpB.getOperand(0);
		  EVT OpAVT = OpA.getValueType();
		  assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
		  if (VT != OpAVT || OpAVT != OpB.getValueType())
		    return Op;

		  // Is the XOR using a constant amount of all ones in the right hand side?
		  uint64_t C;
		  if (!isAllConstantBuildVector(Xor.getOperand(1), C))
		    return Op;

		  // The XOR operates on the extended type, so the splat has to be all ones
		  // across the full extended element width. Testing only the truncated
		  // element width would also accept e.g. 0x00FF on i16 lanes, for which
		  // sub(ext(OpB), xor(ext(OpA), 0x00FF)) is not OpA + OpB + 1.
		  unsigned ExtElemSizeInBits = Xor.getValueType().getScalarSizeInBits();
		  if (APInt(ExtElemSizeInBits, C) !=
		      APInt::getAllOnesValue(ExtElemSizeInBits))
		    return Op;

		  // Review note (dmgreen, addressed): debug output isn't usually added here,
		  // the combiner will print this kind of info already.
		  SDLoc DL(Op);
		  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
		  unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
		  return DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
		}

SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,		SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
SDLoc DL(Op);		SDLoc DL(Op);
int64_t Cnt;		int64_t Cnt;

if (!Op.getOperand(1).getValueType().isVector())		if (!Op.getOperand(1).getValueType().isVector())
return Op;		return Op;
▲ Show 20 Lines • Show All 2,193 Lines • ▼ Show 20 Lines
}		}

static SDValue performConcatVectorsCombine(SDNode *N,		static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc dl(N);		SDLoc dl(N);
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);		SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
		unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

// Optimize concat_vectors of truncated vectors, where the intermediate		// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,		// type is illegal, to avoid said illegality, e.g.,
// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),		// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
// (v2i16 (truncate (v2i64)))))		// (v2i16 (truncate (v2i64)))))
// ->		// ->
// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),		// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
// (v4i32 (bitcast (v2i64))),		// (v4i32 (bitcast (v2i64))),
// <0, 2, 4, 6>)))		// <0, 2, 4, 6>)))
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed		// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.		// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.		// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
if (N->getNumOperands() == 2 &&		if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
N0->getOpcode() == ISD::TRUNCATE &&		N1Opc == ISD::TRUNCATE) {
N1->getOpcode() == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);		SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);		SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();		EVT N00VT = N00.getValueType();

if (N00VT == N10.getValueType() &&		if (N00VT == N10.getValueType() &&
(N00VT == MVT::v2i64 \|\| N00VT == MVT::v4i32) &&		(N00VT == MVT::v2i64 \|\| N00VT == MVT::v4i32) &&
N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {		N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);		MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
SmallVector<int, 8> Mask(MidVT.getVectorNumElements());		SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
for (size_t i = 0; i < Mask.size(); ++i)		for (size_t i = 0; i < Mask.size(); ++i)
Mask[i] = i * 2;		Mask[i] = i * 2;
return DAG.getNode(ISD::TRUNCATE, dl, VT,		return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getVectorShuffle(		DAG.getVectorShuffle(
MidVT, dl,		MidVT, dl,
DAG.getNode(ISD::BITCAST, dl, MidVT, N00),		DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));		DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
}		}
}		}

// Wait 'til after everything is legalized to try this. That way we have		// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.		// legal vector types and such.
if (DCI.isBeforeLegalizeOps())		if (DCI.isBeforeLegalizeOps())
return SDValue();		return SDValue();

		// Optimise concat_vectors of two [us]rhadds that use extracted subvectors
		// from the same original vectors. Combine these into a single [us]rhadd that
		// operates on the two original vectors. Example:
		// (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
		// extract_subvector (v16i8 OpB,
		// <0>))),
		// (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
		// extract_subvector (v16i8 OpB,
		// <8>)))))
		// ->
		[Inline review comment — dmgreen (done)]: I'm a little surprised that there is no code to do this already. I guess it doesn't usually come up. Please run clang-format on the patch.
		// (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
		if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
		(N0Opc == AArch64ISD::URHADD \|\| N0Opc == AArch64ISD::SRHADD)) {
		SDValue N00 = N0->getOperand(0);
		SDValue N01 = N0->getOperand(1);
		SDValue N10 = N1->getOperand(0);
		SDValue N11 = N1->getOperand(1);

		EVT N00VT = N00.getValueType();
		EVT N10VT = N10.getValueType();

		if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
		N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
		N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
		N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
		[Inline review comment — dmgreen (done)]: Make sure you check the 0 and the 8 from the extract_subvector.
		[Inline review reply — PetreTudor, author (done)]: Added the checks right before returning the new node.
		SDValue N00Source = N00->getOperand(0);
		SDValue N01Source = N01->getOperand(0);
		SDValue N10Source = N10->getOperand(0);
		SDValue N11Source = N11->getOperand(0);

		if (N00Source == N10Source && N01Source == N11Source &&
		N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
		assert(N0.getValueType() == N1.getValueType());

		uint64_t N00Index = N00.getConstantOperandVal(1);
		uint64_t N01Index = N01.getConstantOperandVal(1);
		uint64_t N10Index = N10.getConstantOperandVal(1);
		uint64_t N11Index = N11.getConstantOperandVal(1);

		if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
		N10Index == N00VT.getVectorNumElements())
		return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
		}
		}
		}

// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector		// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so		// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.		// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {		if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);		assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),		return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));		DAG.getConstant(0, dl, MVT::i64));
}		}

// Canonicalise concat_vectors so that the right-hand vector has as few		// Canonicalise concat_vectors so that the right-hand vector has as few
// bit-casts as possible before its real operation. The primary matching		// bit-casts as possible before its real operation. The primary matching
// destination for these operations will be the narrowing "2" instructions,		// destination for these operations will be the narrowing "2" instructions,
// which depend on the operation being performed on this right-hand vector.		// which depend on the operation being performed on this right-hand vector.
// For example,		// For example,
// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))		// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
// becomes		// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))		// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

if (N1->getOpcode() != ISD::BITCAST)		if (N1Opc != ISD::BITCAST)
return SDValue();		return SDValue();
SDValue RHS = N1->getOperand(0);		SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();		MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.		// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())		if (!RHSTy.isVector())
return SDValue();		return SDValue();

LLVM_DEBUG(		LLVM_DEBUG(
▲ Show 20 Lines • Show All 3,876 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 548 Lines • ▼ Show 20 Lines

	def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;			def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
	def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;			def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
	def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;			def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
	def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;			def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
	def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;			def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
	def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;			def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;

				def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
				def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;

	def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;			def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
	def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

	def SDT_AArch64unpk : SDTypeProfile<1, 1, [			def SDT_AArch64unpk : SDTypeProfile<1, 1, [
	SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>			SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
	▲ Show 20 Lines • Show All 3,503 Lines • ▼ Show 20 Lines
	defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;			defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
	defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;			defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
	defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;			defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
	defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;			defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;			defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
	defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;			defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
	defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;			defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
	defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;			defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
	defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;			defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
	defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;			defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
	defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;			defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
	defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;			defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
	defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",			defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;			TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
	defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;			defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
	defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;			defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
	defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;			defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
	defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;			defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
	defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;			defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
	defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;			defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
	defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;			defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
	defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;			defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
	defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;			defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
	defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;			defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
	defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;			defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
	defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;			defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
	defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;			defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
	defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;			defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
	defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",			defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
	int_aarch64_neon_sqadd>;			int_aarch64_neon_sqadd>;
	defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",			defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
	int_aarch64_neon_sqsub>;			int_aarch64_neon_sqsub>;

	// Extra saturate patterns, other than the intrinsics matches above			// Extra saturate patterns, other than the intrinsics matches above
	▲ Show 20 Lines • Show All 3,563 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-vhadd.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple \| FileCheck %s			; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple \| FileCheck %s

	define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	;CHECK-LABEL: shadd8b:			; CHECK-LABEL: shadd8b:
	;CHECK: shadd.8b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: shadd.8b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)			%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3			ret <8 x i8> %tmp3
	}			}

	define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {			define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	;CHECK-LABEL: shadd16b:			; CHECK-LABEL: shadd16b:
	;CHECK: shadd.16b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: shadd.16b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A			%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B			%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)			%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3			ret <16 x i8> %tmp3
	}			}

	define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	;CHECK-LABEL: shadd4h:			; CHECK-LABEL: shadd4h:
	;CHECK: shadd.4h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: shadd.4h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A			%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B			%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)			%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3			ret <4 x i16> %tmp3
	}			}

	define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {			define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	;CHECK-LABEL: shadd8h:			; CHECK-LABEL: shadd8h:
	;CHECK: shadd.8h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: shadd.8h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A			%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B			%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)			%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3			ret <8 x i16> %tmp3
	}			}

	define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {			define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	;CHECK-LABEL: shadd2s:			; CHECK-LABEL: shadd2s:
	;CHECK: shadd.2s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: shadd.2s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A			%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B			%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)			%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3			ret <2 x i32> %tmp3
	}			}

	define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {			define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	;CHECK-LABEL: shadd4s:			; CHECK-LABEL: shadd4s:
	;CHECK: shadd.4s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: shadd.4s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B			%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)			%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3			ret <4 x i32> %tmp3
	}			}

	define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	;CHECK-LABEL: uhadd8b:			; CHECK-LABEL: uhadd8b:
	;CHECK: uhadd.8b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: uhadd.8b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)			%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3			ret <8 x i8> %tmp3
	}			}

	define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {			define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	;CHECK-LABEL: uhadd16b:			; CHECK-LABEL: uhadd16b:
	;CHECK: uhadd.16b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: uhadd.16b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A			%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B			%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)			%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3			ret <16 x i8> %tmp3
	}			}

	define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	;CHECK-LABEL: uhadd4h:			; CHECK-LABEL: uhadd4h:
	;CHECK: uhadd.4h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: uhadd.4h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A			%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B			%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)			%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3			ret <4 x i16> %tmp3
	}			}

	define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {			define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	;CHECK-LABEL: uhadd8h:			; CHECK-LABEL: uhadd8h:
	;CHECK: uhadd.8h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: uhadd.8h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A			%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B			%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)			%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3			ret <8 x i16> %tmp3
	}			}

	define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {			define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	;CHECK-LABEL: uhadd2s:			; CHECK-LABEL: uhadd2s:
	;CHECK: uhadd.2s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: uhadd.2s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A			%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B			%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)			%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3			ret <2 x i32> %tmp3
	}			}

	define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {			define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	;CHECK-LABEL: uhadd4s:			; CHECK-LABEL: uhadd4s:
	;CHECK: uhadd.4s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: uhadd.4s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B			%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)			%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3			ret <4 x i32> %tmp3
	}			}

	declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone			declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
	declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone			declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
	declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone			declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

	declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone			declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
	declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone			declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
	declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone			declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

	declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone			declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
	declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone			declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
	declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone			declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

	declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone			declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
	declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone			declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
	declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone			declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

	define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	;CHECK-LABEL: srhadd8b:			; CHECK-LABEL: srhadd8b:
	;CHECK: srhadd.8b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: srhadd.8b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)			%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3			ret <8 x i8> %tmp3
	}			}

	define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {			define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	;CHECK-LABEL: srhadd16b:			; CHECK-LABEL: srhadd16b:
	;CHECK: srhadd.16b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: srhadd.16b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A			%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B			%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)			%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3			ret <16 x i8> %tmp3
	}			}

	define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	;CHECK-LABEL: srhadd4h:			; CHECK-LABEL: srhadd4h:
	;CHECK: srhadd.4h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: srhadd.4h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A			%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B			%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)			%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3			ret <4 x i16> %tmp3
	}			}

	define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {			define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	;CHECK-LABEL: srhadd8h:			; CHECK-LABEL: srhadd8h:
	;CHECK: srhadd.8h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: srhadd.8h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A			%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B			%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)			%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3			ret <8 x i16> %tmp3
	}			}

	define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {			define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	;CHECK-LABEL: srhadd2s:			; CHECK-LABEL: srhadd2s:
	;CHECK: srhadd.2s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: srhadd.2s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A			%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B			%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)			%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3			ret <2 x i32> %tmp3
	}			}

	define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {			define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	;CHECK-LABEL: srhadd4s:			; CHECK-LABEL: srhadd4s:
	;CHECK: srhadd.4s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: srhadd.4s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B			%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)			%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3			ret <4 x i32> %tmp3
	}			}

	define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	;CHECK-LABEL: urhadd8b:			; CHECK-LABEL: urhadd8b:
	;CHECK: urhadd.8b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: urhadd.8b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)			%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3			ret <8 x i8> %tmp3
	}			}

	define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {			define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	;CHECK-LABEL: urhadd16b:			; CHECK-LABEL: urhadd16b:
	;CHECK: urhadd.16b			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: urhadd.16b v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A			%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B			%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)			%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3			ret <16 x i8> %tmp3
	}			}

	define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	;CHECK-LABEL: urhadd4h:			; CHECK-LABEL: urhadd4h:
	;CHECK: urhadd.4h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: urhadd.4h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A			%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B			%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)			%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3			ret <4 x i16> %tmp3
	}			}

	define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {			define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	;CHECK-LABEL: urhadd8h:			; CHECK-LABEL: urhadd8h:
	;CHECK: urhadd.8h			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: urhadd.8h v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A			%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B			%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)			%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3			ret <8 x i16> %tmp3
	}			}

	define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {			define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	;CHECK-LABEL: urhadd2s:			; CHECK-LABEL: urhadd2s:
	;CHECK: urhadd.2s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: urhadd.2s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A			%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B			%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)			%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3			ret <2 x i32> %tmp3
	}			}

	define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {			define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	;CHECK-LABEL: urhadd4s:			; CHECK-LABEL: urhadd4s:
	;CHECK: urhadd.4s			; CHECK: // %bb.0:
				; CHECK-NEXT: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: urhadd.4s v0, v0, v1
				; CHECK-NEXT: ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B			%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)			%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3			ret <4 x i32> %tmp3
	}			}

				define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
				[Inline comment — dmgreen, marked Done] It is worth having tests for half width too - <8 x i8>.
				; CHECK-LABEL: testLowerToSRHADD8b:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.8b v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
				%sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
				%add1 = add <8 x i16> %sextsrc1, %sextsrc2
				%add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%result = trunc <8 x i16> %resulti16 to <8 x i8>
				store <8 x i8> %result, <8 x i8>* %dest, align 8
				ret void
				}

				define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
				; CHECK-LABEL: testLowerToSRHADD4h:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.4h v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
				%sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
				%add1 = add <4 x i32> %sextsrc1, %sextsrc2
				%add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
				%resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
				%result = trunc <4 x i32> %resulti16 to <4 x i16>
				store <4 x i16> %result, <4 x i16>* %dest, align 8
				ret void
				}

				define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
				; CHECK-LABEL: testLowerToSRHADD2s:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.2s v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
				%sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
				%add1 = add <2 x i64> %sextsrc1, %sextsrc2
				%add2 = add <2 x i64> %add1, <i64 1, i64 1>
				%resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
				%result = trunc <2 x i64> %resulti16 to <2 x i32>
				store <2 x i32> %result, <2 x i32>* %dest, align 8
				ret void
				}

				define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
				; CHECK-LABEL: testLowerToSRHADD16b:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.16b v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
				%sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
				%add1 = add <16 x i16> %sextsrc1, %sextsrc2
				%add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%result = trunc <16 x i16> %resulti16 to <16 x i8>
				store <16 x i8> %result, <16 x i8>* %dest, align 16
				ret void
				}

				define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
				; CHECK-LABEL: testLowerToSRHADD8h:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.8h v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
				%sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
				%add1 = add <8 x i32> %sextsrc1, %sextsrc2
				%add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%result = trunc <8 x i32> %resulti16 to <8 x i16>
				store <8 x i16> %result, <8 x i16>* %dest, align 16
				ret void
				}

				define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
				; CHECK-LABEL: testLowerToSRHADD4s:
				; CHECK: // %bb.0:
				; CHECK-NEXT: srhadd.4s v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
				%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
				%add1 = add <4 x i64> %sextsrc1, %sextsrc2
				%add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
				%resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
				%result = trunc <4 x i64> %resulti16 to <4 x i32>
				store <4 x i32> %result, <4 x i32>* %dest, align 16
				ret void
				}

				define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD8b:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.8b v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
				%zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
				%add1 = add <8 x i16> %zextsrc1, %zextsrc2
				%add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%result = trunc <8 x i16> %resulti16 to <8 x i8>
				store <8 x i8> %result, <8 x i8>* %dest, align 8
				ret void
				}

				define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD4h:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.4h v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
				%zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
				%add1 = add <4 x i32> %zextsrc1, %zextsrc2
				%add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
				%resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
				%result = trunc <4 x i32> %resulti16 to <4 x i16>
				store <4 x i16> %result, <4 x i16>* %dest, align 8
				ret void
				}

				define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD2s:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.2s v0, v0, v1
				; CHECK-NEXT: str d0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
				%zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
				%add1 = add <2 x i64> %zextsrc1, %zextsrc2
				%add2 = add <2 x i64> %add1, <i64 1, i64 1>
				%resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
				%result = trunc <2 x i64> %resulti16 to <2 x i32>
				store <2 x i32> %result, <2 x i32>* %dest, align 8
				ret void
				}

				define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD16b:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.16b v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
				%zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
				%add1 = add <16 x i16> %zextsrc1, %zextsrc2
				%add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
				%result = trunc <16 x i16> %resulti16 to <16 x i8>
				store <16 x i8> %result, <16 x i8>* %dest, align 16
				ret void
				}

				define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD8h:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.8h v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
				%zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
				%add1 = add <8 x i32> %zextsrc1, %zextsrc2
				%add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%result = trunc <8 x i32> %resulti16 to <8 x i16>
				store <8 x i16> %result, <8 x i16>* %dest, align 16
				ret void
				}

				define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
				; CHECK-LABEL: testLowerToURHADD4s:
				; CHECK: // %bb.0:
				; CHECK-NEXT: urhadd.4s v0, v0, v1
				; CHECK-NEXT: str q0, [x0]
				; CHECK-NEXT: ret
				%zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
				%zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
				%add1 = add <4 x i64> %zextsrc1, %zextsrc2
				%add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
				%resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
				%result = trunc <4 x i64> %resulti16 to <4 x i32>
				store <4 x i32> %result, <4 x i32>* %dest, align 16
				ret void
				}

	declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone			declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
	declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone			declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
	declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone			declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

	declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone			declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
	declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone			declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
	declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone			declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

	declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone			declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
	declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone			declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
	declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone			declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

	declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone			declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
	declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone			declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
	declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone			declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Generate URHADD from (b - (~a)) >> 1
Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275399

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/arm64-vhadd.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Generate URHADD from (b - (~a)) >> 1 (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275399

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/arm64-vhadd.ll

[ARM] Generate URHADD from (b - (~a)) >> 1
Closed, Public