Diff 273824

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {

// Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on		// Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
// ELF.		// ELF.
THREAD_POINTER,		THREAD_POINTER,
ADC,		ADC,
SBC, // adc, sbc instructions		SBC, // adc, sbc instructions

// Arithmetic instructions		// Arithmetic instructions
		ADD_PRED,
		FADD_PRED,
SDIV_PRED,		SDIV_PRED,
UDIV_PRED,		UDIV_PRED,
SMIN_PRED,		SMIN_PRED,
UMIN_PRED,		UMIN_PRED,
SMAX_PRED,		SMAX_PRED,
UMAX_PRED,		UMAX_PRED,
SHL_PRED,		SHL_PRED,
SRL_PRED,		SRL_PRED,
▲ Show 20 Lines • Show All 866 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,035 Lines • ▼ Show 20 Lines	for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
setOperationAction(Op, VT, Expand);		setOperationAction(Op, VT, Expand);

// EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable		// EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable
// and fixed length vector types, although with the current level of support		// and fixed length vector types, although with the current level of support
// only the former is exercised.		// only the former is exercised.
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);		setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

// Lower fixed length vector operations to scalable equivalents.		// Lower fixed length vector operations to scalable equivalents.
		setOperationAction(ISD::ADD, VT, Custom);
		setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);		setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);		setOperationAction(ISD::STORE, VT, Custom);

// NOTE: This is a temporary measure to maintain functionality required by
// Analysis/CostModel/AArch64/sve-fixed-length.ll
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::FADD, VT, Legal);
}		}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {		void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);		addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);		addTypeForNEON(VT, MVT::v2i32);
}		}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {		void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
▲ Show 20 Lines • Show All 290 Lines • ▼ Show 20 Lines	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";		case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
case AArch64ISD::CSEL: return "AArch64ISD::CSEL";		case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";		case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
case AArch64ISD::CSINV: return "AArch64ISD::CSINV";		case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";		case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";		case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";		case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";		case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
		case AArch64ISD::ADD_PRED: return "AArch64ISD::ADD_PRED";
case AArch64ISD::SDIV_PRED: return "AArch64ISD::SDIV_PRED";		case AArch64ISD::SDIV_PRED: return "AArch64ISD::SDIV_PRED";
case AArch64ISD::UDIV_PRED: return "AArch64ISD::UDIV_PRED";		case AArch64ISD::UDIV_PRED: return "AArch64ISD::UDIV_PRED";
case AArch64ISD::SMIN_PRED: return "AArch64ISD::SMIN_PRED";		case AArch64ISD::SMIN_PRED: return "AArch64ISD::SMIN_PRED";
case AArch64ISD::UMIN_PRED: return "AArch64ISD::UMIN_PRED";		case AArch64ISD::UMIN_PRED: return "AArch64ISD::UMIN_PRED";
case AArch64ISD::SMAX_PRED: return "AArch64ISD::SMAX_PRED";		case AArch64ISD::SMAX_PRED: return "AArch64ISD::SMAX_PRED";
case AArch64ISD::UMAX_PRED: return "AArch64ISD::UMAX_PRED";		case AArch64ISD::UMAX_PRED: return "AArch64ISD::UMAX_PRED";
case AArch64ISD::SHL_PRED: return "AArch64ISD::SHL_PRED";		case AArch64ISD::SHL_PRED: return "AArch64ISD::SHL_PRED";
case AArch64ISD::SRL_PRED: return "AArch64ISD::SRL_PRED";		case AArch64ISD::SRL_PRED: return "AArch64ISD::SRL_PRED";
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";		case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";
case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";		case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";
case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";		case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";
case AArch64ISD::LASTA: return "AArch64ISD::LASTA";		case AArch64ISD::LASTA: return "AArch64ISD::LASTA";
case AArch64ISD::LASTB: return "AArch64ISD::LASTB";		case AArch64ISD::LASTB: return "AArch64ISD::LASTB";
case AArch64ISD::REV: return "AArch64ISD::REV";		case AArch64ISD::REV: return "AArch64ISD::REV";
case AArch64ISD::REINTERPRET_CAST: return "AArch64ISD::REINTERPRET_CAST";		case AArch64ISD::REINTERPRET_CAST: return "AArch64ISD::REINTERPRET_CAST";
case AArch64ISD::TBL: return "AArch64ISD::TBL";		case AArch64ISD::TBL: return "AArch64ISD::TBL";
		case AArch64ISD::FADD_PRED: return "AArch64ISD::FADD_PRED";
case AArch64ISD::FADDA_PRED: return "AArch64ISD::FADDA_PRED";		case AArch64ISD::FADDA_PRED: return "AArch64ISD::FADDA_PRED";
case AArch64ISD::FADDV_PRED: return "AArch64ISD::FADDV_PRED";		case AArch64ISD::FADDV_PRED: return "AArch64ISD::FADDV_PRED";
case AArch64ISD::FMAXV_PRED: return "AArch64ISD::FMAXV_PRED";		case AArch64ISD::FMAXV_PRED: return "AArch64ISD::FMAXV_PRED";
case AArch64ISD::FMAXNMV_PRED: return "AArch64ISD::FMAXNMV_PRED";		case AArch64ISD::FMAXNMV_PRED: return "AArch64ISD::FMAXNMV_PRED";
case AArch64ISD::FMINV_PRED: return "AArch64ISD::FMINV_PRED";		case AArch64ISD::FMINV_PRED: return "AArch64ISD::FMINV_PRED";
case AArch64ISD::FMINNMV_PRED: return "AArch64ISD::FMINNMV_PRED";		case AArch64ISD::FMINNMV_PRED: return "AArch64ISD::FMINNMV_PRED";
case AArch64ISD::NOT: return "AArch64ISD::NOT";		case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";		case AArch64ISD::BIT: return "AArch64ISD::BIT";
▲ Show 20 Lines • Show All 1,962 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::SADDO:		case ISD::SADDO:
case ISD::UADDO:		case ISD::UADDO:
case ISD::SSUBO:		case ISD::SSUBO:
case ISD::USUBO:		case ISD::USUBO:
case ISD::SMULO:		case ISD::SMULO:
case ISD::UMULO:		case ISD::UMULO:
return LowerXALUO(Op, DAG);		return LowerXALUO(Op, DAG);
case ISD::FADD:		case ISD::FADD:
		if (useSVEForFixedLengthVectorVT(Op.getValueType()))
		return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);		return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:		case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);		return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:		case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);		return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
case ISD::FDIV:		case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);		return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:		case ISD::FP_ROUND:
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);		return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:		case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);		return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VSCALE:		case ISD::VSCALE:
return LowerVSCALE(Op, DAG);		return LowerVSCALE(Op, DAG);
case ISD::LOAD:		case ISD::LOAD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))		if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerFixedLengthVectorLoadToSVE(Op, DAG);		return LowerFixedLengthVectorLoadToSVE(Op, DAG);
llvm_unreachable("Unexpected Load.");		llvm_unreachable("Unexpected request to lower ISD::LOAD");
		case ISD::ADD:
		if (useSVEForFixedLengthVectorVT(Op.getValueType()))
		return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
		llvm_unreachable("Unexpected request to lower ISD::ADD");
}		}
}		}

bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {		bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
// Prefer NEON unless larger SVE registers are available.		// Prefer NEON unless larger SVE registers are available.
return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;		return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
}		}

▲ Show 20 Lines • Show All 4,309 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);		SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);		SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);

// create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...		// create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);		SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
return DAG.getNode(ISD::BITCAST, DL, VT, TBL);		return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}		}

// Lowers Op to the predicated node NewOp for scalable vectors.
// A PTRUE mask requesting the `all` pattern is built with i1 elements and the
// same element count as Op's value type, then placed ahead of Op's own
// operands. The new node is created with Op's value type.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SelectionDAG &DAG,
unsigned NewOp) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);

// Only operands 0 and 1 are checked; both must be scalable vectors.
assert(Op.getOperand(0).getValueType().isScalableVector() &&
Op.getOperand(1).getValueType().isScalableVector() &&
"Only scalable vectors are supported");

// Predicate type: i1 elements with VT's element count.
auto PredTy =
VT.getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount());
SDValue Mask = getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);

// Operand order for the new node is {Mask, original operands...}.
SmallVector<SDValue, 4> Operands = {Mask};
Operands.append(Op->op_begin(), Op->op_end());
return DAG.getNode(NewOp, DL, VT, Operands);
}

static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,		static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {		APInt &UndefBits) {
EVT VT = BVN->getValueType(0);		EVT VT = BVN->getValueType(0);
APInt SplatBits, SplatUndef;		APInt SplatBits, SplatUndef;
unsigned SplatBitSize;		unsigned SplatBitSize;
bool HasAnyUndefs;		bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {		if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
▲ Show 20 Lines • Show All 6,923 Lines • ▼ Show 20 Lines	case MVT::f64:
MaskVT = MVT::nxv2i1;		MaskVT = MVT::nxv2i1;
break;		break;
}		}

return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,		return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
DAG.getTargetConstant(PgPattern, DL, MVT::i64));		DAG.getTargetConstant(PgPattern, DL, MVT::i64));
}		}

		// Returns a predicate for the scalable vector type VT: a PTRUE requesting the
		// `all` pattern, typed as VT with its element type changed to i1.
		// VT must be a scalable vector type that is legal for the target (asserted).
		static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
		EVT VT) {
		assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
		"Expected legal scalable vector!");
		auto PredTy = VT.changeVectorElementType(MVT::i1);
		return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
		}

		static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
		sdesmalen (Done): nit: is `getAllTruePredicate` not a better name given the value this function returns? (similar for `getAllTruePredicateFor(Scalable|FixedLength)Vector`)
		paulwalker-arm (Author, Done): getPredicateForFixedLengthVector (and by extension getPredicateForVector) doesn't return an AllTrue predicate but rather a predicate with only the first VT.getNumElts() being true with everything else being false. I think we're going to have a similar situation with types like <n x 2 x f32>, so I tried to avoid AllTrue since that's an artifact of our current ISEL rather than representative of what is actually going on.
		sdesmalen (Not Done): Yes, that's actually what I meant with `AllTruePredicateForFixedLengthVector(..., EVT VT)`. The `ForFixedLengthVector` part (passed as `VT`) suggests to me that it returns a predicate that is all true for the lanes of the fixed-length vector, not beyond that.
		if (VT.isFixedLengthVector())
		return getPredicateForFixedLengthVector(DAG, DL, VT);

		return getPredicateForScalableVector(DAG, DL, VT);
		}

// Grow V to consume an entire SVE register.		// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {		static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
assert(VT.isScalableVector() &&		assert(VT.isScalableVector() &&
"Expected to convert into a scalable vector!");		"Expected to convert into a scalable vector!");
assert(V.getValueType().isFixedLengthVector() &&		assert(V.getValueType().isFixedLengthVector() &&
"Expected a fixed length vector operand!");		"Expected a fixed length vector operand!");
SDLoc DL(V);		SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);		SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(

auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());		auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
return DAG.getMaskedStore(		return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),		Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),		getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
Store->getMemOperand(), Store->getAddressingMode(),		Store->getMemOperand(), Store->getAddressingMode(),
Store->isTruncatingStore());		Store->isTruncatingStore());
}		}

		// Lowers Op to the predicated node NewOp, placing a governing predicate
		// (obtained from getPredicateForVector for Op's value type) ahead of Op's
		// operands. Fixed length vectors that are to use SVE have their operands
		// converted to the scalable container type and the result converted back;
		// otherwise Op must already be a scalable vector operation.
		SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
		SelectionDAG &DAG,
		unsigned NewOp) const {
		EVT VT = Op.getValueType();
		SDLoc DL(Op);
		auto Pg = getPredicateForVector(DAG, DL, VT);

		// Fixed length path: perform the operation on the scalable container type.
		if (useSVEForFixedLengthVectorVT(VT)) {
		EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

		// Create list of operands by converting existing ones to scalable types.
		SmallVector<SDValue, 4> Operands = {Pg};
		for (const SDValue &V : Op->op_values()) {
		// Condition code operands are forwarded unchanged.
		if (isa<CondCodeSDNode>(V)) {
		Operands.push_back(V);
		continue;
		}

		assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
		"Only fixed length vectors are supported!");
		Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
		}

		// Build the node on the container type, then convert back to VT.
		auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
		return convertFromScalableVector(DAG, VT, ScalableRes);
		}

		assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

		// Scalable path: operands are used as-is, with the predicate first.
		SmallVector<SDValue, 4> Operands = {Pg};
		for (const SDValue &V : Op->op_values()) {
		assert((isa<CondCodeSDNode>(V) \|\| V.getValueType().isScalableVector()) &&
		"Only scalable vectors are supported!");
		Operands.push_back(V);
		}

		return DAG.getNode(NewOp, DL, VT, Operands);
		}
		cameron.mcinally (Done): Could we modify the assert to remove the branch? `assert(isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector() && "Only scalable vectors are supported!");` Same with the code above too.
		paulwalker-arm (Author, Done): I can modify this case, but the earlier code is not a straight copy, so some kind of control flow is required.
		cameron.mcinally (Not Done): Oh, I missed that. If you want to leave it so the two cases are structured similarly, that's fine.

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Show First 20 Lines • Show All 146 Lines • ▼ Show 20 Lines
def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;		def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;		def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;

def SDT_AArch64Arith : SDTypeProfile<1, 3, [		def SDT_AArch64Arith : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,		SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>		SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
]>;		]>;

		def AArch64add_pred : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
		def AArch64fadd_pred : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
def AArch64sdiv_pred : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;		def AArch64sdiv_pred : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64udiv_pred : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;		def AArch64udiv_pred : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
def AArch64smin_pred : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;		def AArch64smin_pred : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
def AArch64umin_pred : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;		def AArch64umin_pred : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
def AArch64smax_pred : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;		def AArch64smax_pred : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
def AArch64umax_pred : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;		def AArch64umax_pred : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
def AArch64lsl_pred : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>;		def AArch64lsl_pred : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>;
def AArch64lsr_pred : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>;		def AArch64lsr_pred : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>;
Show All 36 Lines	let Predicates = [HasSVE] in {
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;		defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;		defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;		defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;

defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;		defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ", 1>;		defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ", 1>;
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", 0>;		defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", 0>;

defm ADD_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_add>;		defm ADD_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_add, AArch64add_pred>;
defm SUB_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_sub>;		defm SUB_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_sub>;
defm SUBR_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_subr>;		defm SUBR_ZPZZ : sve_int_bin_pred_zx<int_aarch64_sve_subr>;

defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;		defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;		defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;		defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;		defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;

▲ Show 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	let Predicates = [HasSVE, HasBF16] in {
defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;		defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;		defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;		defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;		defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;		defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", 0>;		defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", 0>;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ", 1>;		defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ", 1>;

defm FADD_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fadd>;		defm FADD_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fadd, AArch64fadd_pred>;
defm FSUB_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fsub>;		defm FSUB_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fsub>;
defm FMUL_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmul>;		defm FMUL_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmul>;
defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fsubr>;		defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fsubr>;
defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmaxnm>;		defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmaxnm>;
defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fminnm>;		defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fminnm>;
defm FMAX_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmax>;		defm FMAX_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmax>;
defm FMIN_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmin>;		defm FMIN_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fmin>;
defm FABD_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fabd>;		defm FABD_ZPZZ : sve_fp_2op_p_zds_zx<int_aarch64_sve_fabd>;
▲ Show 20 Lines • Show All 2,139 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 595 Lines • ▼ Show 20 Lines	int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::ADD:		case ISD::ADD:
case ISD::MUL:		case ISD::MUL:
case ISD::XOR:		case ISD::XOR:
case ISD::OR:		case ISD::OR:
case ISD::AND:		case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.		// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.		// We know that they are legal. See LowerAdd in ISelLowering.
return (Cost + 1) * LT.first;		return (Cost + 1) * LT.first;

		case ISD::FADD:
		// These nodes are marked as 'custom' just to lower them to SVE.
		// We know said lowering will incur no additional cost.
		if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
		return (Cost + 2) * LT.first;

		return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
		Opd2Info,
		Opd1PropInfo, Opd2PropInfo);
}		}
}		}

int AArch64TTIImpl::getAddressComputationCost(Type Ty, ScalarEvolution SE,		int AArch64TTIImpl::getAddressComputationCost(Type Ty, ScalarEvolution SE,
const SCEV *Ptr) {		const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will		// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the		// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting		// computation can more often be merged into the index mode. The resulting
▲ Show 20 Lines • Show All 446 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/SVEInstrFormats.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,590 Lines • ▼ Show 20 Lines	multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;		def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;		def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;

def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;		def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;		def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;		def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}		}

multiclass sve_fp_2op_p_zds_zx<SDPatternOperator op> {		multiclass sve_fp_2op_p_zds_zx<SDPatternOperator int_op,
		SDPatternOperator ir_op = null_frag> {
		def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
		def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
		def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;

def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;		def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;		def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;		def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;

def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;		def : SVE_3_Op_Pat<nxv8f16, ir_op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;		def : SVE_3_Op_Pat<nxv4f32, ir_op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;		def : SVE_3_Op_Pat<nxv2f64, ir_op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;

		def : SVE_3_Op_Pat_SelZero<nxv8f16, int_op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
		def : SVE_3_Op_Pat_SelZero<nxv4f32, int_op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
		def : SVE_3_Op_Pat_SelZero<nxv2f64, int_op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
}		}
		cameron.mcinally (Done): Is having both ir_op and int_op problematic going forward? E.g. how to match an intrinsic with an undef merge. X86 solves a similar problem with the tables in `lib/Target/X86/X86IntrinsicsInfo.h`. That might be something to consider long term. I'm not sure if it's a great fit though. I don't think this needs to be changed now, but something to consider...
		paulwalker-arm (Author, Done): I don't believe there's anything problematic, as there's nothing to prevent adding SVE_3_Op_Pat_SelZero patterns for ir_op and SVE_3_Op_Pat for int_op. It's just that this patch does not have tests for those cases, so I've not added the patterns.
		cameron.mcinally (Not Done): Good point. I'm not sure if canonicalizing the intrinsics would save code or add more, so I'll leave it there.

class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>		class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),		: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",		asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
"",		"",
[]>, Sched<[]> {		[]>, Sched<[]> {
bits<5> Zdn;		bits<5> Zdn;
bits<5> Zm;		bits<5> Zm;
▲ Show 20 Lines • Show All 3,235 Lines • ▼ Show 20 Lines	def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isOrig>;		SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isOrig>;
}		}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;		def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;		def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;		def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;		def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}		}

multiclass sve_int_bin_pred_zx<SDPatternOperator op> {		multiclass sve_int_bin_pred_zx<SDPatternOperator int_op,
		SDPatternOperator ir_op = null_frag> {
		def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
		def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
		def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
		def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;

def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;		def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;		def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;		def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;		def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;

def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;		def : SVE_3_Op_Pat<nxv16i8, ir_op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;		def : SVE_3_Op_Pat<nxv8i16, ir_op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;		def : SVE_3_Op_Pat<nxv4i32, ir_op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;		def : SVE_3_Op_Pat<nxv2i64, ir_op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;

		def : SVE_3_Op_Pat_SelZero<nxv16i8, int_op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;
		def : SVE_3_Op_Pat_SelZero<nxv8i16, int_op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
		def : SVE_3_Op_Pat_SelZero<nxv4i32, int_op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
		def : SVE_3_Op_Pat_SelZero<nxv2i64, int_op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
}		}

multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,		multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {		SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;		def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;		def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;		def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;

▲ Show 20 Lines • Show All 2,907 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll

This file was added.

				; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
				; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
				; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
				; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK

				; VBYTES represents the useful byte size of a vector register from the code
				; generator's point of view. It is clamped to power-of-2 values because
				; only power-of-2 vector lengths are considered legal, regardless of the
				; user specified vector length.

				target triple = "aarch64-unknown-linux-gnu"

				; Don't use SVE when its registers are no bigger than NEON.
				; NO_SVE-NOT: ptrue

				; Don't use SVE for 64-bit vectors.
				define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
				; CHECK-LABEL: @fadd_v4f16
				; CHECK: fadd v0.4h, v0.4h, v1.4h
				; CHECK: ret
				%res = fadd <4 x half> %op1, %op2
				ret <4 x half> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
				; CHECK-LABEL: @fadd_v8f16
				; CHECK: fadd v0.8h, v0.8h, v1.8h
				; CHECK: ret
				%res = fadd <8 x half> %op1, %op2
				ret <8 x half> %res
				}

				; 16 x half (256-bit) fadd lowered to SVE: a single ptrue with a vl pattern of
				; min(VBYTES/2, 16) halfword lanes governs both loads, the merging fadd and the
				; store back to %a.
				define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
				; CHECK-LABEL: @fadd_v16f16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x half>, <16 x half>* %a
				%op2 = load <16 x half>, <16 x half>* %b
				%res = fadd <16 x half> %op1, %op2
				store <16 x half> %res, <16 x half>* %a
				ret void
				}

				; 32 x half (512-bit) fadd. The unprefixed directives cover the first
				; register's worth of data at [x0]/[x1]. Runs that enable the VBITS_LE_256
				; prefix additionally expect a second load/fadd/store group addressed at
				; byte offset VBYTES from each pointer. DAG-style directives are used so the
				; groups may be emitted in any order.
				define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
				; CHECK-LABEL: @fadd_v32f16
				; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
				; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
				; VBITS_LE_256-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
				; VBITS_LE_256-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
				; VBITS_LE_256-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
				; VBITS_LE_256-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
				; VBITS_LE_256-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
				; CHECK: ret
				%op1 = load <32 x half>, <32 x half>* %a
				%op2 = load <32 x half>, <32 x half>* %b
				%res = fadd <32 x half> %op1, %op2
				store <32 x half> %res, <32 x half>* %a
				ret void
				}

				; 64 x half (1024-bit) fadd. The unprefixed directives cover the first
				; register's worth of data at [x0]/[x1]. Runs that enable VBITS_LE_512 also
				; expect a group at byte offset VBYTES; runs that enable VBITS_LE_256 further
				; expect groups at offsets 2*VBYTES and 3*VBYTES. DAG-style directives are
				; used so the groups may be emitted in any order.
				define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
				; CHECK-LABEL: @fadd_v64f16
				; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
				; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
				; VBITS_LE_512-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
				; VBITS_LE_512-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
				; VBITS_LE_512-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
				; VBITS_LE_512-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
				; VBITS_LE_512-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
				; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
				; VBITS_LE_256-DAG: add x[[B2:[0-9]+]], x1, #[[#mul(VBYTES,2)]]
				; VBITS_LE_256-DAG: ld1h { [[OP1_2:z[0-9]+]].h }, [[PG]]/z, [x[[A2]]]
				; VBITS_LE_256-DAG: ld1h { [[OP2_2:z[0-9]+]].h }, [[PG]]/z, [x[[B2]]]
				; VBITS_LE_256-DAG: fadd [[RES_2:z[0-9]+]].h, [[PG]]/m, [[OP1_2]].h, [[OP2_2]].h
				; VBITS_LE_256-DAG: st1h { [[RES_2]].h }, [[PG]], [x[[A2]]]
				; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
				; VBITS_LE_256-DAG: add x[[B3:[0-9]+]], x1, #[[#mul(VBYTES,3)]]
				; VBITS_LE_256-DAG: ld1h { [[OP1_3:z[0-9]+]].h }, [[PG]]/z, [x[[A3]]]
				; VBITS_LE_256-DAG: ld1h { [[OP2_3:z[0-9]+]].h }, [[PG]]/z, [x[[B3]]]
				; VBITS_LE_256-DAG: fadd [[RES_3:z[0-9]+]].h, [[PG]]/m, [[OP1_3]].h, [[OP2_3]].h
				; VBITS_LE_256-DAG: st1h { [[RES_3]].h }, [[PG]], [x[[A3]]]
				; CHECK: ret
				%op1 = load <64 x half>, <64 x half>* %a
				%op2 = load <64 x half>, <64 x half>* %b
				%res = fadd <64 x half> %op1, %op2
				store <64 x half> %res, <64 x half>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
				; CHECK-LABEL: @fadd_v128f16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <128 x half>, <128 x half>* %a
				%op2 = load <128 x half>, <128 x half>* %b
				%res = fadd <128 x half> %op1, %op2
				store <128 x half> %res, <128 x half>* %a
				ret void
				}

				; Don't use SVE for 64-bit vectors.
				define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
				; CHECK-LABEL: @fadd_v2f32
				; CHECK: fadd v0.2s, v0.2s, v1.2s
				; CHECK: ret
				%res = fadd <2 x float> %op1, %op2
				ret <2 x float> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
				; CHECK-LABEL: @fadd_v4f32
				; CHECK: fadd v0.4s, v0.4s, v1.4s
				; CHECK: ret
				%res = fadd <4 x float> %op1, %op2
				ret <4 x float> %res
				}

				; 8 x float (256-bit) fadd lowered to SVE: a single ptrue with a vl pattern of
				; min(VBYTES/4, 8) word lanes governs both loads, the merging fadd and the
				; store back to %a.
				define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
				; CHECK-LABEL: @fadd_v8f32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <8 x float>, <8 x float>* %a
				%op2 = load <8 x float>, <8 x float>* %b
				%res = fadd <8 x float> %op1, %op2
				store <8 x float> %res, <8 x float>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
				; CHECK-LABEL: @fadd_v16f32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x float>, <16 x float>* %a
				%op2 = load <16 x float>, <16 x float>* %b
				%res = fadd <16 x float> %op1, %op2
				store <16 x float> %res, <16 x float>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
				; CHECK-LABEL: @fadd_v32f32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x float>, <32 x float>* %a
				%op2 = load <32 x float>, <32 x float>* %b
				%res = fadd <32 x float> %op1, %op2
				store <32 x float> %res, <32 x float>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
				; CHECK-LABEL: @fadd_v64f32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <64 x float>, <64 x float>* %a
				%op2 = load <64 x float>, <64 x float>* %b
				%res = fadd <64 x float> %op1, %op2
				store <64 x float> %res, <64 x float>* %a
				ret void
				}

				; Don't use SVE for 64-bit vectors.
				define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
				; CHECK-LABEL: @fadd_v1f64
				; CHECK: fadd d0, d0, d1
				; CHECK: ret
				%res = fadd <1 x double> %op1, %op2
				ret <1 x double> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
				; CHECK-LABEL: @fadd_v2f64
				; CHECK: fadd v0.2d, v0.2d, v1.2d
				; CHECK: ret
				%res = fadd <2 x double> %op1, %op2
				ret <2 x double> %res
				}

				; 4 x double (256-bit) fadd lowered to SVE: a single ptrue with a vl pattern
				; of min(VBYTES/8, 4) doubleword lanes governs both loads, the merging fadd
				; and the store back to %a.
				define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
				; CHECK-LABEL: @fadd_v4f64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <4 x double>, <4 x double>* %a
				%op2 = load <4 x double>, <4 x double>* %b
				%res = fadd <4 x double> %op1, %op2
				store <4 x double> %res, <4 x double>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
				; CHECK-LABEL: @fadd_v8f64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <8 x double>, <8 x double>* %a
				%op2 = load <8 x double>, <8 x double>* %b
				%res = fadd <8 x double> %op1, %op2
				store <8 x double> %res, <8 x double>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
				; CHECK-LABEL: @fadd_v16f64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x double>, <16 x double>* %a
				%op2 = load <16 x double>, <16 x double>* %b
				%res = fadd <16 x double> %op1, %op2
				store <16 x double> %res, <16 x double>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
				; already cover the general legalisation cases.
				define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
				; CHECK-LABEL: @fadd_v32f64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x double>, <32 x double>* %a
				%op2 = load <32 x double>, <32 x double>* %b
				%res = fadd <32 x double> %op1, %op2
				store <32 x double> %res, <32 x double>* %a
				ret void
				}

				attributes #0 = { "target-features"="+sve" }

llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll

This file was added.

				; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
				; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
				; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
				; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
				; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK

				; VBYTES represents the useful byte size of a vector register from the code
				; generator's point of view. It is clamped to power-of-2 values because
				; only power-of-2 vector lengths are considered legal, regardless of the
				; user specified vector length.

				target triple = "aarch64-unknown-linux-gnu"

				; Don't use SVE when its registers are no bigger than NEON.
				; NO_SVE-NOT: ptrue

				; Don't use SVE for 64-bit vectors.
				define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
				; CHECK-LABEL: @add_v8i8
				; CHECK: add v0.8b, v0.8b, v1.8b
				; CHECK: ret
				%res = add <8 x i8> %op1, %op2
				ret <8 x i8> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
				; CHECK-LABEL: @add_v16i8
				; CHECK: add v0.16b, v0.16b, v1.16b
				; CHECK: ret
				%res = add <16 x i8> %op1, %op2
				ret <16 x i8> %res
				}

				; 32 x i8 (256-bit) add lowered to SVE: a single ptrue with a vl pattern of
				; min(VBYTES, 32) byte lanes governs both loads, the merging add and the
				; store back to %a.
				define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
				; CHECK-LABEL: @add_v32i8
				; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
				; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
				; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x i8>, <32 x i8>* %a
				%op2 = load <32 x i8>, <32 x i8>* %b
				%res = add <32 x i8> %op1, %op2
				store <32 x i8> %res, <32 x i8>* %a
				ret void
				}

				; 64 x i8 (512-bit) add. The unprefixed directives cover the first register's
				; worth of data at [x0]/[x1]. Runs that enable the VBITS_LE_256 prefix also
				; expect a second load/add/store group that uses a register offset holding
				; VBYTES. DAG-style directives are used so the groups may be emitted in any
				; order.
				define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
				; CHECK-LABEL: @add_v64i8
				; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
				; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
				; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
				; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
				; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
				; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
				; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
				; CHECK: ret
				%op1 = load <64 x i8>, <64 x i8>* %a
				%op2 = load <64 x i8>, <64 x i8>* %b
				%res = add <64 x i8> %op1, %op2
				store <64 x i8> %res, <64 x i8>* %a
				ret void
				}

				; 128 x i8 (1024-bit) add. The unprefixed directives cover the first
				; register's worth of data at [x0]/[x1]. Runs that enable VBITS_LE_512 also
				; expect a group at register offset VBYTES; runs that enable VBITS_LE_256
				; further expect groups at offsets 2*VBYTES and 3*VBYTES. DAG-style
				; directives are used so the groups may be emitted in any order.
				define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
				; CHECK-LABEL: @add_v128i8
				; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
				; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
				; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
				; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
				; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
				; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
				; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
				; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
				; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
				; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
				; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
				; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
				; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
				; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
				; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
				; CHECK: ret
				%op1 = load <128 x i8>, <128 x i8>* %a
				%op2 = load <128 x i8>, <128 x i8>* %b
				%res = add <128 x i8> %op1, %op2
				store <128 x i8> %res, <128 x i8>* %a
				ret void
				}

				; 256 x i8 (2048-bit) add. The unprefixed directives cover the first
				; register's worth of data at [x0]/[x1]. Additional load/add/store groups are
				; expected depending on which prefixes the run enables: VBITS_LE_1024 adds
				; the group at register offset VBYTES, VBITS_LE_512 adds offsets 2*VBYTES and
				; 3*VBYTES, and VBITS_LE_256 adds offsets 4*VBYTES through 7*VBYTES.
				; DAG-style directives are used so the groups may be emitted in any order.
				define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
				; CHECK-LABEL: @add_v256i8
				; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
				; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
				; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
				; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
				; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
				; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
				; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
				; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
				; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
				; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
				; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
				; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
				; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
				; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
				; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
				; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
				; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
				; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
				; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
				; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
				; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[PG]]/m, [[OP1_4]].b, [[OP2_4]].b
				; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
				; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
				; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[PG]]/m, [[OP1_5]].b, [[OP2_5]].b
				; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
				; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
				; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[PG]]/m, [[OP1_6]].b, [[OP2_6]].b
				; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
				; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
				; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
				; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
				; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[PG]]/m, [[OP1_7]].b, [[OP2_7]].b
				; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
				; CHECK: ret
				%op1 = load <256 x i8>, <256 x i8>* %a
				%op2 = load <256 x i8>, <256 x i8>* %b
				%res = add <256 x i8> %op1, %op2
				store <256 x i8> %res, <256 x i8>* %a
				ret void
				}

				; Don't use SVE for 64-bit vectors.
				define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
				; CHECK-LABEL: @add_v4i16
				; CHECK: add v0.4h, v0.4h, v1.4h
				; CHECK: ret
				%res = add <4 x i16> %op1, %op2
				ret <4 x i16> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
				; CHECK-LABEL: @add_v8i16
				; CHECK: add v0.8h, v0.8h, v1.8h
				; CHECK: ret
				%res = add <8 x i16> %op1, %op2
				ret <8 x i16> %res
				}

				; 16 x i16 (256-bit) add lowered to SVE: a single ptrue with a vl pattern of
				; min(VBYTES/2, 16) halfword lanes governs both loads, the merging add and
				; the store back to %a.
				define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
				; CHECK-LABEL: @add_v16i16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x i16>, <16 x i16>* %a
				%op2 = load <16 x i16>, <16 x i16>* %b
				%res = add <16 x i16> %op1, %op2
				store <16 x i16> %res, <16 x i16>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
				; CHECK-LABEL: @add_v32i16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x i16>, <32 x i16>* %a
				%op2 = load <32 x i16>, <32 x i16>* %b
				%res = add <32 x i16> %op1, %op2
				store <32 x i16> %res, <32 x i16>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
				; CHECK-LABEL: @add_v64i16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <64 x i16>, <64 x i16>* %a
				%op2 = load <64 x i16>, <64 x i16>* %b
				%res = add <64 x i16> %op1, %op2
				store <64 x i16> %res, <64 x i16>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
				; CHECK-LABEL: @add_v128i16
				; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
				; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
				; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <128 x i16>, <128 x i16>* %a
				%op2 = load <128 x i16>, <128 x i16>* %b
				%res = add <128 x i16> %op1, %op2
				store <128 x i16> %res, <128 x i16>* %a
				ret void
				}

				; Don't use SVE for 64-bit vectors.
				define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
				; CHECK-LABEL: @add_v2i32
				; CHECK: add v0.2s, v0.2s, v1.2s
				; CHECK: ret
				%res = add <2 x i32> %op1, %op2
				ret <2 x i32> %res
				}

				; Don't use SVE for 128-bit vectors.
				define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
				; CHECK-LABEL: @add_v4i32
				; CHECK: add v0.4s, v0.4s, v1.4s
				; CHECK: ret
				%res = add <4 x i32> %op1, %op2
				ret <4 x i32> %res
				}

				; 8 x i32 (256-bit) add lowered to SVE: a single ptrue with a vl pattern of
				; min(VBYTES/4, 8) word lanes governs both loads, the merging add and the
				; store back to %a.
				define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
				; CHECK-LABEL: @add_v8i32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <8 x i32>, <8 x i32>* %a
				%op2 = load <8 x i32>, <8 x i32>* %b
				%res = add <8 x i32> %op1, %op2
				store <8 x i32> %res, <8 x i32>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
				; CHECK-LABEL: @add_v16i32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x i32>, <16 x i32>* %a
				%op2 = load <16 x i32>, <16 x i32>* %b
				%res = add <16 x i32> %op1, %op2
				store <16 x i32> %res, <16 x i32>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
				; CHECK-LABEL: @add_v32i32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x i32>, <32 x i32>* %a
				%op2 = load <32 x i32>, <32 x i32>* %b
				%res = add <32 x i32> %op1, %op2
				store <32 x i32> %res, <32 x i32>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				; Loads two <64 x i32> vectors from %a and %b, adds them and stores the sum
				; back to %a. The check lines match a single ptrue/ld1w/add/st1w sequence on
				; .s elements, with the ptrue element count being min(VBYTES/4, 64).
				; NOTE(review): VBYTES is a FileCheck numeric variable defined outside this
				; part of the file, presumably the SVE vector length in bytes; confirm.
				define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
				; CHECK-LABEL: @add_v64i32
				; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
				; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
				; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <64 x i32>, <64 x i32>* %a
				%op2 = load <64 x i32>, <64 x i32>* %b
				%res = add <64 x i32> %op1, %op2
				store <64 x i32> %res, <64 x i32>* %a
				ret void
				}

				; Don't use SVE for 64-bit vectors.
				; Takes both <1 x i64> operands by value and returns their sum by value. The
				; check lines expect a single add on d registers (d0 = d0 + d1) followed by
				; ret, i.e. no predicate setup or z-register instructions are matched.
				define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
				; CHECK-LABEL: @add_v1i64
				; CHECK: add d0, d0, d1
				; CHECK: ret
				%res = add <1 x i64> %op1, %op2
				ret <1 x i64> %res
				}

				; Don't use SVE for 128-bit vectors.
				; Takes both <2 x i64> operands by value and returns their sum by value. The
				; check lines expect a single add on v registers with the .2d arrangement
				; (v0 = v0 + v1) followed by ret, i.e. no predicate setup or z-register
				; instructions are matched.
				define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
				; CHECK-LABEL: @add_v2i64
				; CHECK: add v0.2d, v0.2d, v1.2d
				; CHECK: ret
				%res = add <2 x i64> %op1, %op2
				ret <2 x i64> %res
				}

				; Loads two <4 x i64> vectors from %a and %b, adds them and stores the sum
				; back to %a. The check lines expect a ptrue on .d elements whose element
				; count is min(VBYTES/8, 4), predicated ld1d loads from x0 and x1, a
				; predicated add on .d elements and a predicated st1d store to x0.
				; NOTE(review): VBYTES is a FileCheck numeric variable defined outside this
				; part of the file, presumably the SVE vector length in bytes; confirm.
				define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
				; CHECK-LABEL: @add_v4i64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <4 x i64>, <4 x i64>* %a
				%op2 = load <4 x i64>, <4 x i64>* %b
				%res = add <4 x i64> %op1, %op2
				store <4 x i64> %res, <4 x i64>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				; Loads two <8 x i64> vectors from %a and %b, adds them and stores the sum
				; back to %a. The check lines match a single ptrue/ld1d/add/st1d sequence on
				; .d elements, with the ptrue element count being min(VBYTES/8, 8).
				; NOTE(review): VBYTES is a FileCheck numeric variable defined outside this
				; part of the file, presumably the SVE vector length in bytes; confirm.
				define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
				; CHECK-LABEL: @add_v8i64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <8 x i64>, <8 x i64>* %a
				%op2 = load <8 x i64>, <8 x i64>* %b
				%res = add <8 x i64> %op1, %op2
				store <8 x i64> %res, <8 x i64>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				; Loads two <16 x i64> vectors from %a and %b, adds them and stores the sum
				; back to %a. The check lines match a single ptrue/ld1d/add/st1d sequence on
				; .d elements, with the ptrue element count being min(VBYTES/8, 16).
				; NOTE(review): VBYTES is a FileCheck numeric variable defined outside this
				; part of the file, presumably the SVE vector length in bytes; confirm.
				define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
				; CHECK-LABEL: @add_v16i64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <16 x i64>, <16 x i64>* %a
				%op2 = load <16 x i64>, <16 x i64>* %b
				%res = add <16 x i64> %op1, %op2
				store <16 x i64> %res, <16 x i64>* %a
				ret void
				}

				; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
				; already cover the general legalisation cases.
				; Loads two <32 x i64> vectors from %a and %b, adds them and stores the sum
				; back to %a. The check lines match a single ptrue/ld1d/add/st1d sequence on
				; .d elements, with the ptrue element count being min(VBYTES/8, 32).
				; NOTE(review): VBYTES is a FileCheck numeric variable defined outside this
				; part of the file, presumably the SVE vector length in bytes; confirm.
				define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
				; CHECK-LABEL: @add_v32i64
				; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
				; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
				; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
				; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
				; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
				; CHECK: ret
				%op1 = load <32 x i64>, <32 x i64>* %a
				%op2 = load <32 x i64>, <32 x i64>* %b
				%res = add <32 x i64> %op1, %op2
				store <32 x i64> %res, <32 x i64>* %a
				ret void
				}

				; Attribute group #0, referenced by the test functions in this file, sets the
				; "target-features" string attribute to "+sve".
				attributes #0 = { "target-features"="+sve" }

This is an archive of the discontinued LLVM Phabricator instance.

[SVE] Code generation for fixed length vector adds.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 273824

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Target/AArch64/SVEInstrFormats.td

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll

llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SVE] Code generation for fixed length vector adds.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 273824

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Target/AArch64/SVEInstrFormats.td

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll

llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll

[SVE] Code generation for fixed length vector adds.
ClosedPublic