Diff 248670

llvm/include/llvm/IR/IntrinsicsAArch64.td

Show First 20 Lines • Show All 771 Lines • ▼ Show 20 Lines
def llvm_nxv4i32_ty : LLVMType<nxv4i32>;		def llvm_nxv4i32_ty : LLVMType<nxv4i32>;
def llvm_nxv2i64_ty : LLVMType<nxv2i64>;		def llvm_nxv2i64_ty : LLVMType<nxv2i64>;
def llvm_nxv8f16_ty : LLVMType<nxv8f16>;		def llvm_nxv8f16_ty : LLVMType<nxv8f16>;
def llvm_nxv4f32_ty : LLVMType<nxv4f32>;		def llvm_nxv4f32_ty : LLVMType<nxv4f32>;
def llvm_nxv2f64_ty : LLVMType<nxv2f64>;		def llvm_nxv2f64_ty : LLVMType<nxv2f64>;

let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".		let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".

		// Signature for the intrinsic that packs two vectors into one tuple vector.
		// The result is one overloaded vector type; the two operands share a second
		// overloaded vector type (LLVMMatchType<1> ties operand 1 to operand 0).
		// NOTE(review): IntrReadMem looks like a conservative marker to keep the
		// mid-end from transforming these calls rather than a real memory read --
		// confirm whether it can be dropped.
		class AdvSIMD_SVE_Create_2Vector_Tuple
		: Intrinsic<[llvm_anyvector_ty],
		[llvm_anyvector_ty, LLVMMatchType<1>],
		[IntrReadMem]>;

		// Signature for the intrinsic that packs three vectors into one tuple
		// vector. The result is one overloaded vector type; all three operands share
		// a second overloaded vector type (via LLVMMatchType<1>).
		// NOTE(review): IntrReadMem looks like a conservative marker rather than a
		// real memory read -- confirm whether it can be dropped.
		class AdvSIMD_SVE_Create_3Vector_Tuple
		: Intrinsic<[llvm_anyvector_ty],
		[llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>],
		[IntrReadMem]>;

		// Signature for the intrinsic that packs four vectors into one tuple vector.
		// The result is one overloaded vector type; all four operands share a second
		// overloaded vector type (via LLVMMatchType<1>).
		// NOTE(review): IntrReadMem looks like a conservative marker rather than a
		// real memory read -- confirm whether it can be dropped.
		class AdvSIMD_SVE_Create_4Vector_Tuple
		: Intrinsic<[llvm_anyvector_ty],
		[llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>,
		LLVMMatchType<1>],
		[IntrReadMem]>;

		// Signature for the intrinsic that replaces one vector inside a tuple.
		//   operand 0: the tuple, same type as the result (LLVMMatchType<0>)
		//   operand 1: i32 index; ImmArg<1> requires it to be an immediate
		//   operand 2: the vector to insert (a second overloaded vector type)
		// NOTE(review): IntrReadMem looks like a conservative marker rather than a
		// real memory read -- confirm whether it can be dropped.
		class AdvSIMD_SVE_Set_Vector_Tuple
		: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty],
		[IntrReadMem, ImmArg<1>]>;

		// Signature for the intrinsic that extracts one vector from a tuple.
		//   result:    the extracted vector (overloaded vector type)
		//   operand 0: the tuple (a second overloaded vector type)
		//   operand 1: i32 index; ImmArg<1> requires it to be an immediate
		// NOTE(review): no pointer operand is present, so IntrReadMem and
		// IntrArgMemOnly look like conservative markers only -- confirm whether
		// they can be dropped.
		class AdvSIMD_SVE_Get_Vector_Tuple
		: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
		[IntrReadMem, IntrArgMemOnly, ImmArg<1>]>;
		craig.topperUnsubmitted Not Done Reply Inline Actions Sorry for digging up an old patch here. Why is this IntrArgMemOnly? And why are any of these IntrReadMem? We're working on similar intrinsics for RISCV vector support and were wondering if we were missing something. craig.topper: Sorry for digging up an old patch here. Why is this IntrArgMemOnly? And why are any of these…
		c-rhodesAuthorUnsubmitted Not Done Reply Inline Actions Sorry for digging up an old patch here. Why is this IntrArgMemOnly? And why are any of these IntrReadMem? We're working on similar intrinsics for RISCV vector support and were wondering if we were missing something. Apologies for the delayed response. When these intrinsics were first implemented `CopyToReg`/`CopyFromReg` didn't support some of the more unusual tuple types like `nxv6i64` which could be created by an LD3 for example. This prevented copying between basic blocks since the compiler would crash. To get around this a temporary constraint of these intrinsics was they must immediately be followed by an extract to prevent the backend from ever seeing such types. I think at the time this was ok since these intrinsics were only used by the Arm C Language Extension (ACLE) intrinsics so we could apply this constraint, although it required marking the intrinsics as touching memory to prevent the mid end from mucking around with them. The `CopyToReg`/`CopyFromReg` issue has since been fixed however, so I can see no reason why these intrinsics can't be updated to mark them as not touching memory, I'll make a note to address that, cheers. c-rhodes: > Sorry for digging up an old patch here. Why is this IntrArgMemOnly? And why are any of these…

class AdvSIMD_1Vec_PredLoad_Intrinsic		class AdvSIMD_1Vec_PredLoad_Intrinsic
: Intrinsic<[llvm_anyvector_ty],		: Intrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,		[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMPointerTo<0>],		LLVMPointerTo<0>],
[IntrReadMem, IntrArgMemOnly]>;		[IntrReadMem, IntrArgMemOnly]>;

class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic		class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic
: Intrinsic<[llvm_anyvector_ty],		: Intrinsic<[llvm_anyvector_ty],
▲ Show 20 Lines • Show All 462 Lines • ▼ Show 20 Lines	: Intrinsic<[],
[		[
llvm_anyvector_ty,		llvm_anyvector_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,		LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_anyvector_ty, llvm_i64_ty		llvm_anyvector_ty, llvm_i64_ty
],		],
[IntrWriteMem, IntrArgMemOnly]>;		[IntrWriteMem, IntrArgMemOnly]>;

//		//
		// Vector tuple creation intrinsics (ACLE)
		//

		def int_aarch64_sve_tuple_create2 : AdvSIMD_SVE_Create_2Vector_Tuple;
		def int_aarch64_sve_tuple_create3 : AdvSIMD_SVE_Create_3Vector_Tuple;
		def int_aarch64_sve_tuple_create4 : AdvSIMD_SVE_Create_4Vector_Tuple;

		//
		// Vector tuple insertion/extraction intrinsics (ACLE)
		//

		def int_aarch64_sve_tuple_get : AdvSIMD_SVE_Get_Vector_Tuple;
		def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple;

		//
// Loads		// Loads
//		//

def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;		def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;

def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;		def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;		def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;

▲ Show 20 Lines • Show All 891 Lines • Show Last 20 Lines

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 725 Lines • ▼ Show 20 Lines	static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");		assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");

unsigned IntermediateNumElts = IntermediateVT.isVector() ?		unsigned IntermediateNumElts = IntermediateVT.isVector() ?
IntermediateVT.getVectorNumElements() : 1;		IntermediateVT.getVectorNumElements() : 1;

// Convert the vector to the appropriate type if necessary.		// Convert the vector to the appropriate type if necessary.
unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;		unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;

EVT BuiltVectorTy = EVT::getVectorVT(		EVT BuiltVectorTy =
*DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);		EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
		DestVectorNoElts, ValueVT.isScalableVector());
if (ValueVT != BuiltVectorTy) {		if (ValueVT != BuiltVectorTy) {
if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))		if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
Val = Widened;		Val = Widened;

Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);		Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
}		}

// Split the vector into intermediate operands.		// Split the vector into intermediate operands.
▲ Show 20 Lines • Show All 6,280 Lines • ▼ Show 20 Lines	case Intrinsic::ptrmask: {

EVT DestVT =		EVT DestVT =
EVT(DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));		EVT(DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));

setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), DestVT, Ptr,		setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), DestVT, Ptr,
DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));		DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));
return;		return;
}		}
		case Intrinsic::aarch64_sve_tuple_get: {
		SDValue Src1 = getValue(I.getOperand(0));
		SDValue Idx = getValue(I.getOperand(1));

		assert(Src1.getOpcode() == ISD::CONCAT_VECTORS &&
		"Unexpected operand for sve_tuple_get");
		efriedmaUnsubmitted Not Done Reply Inline Actions This assertion seems ambitious; I'm not sure how you plan to ensure that the sve.tuple.get intrinsic is in the same basic block as the intrinsic that produces the value. Is there some reason you're putting this handling into the target-independent SelectionDAGBuilder, as opposed to target-specific code? efriedma: This assertion seems ambitious; I'm not sure how you plan to ensure that the sve.tuple.get…
		c-rhodesAuthorUnsubmitted Not Done Reply Inline Actions This assertion seems ambitious; I'm not sure how you plan to ensure that the sve.tuple.get intrinsic is in the same basic block as the intrinsic that produces the value. Sander's patch D80139 fixes copying between BBs and I've updated this patch to use `EXTRACT_SUBVECTOR` to get the vector from the tuple type, rather than looking through the `CONCAT_VECTOR`, so this assert has been removed. Is there some reason you're putting this handling into the target-independent SelectionDAGBuilder, as opposed to target-specific code? Tuple types for structured loads/stores were originally implemented with sizeless structs downstream but adding sizeless structs to the C standard isn't going to happen, so we chose to represent them as multiples of 128-bit vectors, e.g. `svint32x2_t` -> `<n x 8 x i32>`. For vector tuples containing 2 and 4 vectors (LD2/LD4, ST2/ST4) this was fine as the types are powers of 2 and can be broken down by the existing mechanisms, but for LD3/ST3 the types are odd sizes: `svint8x3_t` -> `<vscale x 48 x i8>` `svint16x3_t` -> `<vscale x 24 x i16>` `svint32x3_t` -> `<vscale x 12 x i32>` `svint64x3_t` -> `<vscale x 6 x i64>` Which are problematic for legalisation when it comes to widening/splitting. To do this lowering in target-specific AArch64ISelLowering MVTs would have to be defined for these types which aren't legal. By lowering here we avoid these issues as these types don't reach type legalization. c-rhodes: > This assertion seems ambitious; I'm not sure how you plan to ensure that the sve.tuple.get…
		efriedmaUnsubmitted Not Done Reply Inline Actions If you need to run before legalization, you can put code in AArch64TargetLowering::PerformDAGCombine. Alternatively, additional MVTs wouldn't be that terrible. efriedma: If you need to run before legalization, you can put code in AArch64TargetLowering…
		c-rhodesAuthorUnsubmitted Not Done Reply Inline Actions If you need to run before legalization, you can put code in AArch64TargetLowering::PerformDAGCombine. That's a much better idea :) I've implemented your suggestion, cheers! Alternatively, additional MVTs wouldn't be that terrible. I'm a little confused about this, there's a comment at the top of `MachineValueType.h`: // This file defines the set of machine-level target independent types which // legal values in the code generator use. which to me implies MVTs should only be implemented for legal types which tuple vectors aren't, although I've also seen `v3i32` which seems to be used in the AMDGPU backend but I don't know if the ISA specifies a register for that. I don't think I want to implement MVTs in this patch but I'm curious if it's ok to define them for illegal types? c-rhodes: > If you need to run before legalization, you can put code in AArch64TargetLowering…
		efriedmaUnsubmitted Not Done Reply Inline Actions We treat the "legal values in the code generator" bit a little loosely; really, it's any type that is useful to define as an MVT for some backend. And yes, if I recall that discussion correctly, I think AMDGPU actually does have operations on v3i32 registers. efriedma: We treat the "legal values in the code generator" bit a little loosely; really, it's any type…

		uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
		if (IdxConst > Src1->getNumOperands() - 1)
		report_fatal_error("index larger than expected");
		setValue(&I, SDValue(Src1.getOperand(IdxConst)));
		return;
		}
		case Intrinsic::aarch64_sve_tuple_set: {
		  // Lower sve.tuple.set(tuple, idx, vec): rebuild the tuple's CONCAT_VECTORS
		  // with the idx-th sub-vector replaced by vec.
		  SDValue Tuple = getValue(I.getOperand(0));
		  SDValue Idx = getValue(I.getOperand(1));
		  SDValue Vec = getValue(I.getOperand(2));

		  // The tuple must still be visible as the CONCAT_VECTORS that created it,
		  // because its operands are read directly below.
		  assert(Tuple.getOpcode() == ISD::CONCAT_VECTORS &&
		         "Unexpected operand for sve_tuple_set");

		  uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
		  // Reject out-of-range indices the same way sve_tuple_get does; without
		  // this check the loop below would silently return the tuple unchanged.
		  if (IdxConst >= Tuple->getNumOperands())
		    report_fatal_error("index larger than expected");

		  SmallVector<SDValue, 4> Opnds;
		  for (unsigned J = 0; J < Tuple->getNumOperands(); ++J)
		    Opnds.push_back(J == IdxConst ? Vec : Tuple->getOperand(J));
		  SDValue Result =
		      DAG.getNode(ISD::CONCAT_VECTORS, sdl, Tuple.getValueType(), Opnds);
		  setValue(&I, Result);
		  return;
		}
		case Intrinsic::aarch64_sve_tuple_create2:
		case Intrinsic::aarch64_sve_tuple_create3:
		case Intrinsic::aarch64_sve_tuple_create4: {
		  // Lower sve.tuple.createN(v0, ..., vN-1) to a single CONCAT_VECTORS of
		  // all call arguments.
		  unsigned N = I.getNumArgOperands();
		  SmallVector<SDValue, 4> Opnds;
		  for (const auto &Arg : I.args())
		    Opnds.push_back(getValue(Arg.get()));

		  // The result has the operands' element type and N times the element
		  // count of the first operand.
		  EVT VT = Opnds[0].getValueType();
		  EVT EltVT = VT.getVectorElementType();
		  EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
		                                VT.getVectorElementCount() * N);
		  SDValue Result = DAG.getNode(ISD::CONCAT_VECTORS, sdl, DestVT, Opnds);
		  setValue(&I, Result);
		  return;
		}
}		}
}		}

void SelectionDAGBuilder::visitConstrainedFPIntrinsic(		void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
const ConstrainedFPIntrinsic &FPI) {		const ConstrainedFPIntrinsic &FPI) {
SDLoc sdl = getCurSDLoc();		SDLoc sdl = getCurSDLoc();

const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
▲ Show 20 Lines • Show All 3,675 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 419 Lines • ▼ Show 20 Lines	public:
MachineBasicBlock *		MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,		EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;		MachineBasicBlock *MBB) const override;

bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,		bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,		MachineFunction &MF,
unsigned Intrinsic) const override;		unsigned Intrinsic) const override;

		/// Calling-convention override describing how vector type \p VT is broken
		/// into registers: fills in \p IntermediateVT, \p NumIntermediates and
		/// \p RegisterVT and returns the number of registers.
		unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
		CallingConv::ID CC, EVT VT,
		EVT &IntermediateVT,
		unsigned &NumIntermediates,
		MVT &RegisterVT) const override;

		/// Calling-convention override returning the register type used to pass
		/// or return a value of type \p VT.
		MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
		EVT VT) const override;
		/// Calling-convention override returning how many registers are needed to
		/// pass or return a value of type \p VT.
		unsigned getNumRegistersForCallingConv(LLVMContext &Context,
		CallingConv::ID CC,
		EVT VT) const override;

bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,		bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
EVT NewVT) const override;		EVT NewVT) const override;

bool isTruncateFree(Type Ty1, Type Ty2) const override;		bool isTruncateFree(Type Ty1, Type Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;		bool isTruncateFree(EVT VT1, EVT VT2) const override;

bool isProfitableToHoist(Instruction *I) const override;		bool isProfitableToHoist(Instruction *I) const override;

▲ Show 20 Lines • Show All 434 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,997 Lines • ▼ Show 20 Lines	bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}		}
default:		default:
break;		break;
}		}

return false;		return false;
}		}

		/// Returns the register type used to pass or return \p VT.
		///
		/// Scalable vectors whose known-minimum size exceeds
		/// AArch64::SVEBitsPerBlock are mapped, by element type, to the matching
		/// single SVE vector type. Every other type is deferred to the default
		/// TargetLowering implementation.
		MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
		                                                         CallingConv::ID CC,
		                                                         EVT VT) const {
		  // Anything that is not an over-sized scalable vector uses the default.
		  if (!VT.isScalableVector() ||
		      VT.getSizeInBits().getKnownMinSize() <= AArch64::SVEBitsPerBlock)
		    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

		  // Pick the SVE vector type that has the same element type.
		  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
		  case MVT::i8:
		    return MVT::nxv16i8;
		  case MVT::i16:
		    return MVT::nxv8i16;
		  case MVT::i32:
		    return MVT::nxv4i32;
		  case MVT::i64:
		    return MVT::nxv2i64;
		  case MVT::f16:
		    return MVT::nxv8f16;
		  case MVT::f32:
		    return MVT::nxv4f32;
		  case MVT::f64:
		    return MVT::nxv2f64;
		  default:
		    llvm_unreachable("Unsupported type for SVE vectors");
		  }
		}

		/// Describes how vector type \p VT is split into registers.
		///
		/// For scalable vectors whose known-minimum size exceeds
		/// AArch64::SVEBitsPerBlock, \p IntermediateVT and \p RegisterVT are both
		/// set to the type from getRegisterTypeForCallingConv, and
		/// \p NumIntermediates (also the return value) to the count from
		/// getNumRegistersForCallingConv. Every other type is deferred to the
		/// default TargetLowering implementation.
		unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
		    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
		    unsigned &NumIntermediates, MVT &RegisterVT) const {
		  // Anything that is not an over-sized scalable vector uses the default.
		  if (!VT.isScalableVector() ||
		      VT.getSizeInBits().getKnownMinSize() <= AArch64::SVEBitsPerBlock)
		    return TargetLowering::getVectorTypeBreakdownForCallingConv(
		        Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);

		  IntermediateVT = RegisterVT = getRegisterTypeForCallingConv(Context, CC, VT);
		  NumIntermediates = getNumRegistersForCallingConv(Context, CC, VT);
		  return NumIntermediates;
		}

		/// Returns how many registers are needed to pass or return \p VT.
		///
		/// For scalable vectors whose known-minimum size exceeds
		/// AArch64::SVEBitsPerBlock this is the size divided by
		/// AArch64::SVEBitsPerBlock; the size must be an exact multiple (asserted).
		/// Every other type is deferred to the default TargetLowering
		/// implementation.
		unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
		    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
		  // Anything that is not an over-sized scalable vector uses the default.
		  if (!VT.isScalableVector() ||
		      VT.getSizeInBits().getKnownMinSize() <= AArch64::SVEBitsPerBlock)
		    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

		  unsigned NumVectors =
		      VT.getSizeInBits().getKnownMinSize() / AArch64::SVEBitsPerBlock;
		  assert(NumVectors * AArch64::SVEBitsPerBlock ==
		             VT.getSizeInBits().getKnownMinSize() &&
		         "Not a multiple of a full SVE vector");
		  return NumVectors;
		}

bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,		bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,		ISD::LoadExtType ExtTy,
EVT NewVT) const {		EVT NewVT) const {
// TODO: This may be worth removing. Check regression tests for diffs.		// TODO: This may be worth removing. Check regression tests for diffs.
if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))		if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
return false;		return false;

// If we're reducing the load width in order to avoid having to use an extra		// If we're reducing the load width in order to avoid having to use an extra
▲ Show 20 Lines • Show All 4,056 Lines • ▼ Show 20 Lines	case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,		return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
/OnlyPackedOffsets=/false);		/OnlyPackedOffsets=/false);
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:		case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM);		return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM);
default:		default:
break;		break;
}		}
break;		break;
case ISD::GlobalAddress:		case ISD::GlobalAddress:
		efriedmaUnsubmitted Not Done Reply Inline Actions No point to explicitly checking isBeforeLegalizeOps(); the intrinsics should be gone after that. efriedma: No point to explicitly checking isBeforeLegalizeOps(); the intrinsics should be gone after that.
		c-rhodesAuthorUnsubmitted Done Reply Inline Actions Doh, done! c-rhodes: Doh, done!
return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());		return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
}		}
return SDValue();		return SDValue();
}		}

// Check if the return value is used as only a return value, as otherwise		// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for		// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs		// target ISD nodes that are returns and any other "odd" constructs
▲ Show 20 Lines • Show All 705 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll

This file was added.

				; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s

				;
				; svint8x2_t
				;

				define <vscale x 32 x i8> @ret_svint8x2_t(<vscale x 16 x i8> %unused_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2) #0 {
				; CHECK-LABEL: ret_svint8x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2)
				ret <vscale x 32 x i8> %tuple
				}

				define void @call_svint8x2_t(<vscale x 16 x i8> %dummy_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %dummy_z2, <vscale x 16 x i8> %z3) #0 {
				; CHECK-LABEL: call_svint8x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svint8x2_t
				%tuple = tail call <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z3)
				call void @callee_svint8x2_t(<vscale x 32 x i8> %tuple)
				ret void
				}

				;
				; svint16x2_t
				;

				define <vscale x 16 x i16> @ret_svint16x2_t(<vscale x 8 x i16> %unused_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2) #0 {
				; CHECK-LABEL: ret_svint16x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2)
				ret <vscale x 16 x i16> %tuple
				}

				define void @call_svint16x2_t(<vscale x 8 x i16> %dummy_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %dummy_z2, <vscale x 8 x i16> %z3) #0 {
				; CHECK-LABEL: call_svint16x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svint16x2_t
				%tuple = tail call <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z3)
				call void @callee_svint16x2_t(<vscale x 16 x i16> %tuple)
				ret void
				}

				;
				; svint32x2_t
				;

				define <vscale x 8 x i32> @ret_svint32x2_t(<vscale x 4 x i32> %unused_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2) #0 {
				; CHECK-LABEL: ret_svint32x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
				ret <vscale x 8 x i32> %tuple
				}

				define void @call_svint32x2_t(<vscale x 4 x i32> %dummy_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %dummy_z2, <vscale x 4 x i32> %z3) #0 {
				; CHECK-LABEL: call_svint32x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svint32x2_t
				%tuple = tail call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z3)
				call void @callee_svint32x2_t(<vscale x 8 x i32> %tuple)
				ret void
				}

				;
				; svint64x2_t
				;

				define <vscale x 4 x i64> @ret_svint64x2_t(<vscale x 2 x i64> %unused_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2) #0 {
				; CHECK-LABEL: ret_svint64x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2)
				ret <vscale x 4 x i64> %tuple
				}

				define void @call_svint64x2_t(<vscale x 2 x i64> %dummy_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %dummy_z2, <vscale x 2 x i64> %z3) #0 {
				; CHECK-LABEL: call_svint64x2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svint64x2_t
				%tuple = tail call <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z3)
				call void @callee_svint64x2_t(<vscale x 4 x i64> %tuple)
				ret void
				}

				;
				; svfloatx2_t
				;

				define <vscale x 8 x float> @ret_svfloatx2_t(<vscale x 4 x float> %unused_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2) #0 {
				; CHECK-LABEL: ret_svfloatx2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z2)
				ret <vscale x 8 x float> %tuple
				}

				define void @call_svfloatx2_t(<vscale x 4 x float> %dummy_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %dummy_z2, <vscale x 4 x float> %z3) #0 {
				; CHECK-LABEL: call_svfloatx2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svfloatx2_t
				%tuple = tail call <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z3)
				call void @callee_svfloatx2_t(<vscale x 8 x float> %tuple)
				ret void
				}

				;
				; svdoublex2_t
				;

				define <vscale x 4 x double> @ret_svdoublex2_t(<vscale x 2 x double> %unused_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %z2) #0 {
				; CHECK-LABEL: ret_svdoublex2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z2)
				ret <vscale x 4 x double> %tuple
				}

				define void @call_svdoublex2_t(<vscale x 2 x double> %dummy_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %dummy_z2, <vscale x 2 x double> %z3) #0 {
				; CHECK-LABEL: call_svdoublex2_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z3.d
				; CHECK-NEXT: bl callee_svdoublex2_t
				%tuple = tail call <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z3)
				call void @callee_svdoublex2_t(<vscale x 4 x double> %tuple)
				ret void
				}

				;
				; svint8x3_t
				;

				define <vscale x 48 x i8> @ret_svint8x3_t(<vscale x 16 x i8> %unused_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3) #0 {
				; CHECK-LABEL: ret_svint8x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3)
				ret <vscale x 48 x i8> %tuple
				}

				define void @call_svint8x3_t(<vscale x 16 x i8> %dummy_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %dummy_z3, <vscale x 16 x i8> %z4) #0 {
				; CHECK-LABEL: call_svint8x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint8x3_t
				%tuple = tail call <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z4)
				call void @callee_svint8x3_t(<vscale x 48 x i8> %tuple)
				ret void
				}

				;
				; svint16x3_t
				;

				define <vscale x 24 x i16> @ret_svint16x3_t(<vscale x 8 x i16> %unused_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3) #0 {
				; CHECK-LABEL: ret_svint16x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
				ret <vscale x 24 x i16> %tuple
				}

				define void @call_svint16x3_t(<vscale x 8 x i16> %dummy_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %dummy_z3, <vscale x 8 x i16> %z4) #0 {
				; CHECK-LABEL: call_svint16x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint16x3_t
				%tuple = tail call <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z4)
				call void @callee_svint16x3_t(<vscale x 24 x i16> %tuple)
				ret void
				}

				;
				; svint32x3_t
				;

				define <vscale x 12 x i32> @ret_svint32x3_t(<vscale x 4 x i32> %unused_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3) #0 {
				; CHECK-LABEL: ret_svint32x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3)
				ret <vscale x 12 x i32> %tuple
				}

				define void @call_svint32x3_t(<vscale x 4 x i32> %dummy_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %dummy_z3, <vscale x 4 x i32> %z4) #0 {
				; CHECK-LABEL: call_svint32x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint32x3_t
				%tuple = tail call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z4)
				call void @callee_svint32x3_t(<vscale x 12 x i32> %tuple)
				ret void
				}

				;
				; svint64x3_t
				;

				define <vscale x 6 x i64> @ret_svint64x3_t(<vscale x 2 x i64> %unused_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z3) #0 {
				; CHECK-LABEL: ret_svint64x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z3)
				ret <vscale x 6 x i64> %tuple
				}

				define void @call_svint64x3_t(<vscale x 2 x i64> %dummy_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %dummy_z3, <vscale x 2 x i64> %z4) #0 {
				; CHECK-LABEL: call_svint64x3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint64x3_t
				%tuple = tail call <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z4)
				call void @callee_svint64x3_t(<vscale x 6 x i64> %tuple)
				ret void
				}

				;
				; svfloatx3_t
				;

				define <vscale x 12 x float> @ret_svfloatx3_t(<vscale x 4 x float> %unused_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3) #0 {
				; CHECK-LABEL: ret_svfloatx3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
				ret <vscale x 12 x float> %tuple
				}

				define void @call_svfloatx3_t(<vscale x 4 x float> %dummy_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %dummy_z3, <vscale x 4 x float> %z4) #0 {
				; CHECK-LABEL: call_svfloatx3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svfloatx3_t
				%tuple = tail call <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z4)
				call void @callee_svfloatx3_t(<vscale x 12 x float> %tuple)
				ret void
				}

				;
				; svdoublex3_t
				;

				define <vscale x 6 x double> @ret_svdoublex3_t(<vscale x 2 x double> %unused_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z3) #0 {
				; CHECK-LABEL: ret_svdoublex3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z3)
				ret <vscale x 6 x double> %tuple
				}

				define void @call_svdoublex3_t(<vscale x 2 x double> %dummy_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %dummy_z3, <vscale x 2 x double> %z4) #0 {
				; CHECK-LABEL: call_svdoublex3_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svdoublex3_t
				%tuple = tail call <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z4)
				call void @callee_svdoublex3_t(<vscale x 6 x double> %tuple)
				ret void
				}

				;
				; svint8x4_t
				;

				; Returns a four-vector i8 tuple built from %z1, %z2, %z3 and %z4; the first
				; argument is not used. The checks expect the parts to be moved into z0-z3
				; (from z1-z4) right before the ret.
				define <vscale x 64 x i8> @ret_svint8x4_t(<vscale x 16 x i8> %unused_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3, <vscale x 16 x i8> %z4) #0 {
				; CHECK-LABEL: ret_svint8x4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3, <vscale x 16 x i8> %z4)
				ret <vscale x 64 x i8> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector i8 tuple and passes it to
				; @callee_svint8x4_t. The checks expect z5 to be moved into z3 first, then
				; z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svint8x4_t(<vscale x 16 x i8> %dummy_z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %dummy_z3, <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5) #0 {
				; CHECK-LABEL: call_svint8x4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint8x4_t
				%tuple = tail call <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5)
				call void @callee_svint8x4_t(<vscale x 64 x i8> %tuple)
				ret void
				}

				;
				; svint16x4_t
				;

				; Returns a four-vector i16 tuple built from %z1, %z2, %z3 and %z4; the first
				; argument is not used. The checks expect the parts to be moved into z0-z3
				; (from z1-z4) right before the ret.
				define <vscale x 32 x i16> @ret_svint16x4_t(<vscale x 8 x i16> %unused_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3, <vscale x 8 x i16> %z4) #0 {
				; CHECK-LABEL: ret_svint16x4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3, <vscale x 8 x i16> %z4)
				ret <vscale x 32 x i16> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector i16 tuple and passes it to
				; @callee_svint16x4_t. The checks expect z5 to be moved into z3 first, then
				; z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svint16x4_t(<vscale x 8 x i16> %dummy_z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %dummy_z3, <vscale x 8 x i16> %z4, <vscale x 8 x i16> %z5) #0 {
				; CHECK-LABEL: call_svint16x4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint16x4_t
				%tuple = tail call <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z4, <vscale x 8 x i16> %z5)
				call void @callee_svint16x4_t(<vscale x 32 x i16> %tuple)
				ret void
				}

				;
				; svint32x4_t
				;

				; Returns a four-vector i32 tuple built from %z1, %z2, %z3 and %z4; the first
				; argument is not used. The checks expect the parts to be moved into z0-z3
				; (from z1-z4) right before the ret.
				define <vscale x 16 x i32> @ret_svint32x4_t(<vscale x 4 x i32> %unused_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4) #0 {
				; CHECK-LABEL: ret_svint32x4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4)
				ret <vscale x 16 x i32> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector i32 tuple and passes it to
				; @callee_svint32x4_t. The checks expect z5 to be moved into z3 first, then
				; z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svint32x4_t(<vscale x 4 x i32> %dummy_z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %dummy_z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: call_svint32x4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint32x4_t
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5)
				call void @callee_svint32x4_t(<vscale x 16 x i32> %tuple)
				ret void
				}

				;
				; svint64x4_t
				;

				; Returns a four-vector i64 tuple built from %z1, %z2, %z3 and %z4; the first
				; argument is not used. The checks expect the parts to be moved into z0-z3
				; (from z1-z4) right before the ret.
				define <vscale x 8 x i64> @ret_svint64x4_t(<vscale x 2 x i64> %unused_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z3, <vscale x 2 x i64> %z4) #0 {
				; CHECK-LABEL: ret_svint64x4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z3, <vscale x 2 x i64> %z4)
				ret <vscale x 8 x i64> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector i64 tuple and passes it to
				; @callee_svint64x4_t. The checks expect z5 to be moved into z3 first, then
				; z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svint64x4_t(<vscale x 2 x i64> %dummy_z0, <vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %dummy_z3, <vscale x 2 x i64> %z4, <vscale x 2 x i64> %z5) #0 {
				; CHECK-LABEL: call_svint64x4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svint64x4_t
				%tuple = tail call <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64> %z1, <vscale x 2 x i64> %z2, <vscale x 2 x i64> %z4, <vscale x 2 x i64> %z5)
				call void @callee_svint64x4_t(<vscale x 8 x i64> %tuple)
				ret void
				}

				;
				; svfloatx4_t
				;

				; Returns a four-vector float tuple built from %z1, %z2, %z3 and %z4; the
				; first argument is not used. The checks expect the parts to be moved into
				; z0-z3 (from z1-z4) right before the ret.
				define <vscale x 16 x float> @ret_svfloatx4_t(<vscale x 4 x float> %unused_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3, <vscale x 4 x float> %z4) #0 {
				; CHECK-LABEL: ret_svfloatx4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3, <vscale x 4 x float> %z4)
				ret <vscale x 16 x float> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector float tuple and passes it to
				; @callee_svfloatx4_t. The checks expect z5 to be moved into z3 first, then
				; z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svfloatx4_t(<vscale x 4 x float> %dummy_z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %dummy_z3, <vscale x 4 x float> %z4, <vscale x 4 x float> %z5) #0 {
				; CHECK-LABEL: call_svfloatx4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svfloatx4_t
				%tuple = tail call <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z4, <vscale x 4 x float> %z5)
				call void @callee_svfloatx4_t(<vscale x 16 x float> %tuple)
				ret void
				}

				;
				; svdoublex4_t
				;

				; Returns a four-vector double tuple built from %z1, %z2, %z3 and %z4; the
				; first argument is not used. The checks expect the parts to be moved into
				; z0-z3 (from z1-z4) right before the ret.
				define <vscale x 8 x double> @ret_svdoublex4_t(<vscale x 2 x double> %unused_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z3, <vscale x 2 x double> %z4) #0 {
				; CHECK-LABEL: ret_svdoublex4_t
				; CHECK: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z3.d
				; CHECK-NEXT: mov z3.d, z4.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z3, <vscale x 2 x double> %z4)
				ret <vscale x 8 x double> %tuple
				}

				; Packs %z1, %z2, %z4 and %z5 into a four-vector double tuple and passes it
				; to @callee_svdoublex4_t. The checks expect z5 to be moved into z3 first,
				; then z1, z2 and z4 into z0, z1 and z2, followed by the bl.
				define void @call_svdoublex4_t(<vscale x 2 x double> %dummy_z0, <vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %dummy_z3, <vscale x 2 x double> %z4, <vscale x 2 x double> %z5) #0 {
				; CHECK-LABEL: call_svdoublex4_t
				; CHECK: mov z3.d, z5.d
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: mov z1.d, z2.d
				; CHECK-NEXT: mov z2.d, z4.d
				; CHECK-NEXT: bl callee_svdoublex4_t
				%tuple = tail call <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(<vscale x 2 x double> %z1, <vscale x 2 x double> %z2, <vscale x 2 x double> %z4, <vscale x 2 x double> %z5)
				call void @callee_svdoublex4_t(<vscale x 8 x double> %tuple)
				ret void
				}

				; Attribute group shared by the test functions: nounwind, with the SVE
				; target feature enabled.
				attributes #0 = { nounwind "target-features"="+sve" }

				; External callees taking a single wide scalable-vector argument. The vector
				; widths match the results of the tuple.create2 intrinsics declared below.
				declare void @callee_svint8x2_t(<vscale x 32 x i8>)
				declare void @callee_svint16x2_t(<vscale x 16 x i16>)
				declare void @callee_svint32x2_t(<vscale x 8 x i32>)
				declare void @callee_svint64x2_t(<vscale x 4 x i64>)
				declare void @callee_svfloatx2_t(<vscale x 8 x float>)
				declare void @callee_svdoublex2_t(<vscale x 4 x double>)

				; Callees whose argument widths match the tuple.create3 results.
				declare void @callee_svint8x3_t(<vscale x 48 x i8>)
				declare void @callee_svint16x3_t(<vscale x 24 x i16>)
				declare void @callee_svint32x3_t(<vscale x 12 x i32>)
				declare void @callee_svint64x3_t(<vscale x 6 x i64>)
				declare void @callee_svfloatx3_t(<vscale x 12 x float>)
				declare void @callee_svdoublex3_t(<vscale x 6 x double>)

				; Callees whose argument widths match the tuple.create4 results.
				declare void @callee_svint8x4_t(<vscale x 64 x i8>)
				declare void @callee_svint16x4_t(<vscale x 32 x i16>)
				declare void @callee_svint32x4_t(<vscale x 16 x i32>)
				declare void @callee_svint64x4_t(<vscale x 8 x i64>)
				declare void @callee_svfloatx4_t(<vscale x 16 x float>)
				declare void @callee_svdoublex4_t(<vscale x 8 x double>)


				; x2: tuple.create2 takes two vectors of one type and returns a single
				; vector with twice as many elements.
				declare <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
				declare <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)

				; x3: tuple.create3 takes three vectors of one type and returns a single
				; vector with three times as many elements.
				declare <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
				declare <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

				; x4: tuple.create4 takes four vectors of one type and returns a single
				; vector with four times as many elements.
				declare <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
				declare <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll

This file was added.

				; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s

				;
				; SVCREATE2 (i8)
				;

				; Builds a two-vector i8 tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 16 x i8> @test_svcreate2_s8_vec0(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s8_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8(<vscale x 32 x i8> %tuple, i32 0)
				ret <vscale x 16 x i8> %extract
				}

				; Builds a two-vector i8 tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 16 x i8> @test_svcreate2_s8_vec1(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s8_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8(<vscale x 32 x i8> %tuple, i32 1)
				ret <vscale x 16 x i8> %extract
				}

				;
				; SVCREATE2 (i16)
				;

				; Builds a two-vector i16 tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 8 x i16> @test_svcreate2_s16_vec0(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16(<vscale x 16 x i16> %tuple, i32 0)
				ret <vscale x 8 x i16> %extract
				}

				; Builds a two-vector i16 tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 8 x i16> @test_svcreate2_s16_vec1(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s16_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16(<vscale x 16 x i16> %tuple, i32 1)
				ret <vscale x 8 x i16> %extract
				}

				;
				; SVCREATE2 (half)
				;

				; Builds a two-vector half tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 8 x half> @test_svcreate2_f16_vec0(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x half> @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16(<vscale x 16 x half> %tuple, i32 0)
				ret <vscale x 8 x half> %extract
				}

				; Builds a two-vector half tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 8 x half> @test_svcreate2_f16_vec1(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f16_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x half> @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16(<vscale x 16 x half> %tuple, i32 1)
				ret <vscale x 8 x half> %extract
				}

				;
				; SVCREATE2 (i32)
				;

				; Builds a two-vector i32 tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 4 x i32> @test_svcreate2_s32_vec0(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32(<vscale x 8 x i32> %tuple, i32 0)
				ret <vscale x 4 x i32> %extract
				}

				; Builds a two-vector i32 tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 4 x i32> @test_svcreate2_s32_vec1(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s32_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32(<vscale x 8 x i32> %tuple, i32 1)
				ret <vscale x 4 x i32> %extract
				}

				;
				; SVCREATE2 (float)
				;

				; Builds a two-vector float tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 4 x float> @test_svcreate2_f32_vec0(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32(<vscale x 8 x float> %tuple, i32 0)
				ret <vscale x 4 x float> %extract
				}

				; Builds a two-vector float tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 4 x float> @test_svcreate2_f32_vec1(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f32_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32(<vscale x 8 x float> %tuple, i32 1)
				ret <vscale x 4 x float> %extract
				}

				;
				; SVCREATE2 (i64)
				;

				; Builds a two-vector i64 tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 2 x i64> @test_svcreate2_s64_vec0(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64(<vscale x 4 x i64> %tuple, i32 0)
				ret <vscale x 2 x i64> %extract
				}

				; Builds a two-vector i64 tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 2 x i64> @test_svcreate2_s64_vec1(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_s64_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64(<vscale x 4 x i64> %tuple, i32 1)
				ret <vscale x 2 x i64> %extract
				}

				;
				; SVCREATE2 (double)
				;

				; Builds a two-vector double tuple from %x0 and %x1, then extracts element 0.
				; The expected output is a lone ret on the line after the label.
				define <vscale x 2 x double> @test_svcreate2_f64_vec0(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double> %tuple, i32 0)
				ret <vscale x 2 x double> %extract
				}

				; Builds a two-vector double tuple from %x0 and %x1, then extracts element 1.
				; The expected output is a move of z1 into z0 followed by ret.
				define <vscale x 2 x double> @test_svcreate2_f64_vec1(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate2_f64_vec1:
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double> %tuple, i32 1)
				ret <vscale x 2 x double> %extract
				}

				;
				; SVCREATE3 (i8)
				;

				; Builds a three-vector i8 tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 16 x i8> @test_svcreate3_s8_vec0(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s8_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8(<vscale x 48 x i8> %tuple, i32 0)
				ret <vscale x 16 x i8> %extract
				}

				; Builds a three-vector i8 tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 16 x i8> @test_svcreate3_s8_vec2(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s8_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8(<vscale x 48 x i8> %tuple, i32 2)
				ret <vscale x 16 x i8> %extract
				}

				;
				; SVCREATE3 (i16)
				;

				; Builds a three-vector i16 tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 8 x i16> @test_svcreate3_s16_vec0(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16(<vscale x 24 x i16> %tuple, i32 0)
				ret <vscale x 8 x i16> %extract
				}

				; Builds a three-vector i16 tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 8 x i16> @test_svcreate3_s16_vec2(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s16_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16(<vscale x 24 x i16> %tuple, i32 2)
				ret <vscale x 8 x i16> %extract
				}
				;
				; SVCREATE3 (half)
				;

				; Builds a three-vector half tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 8 x half> @test_svcreate3_f16_vec0(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 24 x half> @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16(<vscale x 24 x half> %tuple, i32 0)
				ret <vscale x 8 x half> %extract
				}

				; Builds a three-vector half tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 8 x half> @test_svcreate3_f16_vec2(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f16_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 24 x half> @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16(<vscale x 24 x half> %tuple, i32 2)
				ret <vscale x 8 x half> %extract
				}


				;
				; SVCREATE3 (i32)
				;

				; Builds a three-vector i32 tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 4 x i32> @test_svcreate3_s32_vec0(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32(<vscale x 12 x i32> %tuple, i32 0)
				ret <vscale x 4 x i32> %extract
				}

				; Builds a three-vector i32 tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 4 x i32> @test_svcreate3_s32_vec2(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s32_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32(<vscale x 12 x i32> %tuple, i32 2)
				ret <vscale x 4 x i32> %extract
				}

				;
				; SVCREATE3 (float)
				;

				; Builds a three-vector float tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 4 x float> @test_svcreate3_f32_vec0(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32(<vscale x 12 x float> %tuple, i32 0)
				ret <vscale x 4 x float> %extract
				}

				; Builds a three-vector float tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 4 x float> @test_svcreate3_f32_vec2(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f32_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32(<vscale x 12 x float> %tuple, i32 2)
				ret <vscale x 4 x float> %extract
				}

				;
				; SVCREATE3 (i64)
				;

				; Builds a three-vector i64 tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 2 x i64> @test_svcreate3_s64_vec0(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64(<vscale x 6 x i64> %tuple, i32 0)
				ret <vscale x 2 x i64> %extract
				}

				; Builds a three-vector i64 tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 2 x i64> @test_svcreate3_s64_vec2(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_s64_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64(<vscale x 6 x i64> %tuple, i32 2)
				ret <vscale x 2 x i64> %extract
				}

				;
				; SVCREATE3 (double)
				;

				; Builds a three-vector double tuple from %x0, %x1 and %x2, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 2 x double> @test_svcreate3_f64_vec0(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double> %tuple, i32 0)
				ret <vscale x 2 x double> %extract
				}

				; Builds a three-vector double tuple from %x0, %x1 and %x2, then extracts
				; element 2. The expected output is a move of z2 into z0 followed by ret.
				define <vscale x 2 x double> @test_svcreate3_f64_vec2(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate3_f64_vec2:
				; CHECK-NEXT: mov z0.d, z2.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double> %tuple, i32 2)
				ret <vscale x 2 x double> %extract
				}

				;
				; SVCREATE4 (i8)
				;

				; Builds a four-vector i8 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 16 x i8> @test_svcreate4_s8_vec0(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2, <vscale x 16 x i8> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s8_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2, <vscale x 16 x i8> %x3)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8(<vscale x 64 x i8> %tuple, i32 0)
				ret <vscale x 16 x i8> %extract
				}

				; Builds a four-vector i8 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 3. The expected output is a move of z3 into z0 followed by ret.
				define <vscale x 16 x i8> @test_svcreate4_s8_vec3(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2, <vscale x 16 x i8> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s8_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2, <vscale x 16 x i8> %x3)
				%extract = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8(<vscale x 64 x i8> %tuple, i32 3)
				ret <vscale x 16 x i8> %extract
				}

				;
				; SVCREATE4 (i16)
				;

				; Builds a four-vector i16 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 8 x i16> @test_svcreate4_s16_vec0(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2, <vscale x 8 x i16> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2, <vscale x 8 x i16> %x3)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16(<vscale x 32 x i16> %tuple, i32 0)
				ret <vscale x 8 x i16> %extract
				}

				; Builds a four-vector i16 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 3. The expected output is a move of z3 into z0 followed by ret.
				define <vscale x 8 x i16> @test_svcreate4_s16_vec3(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2, <vscale x 8 x i16> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s16_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16> %x0, <vscale x 8 x i16> %x1, <vscale x 8 x i16> %x2, <vscale x 8 x i16> %x3)
				%extract = tail call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16(<vscale x 32 x i16> %tuple, i32 3)
				ret <vscale x 8 x i16> %extract
				}

				;
				; SVCREATE4 (half)
				;

				; Builds a four-vector half tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 8 x half> @test_svcreate4_f16_vec0(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f16_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x half> @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16(<vscale x 32 x half> %tuple, i32 0)
				ret <vscale x 8 x half> %extract
				}

				; Builds a four-vector half tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 3. The expected output is a move of z3 into z0 followed by ret.
				define <vscale x 8 x half> @test_svcreate4_f16_vec3(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f16_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 32 x half> @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3)
				%extract = tail call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16(<vscale x 32 x half> %tuple, i32 3)
				ret <vscale x 8 x half> %extract
				}

				;
				; SVCREATE4 (i32)
				;

				; Builds a four-vector i32 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 0. The expected output is a lone ret on the line after the label.
				define <vscale x 4 x i32> @test_svcreate4_s32_vec0(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2, <vscale x 4 x i32> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2, <vscale x 4 x i32> %x3)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32(<vscale x 16 x i32> %tuple, i32 0)
				ret <vscale x 4 x i32> %extract
				}

				; Builds a four-vector i32 tuple from %x0, %x1, %x2 and %x3, then extracts
				; element 3. The expected output is a move of z3 into z0 followed by ret.
				define <vscale x 4 x i32> @test_svcreate4_s32_vec3(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2, <vscale x 4 x i32> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s32_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %x0, <vscale x 4 x i32> %x1, <vscale x 4 x i32> %x2, <vscale x 4 x i32> %x3)
				%extract = tail call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32(<vscale x 16 x i32> %tuple, i32 3)
				ret <vscale x 4 x i32> %extract
				}

				;
				; SVCREATE4 (float)
				;

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 4 x float> and reads
				; element 0 straight back. Expected output: a lone `ret` right after the label.
				; NOTE(review): presumably %x0 already sits in the return register z0, so no
				; move is required -- confirm against the SVE calling convention.
				define <vscale x 4 x float> @test_svcreate4_f32_vec0(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f32_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32(<vscale x 16 x float> %tuple, i32 0)
				ret <vscale x 4 x float> %extract
				}

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 4 x float> and reads
				; element 3 back. Expected output: one `mov z0.d, z3.d` followed by `ret`.
				; NOTE(review): presumably %x3 arrives in z3 and the result leaves in z0 --
				; confirm against the SVE calling convention.
				define <vscale x 4 x float> @test_svcreate4_f32_vec3(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f32_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3)
				%extract = tail call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32(<vscale x 16 x float> %tuple, i32 3)
				ret <vscale x 4 x float> %extract
				}

				;
				; SVCREATE4 (i64)
				;

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 2 x i64> and reads
				; element 0 straight back. Expected output: a lone `ret` right after the label.
				; NOTE(review): presumably %x0 already sits in the return register z0, so no
				; move is required -- confirm against the SVE calling convention.
				define <vscale x 2 x i64> @test_svcreate4_s64_vec0(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2, <vscale x 2 x i64> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2, <vscale x 2 x i64> %x3)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64(<vscale x 8 x i64> %tuple, i32 0)
				ret <vscale x 2 x i64> %extract
				}

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 2 x i64> and reads
				; element 3 back. Expected output: one `mov z0.d, z3.d` followed by `ret`.
				; NOTE(review): presumably %x3 arrives in z3 and the result leaves in z0 --
				; confirm against the SVE calling convention.
				define <vscale x 2 x i64> @test_svcreate4_s64_vec3(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2, <vscale x 2 x i64> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_s64_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64> %x0, <vscale x 2 x i64> %x1, <vscale x 2 x i64> %x2, <vscale x 2 x i64> %x3)
				%extract = tail call <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64(<vscale x 8 x i64> %tuple, i32 3)
				ret <vscale x 2 x i64> %extract
				}

				;
				; SVCREATE4 (double)
				;

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 2 x double> and reads
				; element 0 straight back. Expected output: a lone `ret` right after the label.
				; NOTE(review): presumably %x0 already sits in the return register z0, so no
				; move is required -- confirm against the SVE calling convention.
				define <vscale x 2 x double> @test_svcreate4_f64_vec0(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f64_vec0:
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %tuple, i32 0)
				ret <vscale x 2 x double> %extract
				}

				; Packs %x0..%x3 into a four-vector tuple of <vscale x 2 x double> and reads
				; element 3 back. Expected output: one `mov z0.d, z3.d` followed by `ret`.
				; NOTE(review): presumably %x3 arrives in z3 and the result leaves in z0 --
				; confirm against the SVE calling convention.
				define <vscale x 2 x double> @test_svcreate4_f64_vec3(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3) local_unnamed_addr #0 {
				; CHECK-LABEL: test_svcreate4_f64_vec3:
				; CHECK-NEXT: mov z0.d, z3.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3)
				%extract = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %tuple, i32 3)
				ret <vscale x 2 x double> %extract
				}

				; Attribute group referenced as #0 by the test functions: nounwind, with the
				; +sve target feature enabled.
				attributes #0 = { nounwind "target-features"="+sve" }

				; Tuple constructors taking two vectors; the result type has twice the
				; element count of each operand.
				declare <vscale x 4 x double> @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
				declare <vscale x 8 x float> @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 16 x half> @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
				declare <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 16 x i16> @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)

				; Tuple constructors taking three vectors; the result type has three times
				; the element count of each operand.
				declare <vscale x 6 x double> @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
				declare <vscale x 12 x float> @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 24 x half> @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
				declare <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 24 x i16> @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 48 x i8> @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

				; Tuple constructors taking four vectors; the result type has four times
				; the element count of each operand.
				declare <vscale x 8 x double> @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64 (<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
				declare <vscale x 16 x float> @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 32 x half> @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
				declare <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
				declare <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 32 x i16> @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
				declare <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

				; Tuple element accessors: (tuple, immediate index) -> one vector. Each group
				; below covers the 2-, 3- and 4-vector tuple of one element type.
				; i8 elements
				declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8(<vscale x 32 x i8>, i32 immarg)
				declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8(<vscale x 48 x i8>, i32 immarg)
				declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8(<vscale x 64 x i8>, i32 immarg)

				; i16 elements
				declare <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16(<vscale x 16 x i16>, i32 immarg)
				declare <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16(<vscale x 24 x i16>, i32 immarg)
				declare <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16(<vscale x 32 x i16>, i32 immarg)

				; i32 elements
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32(<vscale x 8 x i32>, i32 immarg)
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32(<vscale x 12 x i32>, i32 immarg)
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32(<vscale x 16 x i32>, i32 immarg)

				; i64 elements
				declare <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64(<vscale x 4 x i64>, i32 immarg)
				declare <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64(<vscale x 6 x i64>, i32 immarg)
				declare <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64(<vscale x 8 x i64>, i32 immarg)

				; half elements
				declare <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16(<vscale x 16 x half>, i32 immarg)
				declare <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16(<vscale x 24 x half>, i32 immarg)
				declare <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16(<vscale x 32 x half>, i32 immarg)

				; float elements
				declare <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32(<vscale x 8 x float>, i32 immarg)
				declare <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32(<vscale x 12 x float>, i32 immarg)
				declare <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32(<vscale x 16 x float>, i32 immarg)

				; double elements
				declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double>, i32 immarg)
				declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)
				declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)

llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll

This file was added.

				; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s

				; Each of these tests creates a vector tuple, inserts z5 into one of its
				; elements, and finally extracts that element from the wide vector to return
				; it. The checks ensure that z5 is always the value that is returned.

				;
				; Insert into two element tuples
				;

				; Overwrites element 0 of a two-vector tuple built from %z0/%z1 with %z5,
				; then reads element 0 back; the expected result is a single move from z5.
				; %z2..%z4 are unused; presumably they only pad the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple2.res0, tuple2.res1 }
				; insert z5:  {     z5     , tuple2.res1 }
				; extract z5:       ^^
				define <vscale x 4 x i32> @set_tuple2_nxv8i32_elt0(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple2_nxv8i32_elt0:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
				%ins = call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32(<vscale x 8 x i32> %tuple, i32 0, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32> %ins, i32 0)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 1 of a two-vector tuple built from %z0/%z1 with %z5,
				; then reads element 1 back; the expected result is a single move from z5.
				; %z2..%z4 are unused; presumably they only pad the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple2.res0, tuple2.res1 }
				; insert z5:  { tuple2.res0,     z5      }
				; extract z5:                    ^^
				define <vscale x 4 x i32> @set_tuple2_nxv8i32_elt1(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple2_nxv8i32_elt1:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
				%ins = call <vscale x 8 x i32> @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32(<vscale x 8 x i32> %tuple, i32 1, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32> %ins, i32 1)
				ret <vscale x 4 x i32> %ext
				}


				;
				; Insert into three element tuples
				;

				; Overwrites element 0 of a three-vector tuple built from %z0..%z2 with %z5,
				; then reads element 0 back; the expected result is a single move from z5.
				; %z3 and %z4 are unused; presumably they only pad the argument list so that
				; the inserted value arrives in z5 -- confirm.
				; tuple:      { tuple3.res0, tuple3.res1, tuple3.res2 }
				; insert z5:  {     z5     , tuple3.res1, tuple3.res2 }
				; extract z5:       ^^
				define <vscale x 4 x i32> @set_tuple3_nxv12i32_elt0(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple3_nxv12i32_elt0:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
				%ins = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32(<vscale x 12 x i32> %tuple, i32 0, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %ins, i32 0)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 1 of a three-vector tuple built from %z0..%z2 with %z5,
				; then reads element 1 back; the expected result is a single move from z5.
				; %z3 and %z4 are unused; presumably they only pad the argument list so that
				; the inserted value arrives in z5 -- confirm.
				; tuple:      { tuple3.res0, tuple3.res1, tuple3.res2 }
				; insert z5:  { tuple3.res0,     z5     , tuple3.res2 }
				; extract z5:                    ^^
				define <vscale x 4 x i32> @set_tuple3_nxv12i32_elt1(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple3_nxv12i32_elt1:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
				%ins = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32(<vscale x 12 x i32> %tuple, i32 1, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %ins, i32 1)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 2 of a three-vector tuple built from %z0..%z2 with %z5,
				; then reads element 2 back; the expected result is a single move from z5.
				; %z3 and %z4 are unused; presumably they only pad the argument list so that
				; the inserted value arrives in z5 -- confirm.
				; tuple:      { tuple3.res0, tuple3.res1, tuple3.res2 }
				; insert z5:  { tuple3.res0, tuple3.res1,     z5      }
				; extract z5:                                 ^^
				define <vscale x 4 x i32> @set_tuple3_nxv12i32_elt2(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple3_nxv12i32_elt2:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
				%ins = call <vscale x 12 x i32> @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32(<vscale x 12 x i32> %tuple, i32 2, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %ins, i32 2)
				ret <vscale x 4 x i32> %ext
				}

				;
				; Insert into four element tuples
				;

				; Overwrites element 0 of a four-vector tuple built from %z0..%z3 with %z5,
				; then reads element 0 back; the expected result is a single move from z5.
				; %z4 is unused; presumably it only pads the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 }
				; insert z5:  {     z5     , tuple4.res1, tuple4.res2, tuple4.res3 }
				; extract z5:       ^^
				define <vscale x 4 x i32> @set_tuple4_nxv16i32_elt0(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple4_nxv16i32_elt0:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3)
				%ins = call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(<vscale x 16 x i32> %tuple, i32 0, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv16i32(<vscale x 16 x i32> %ins, i32 0)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 1 of a four-vector tuple built from %z0..%z3 with %z5,
				; then reads element 1 back; the expected result is a single move from z5.
				; %z4 is unused; presumably it only pads the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 }
				; insert z5:  { tuple4.res0,     z5     , tuple4.res2, tuple4.res3 }
				; extract z5:                    ^^
				define <vscale x 4 x i32> @set_tuple4_nxv16i32_elt1(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple4_nxv16i32_elt1:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3)
				%ins = call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(<vscale x 16 x i32> %tuple, i32 1, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv16i32(<vscale x 16 x i32> %ins, i32 1)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 2 of a four-vector tuple built from %z0..%z3 with %z5,
				; then reads element 2 back; the expected result is a single move from z5.
				; %z4 is unused; presumably it only pads the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 }
				; insert z5:  { tuple4.res0, tuple4.res1,     z5     , tuple4.res3 }
				; extract z5:                                 ^^
				define <vscale x 4 x i32> @set_tuple4_nxv16i32_elt2(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple4_nxv16i32_elt2:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3)
				%ins = call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(<vscale x 16 x i32> %tuple, i32 2, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv16i32(<vscale x 16 x i32> %ins, i32 2)
				ret <vscale x 4 x i32> %ext
				}

				; Overwrites element 3 of a four-vector tuple built from %z0..%z3 with %z5,
				; then reads element 3 back; the expected result is a single move from z5.
				; %z4 is unused; presumably it only pads the argument list so that the
				; inserted value arrives in z5 -- confirm.
				; tuple:      { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 }
				; insert z5:  { tuple4.res0, tuple4.res1, tuple4.res2,     z5      }
				; extract z5:                                              ^^
				define <vscale x 4 x i32> @set_tuple4_nxv16i32_elt3(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
				<vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3,
				<vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5) #0 {
				; CHECK-LABEL: set_tuple4_nxv16i32_elt3:
				; CHECK-NEXT: mov z0.d, z5.d
				; CHECK-NEXT: ret
				%tuple = tail call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3)
				%ins = call <vscale x 16 x i32> @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(<vscale x 16 x i32> %tuple, i32 3, <vscale x 4 x i32> %z5)
				%ext = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv16i32(<vscale x 16 x i32> %ins, i32 3)
				ret <vscale x 4 x i32> %ext
				}

				; Attribute group referenced as #0 by every function in this test: nounwind,
				; with the +sve target feature enabled.
				attributes #0 = { nounwind "target-features"="+sve" }

				; NOTE(review): the tuple.get declarations below carry a single type suffix
				; and their index parameters are plain `i32`, whereas the create-tuple test
				; declares the same intrinsic family with two suffixes and `i32 immarg`
				; (e.g. tuple.get.nxv4i32.nxv8i32). Confirm the shorter form is intentional.

				; Two-vector tuple of i32: create, set (tuple, index, new element), get (tuple, index).
				declare <vscale x 8 x i32> @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 8 x i32> @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32(<vscale x 8 x i32>, i32, <vscale x 4 x i32>)
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32>, i32)

				; Three-vector tuple of i32: create, set, get.
				declare <vscale x 12 x i32> @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 12 x i32> @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32(<vscale x 12 x i32>, i32, <vscale x 4 x i32>)
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32>, i32)

				; Four-vector tuple of i32: create, set, get.
				declare <vscale x 16 x i32> @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
				declare <vscale x 16 x i32> @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(<vscale x 16 x i32>, i32, <vscale x 4 x i32>)
				declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv16i32(<vscale x 16 x i32>, i32)

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SVE] Implement vector tuple intrinsics
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 248670

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SVE] Implement vector tuple intrinsics
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 248670

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll

[AArch64][SVE] Implement vector tuple intrinsics
ClosedPublic