diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -789,6 +789,31 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_SVE_Create_2Vector_Tuple + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrReadMem]>; + + class AdvSIMD_SVE_Create_3Vector_Tuple + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>], + [IntrReadMem]>; + + class AdvSIMD_SVE_Create_4Vector_Tuple + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, + LLVMMatchType<1>], + [IntrReadMem]>; + + class AdvSIMD_SVE_Set_Vector_Tuple + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], + [IntrReadMem, ImmArg>]>; + + class AdvSIMD_SVE_Get_Vector_Tuple + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], + [IntrReadMem, IntrArgMemOnly, ImmArg>]>; + class AdvSIMD_1Vec_PredLoad_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -1301,6 +1326,21 @@ [IntrNoMem]>; // +// Vector tuple creation intrinsics (ACLE) +// + +def int_aarch64_sve_tuple_create2 : AdvSIMD_SVE_Create_2Vector_Tuple; +def int_aarch64_sve_tuple_create3 : AdvSIMD_SVE_Create_3Vector_Tuple; +def int_aarch64_sve_tuple_create4 : AdvSIMD_SVE_Create_4Vector_Tuple; + +// +// Vector tuple insertion/extraction intrinsics (ACLE) +// + +def int_aarch64_sve_tuple_get : AdvSIMD_SVE_Get_Vector_Tuple; +def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple; + +// // Loads // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13657,6 +13657,73 @@ /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM); + case Intrinsic::aarch64_sve_tuple_get: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Src1 = N->getOperand(2); + SDValue Idx = N->getOperand(3); + + uint64_t IdxConst = cast(Idx)->getZExtValue(); + if (IdxConst > Src1->getNumOperands() - 1) + report_fatal_error("index larger than expected"); + + EVT ResVT = N->getValueType(0); + uint64_t NumLanes = ResVT.getVectorElementCount().Min; + SDValue Val = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, + DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32)); + return DAG.getMergeValues({Val, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_set: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Tuple = N->getOperand(2); + SDValue Idx = N->getOperand(3); + SDValue Vec = N->getOperand(4); + + EVT TupleVT = Tuple.getValueType(); + uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + + uint64_t IdxConst = cast(Idx)->getZExtValue(); + uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + + if ((TupleLanes % NumLanes) != 0) + report_fatal_error("invalid tuple vector!"); + + uint64_t NumVecs = TupleLanes / NumLanes; + + SmallVector Opnds; + for (unsigned I = 0; I < NumVecs; ++I) { + if (I == IdxConst) + Opnds.push_back(Vec); + else { + Opnds.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple, + DAG.getConstant(I * NumLanes, DL, MVT::i32))); + } + } + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_create2: + case Intrinsic::aarch64_sve_tuple_create3: + case Intrinsic::aarch64_sve_tuple_create4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + + SmallVector Opnds; + for (unsigned I = 2; I < N->getNumOperands(); ++I) + Opnds.push_back(N->getOperand(I)); + + EVT VT = Opnds[0].getValueType(); + EVT EltVT = VT.getVectorElementType(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + VT.getVectorElementCount() * + (N->getNumOperands() - 2)); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } default: break; } diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuple-types.ll @@ -0,0 +1,499 @@ +; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s + +; +; svint8x2_t +; + +define @ret_svint8x2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svint8x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8( %z1, %z2) + ret %tuple +} + +define void @call_svint8x2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svint8x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svint8x2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8( %z1, %z3) + call void @callee_svint8x2_t( %tuple) + ret void +} + +; +; svint16x2_t +; + +define @ret_svint16x2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svint16x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16( %z1, %z2) + ret %tuple +} + +define void @call_svint16x2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svint16x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svint16x2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16( %z1, %z3) + call void @callee_svint16x2_t( %tuple) + ret void +} + +; +; svint32x2_t +; + +define @ret_svint32x2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svint32x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z1, %z2) + ret %tuple +} + +define void @call_svint32x2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svint32x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svint32x2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z1, %z3) + call void @callee_svint32x2_t( %tuple) + ret void +} + +; +; svint64x2_t +; + +define @ret_svint64x2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svint64x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %z1, %z2) + ret %tuple +} + +define void @call_svint64x2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svint64x2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svint64x2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %z1, %z3) + call void @callee_svint64x2_t( %tuple) + ret void +} + +; +; svfloatx2_t +; + +define @ret_svfloatx2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svfloatx2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32( %z1, %z2) + ret %tuple +} + +define void @call_svfloatx2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svfloatx2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svfloatx2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32( %z1, %z3) + call void @callee_svfloatx2_t( %tuple) + ret void +} + +; +; svdoublex2_t +; + +define @ret_svdoublex2_t( %unused_z0, %z1, %z2) #0 { +; CHECK-LABEL: ret_svdoublex2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64( %z1, %z2) + ret %tuple +} + +define void @call_svdoublex2_t( %dummy_z0, %z1, %dummy_z2, %z3) #0 { +; CHECK-LABEL: call_svdoublex2_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: bl callee_svdoublex2_t + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64( %z1, %z3) + call void @callee_svdoublex2_t( %tuple) + ret void +} + +; +; svint8x3_t +; + +define @ret_svint8x3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svint8x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svint8x3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svint8x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint8x3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8( %z1, %z2, %z4) + call void @callee_svint8x3_t( %tuple) + ret void +} + +; +; svint16x3_t +; + +define @ret_svint16x3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svint16x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svint16x3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svint16x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint16x3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16( %z1, %z2, %z4) + call void @callee_svint16x3_t( %tuple) + ret void +} + +; +; svint32x3_t +; + +define @ret_svint32x3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svint32x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svint32x3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svint32x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint32x3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z1, %z2, %z4) + call void @callee_svint32x3_t( %tuple) + ret void +} + +; +; svint64x3_t +; + +define @ret_svint64x3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svint64x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svint64x3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svint64x3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint64x3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %z1, %z2, %z4) + call void @callee_svint64x3_t( %tuple) + ret void +} + +; +; svfloatx3_t +; + +define @ret_svfloatx3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svfloatx3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svfloatx3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svfloatx3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svfloatx3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32( %z1, %z2, %z4) + call void @callee_svfloatx3_t( %tuple) + ret void +} + +; +; svdoublex3_t +; + +define @ret_svdoublex3_t( %unused_z0, %z1, %z2, %z3) #0 { +; CHECK-LABEL: ret_svdoublex3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64( %z1, %z2, %z3) + ret %tuple +} + +define void @call_svdoublex3_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4) #0 { +; CHECK-LABEL: call_svdoublex3_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svdoublex3_t + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64( %z1, %z2, %z4) + call void @callee_svdoublex3_t( %tuple) + ret void +} + +; +; svint8x4_t +; + +define @ret_svint8x4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svint8x4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svint8x4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svint8x4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint8x4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8( %z1, %z2, %z4, %z5) + call void @callee_svint8x4_t( %tuple) + ret void +} + +; +; svint16x4_t +; + +define @ret_svint16x4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svint16x4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svint16x4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svint16x4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint16x4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16( %z1, %z2, %z4, %z5) + call void @callee_svint16x4_t( %tuple) + ret void +} + +; +; svint32x4_t +; + +define @ret_svint32x4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svint32x4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svint32x4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svint32x4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint32x4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z1, %z2, %z4, %z5) + call void @callee_svint32x4_t( %tuple) + ret void +} + +; +; svint64x4_t +; + +define @ret_svint64x4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svint64x4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svint64x4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svint64x4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svint64x4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %z1, %z2, %z4, %z5) + call void @callee_svint64x4_t( %tuple) + ret void +} + +; +; svfloatx4_t +; + +define @ret_svfloatx4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svfloatx4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svfloatx4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svfloatx4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svfloatx4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32( %z1, %z2, %z4, %z5) + call void @callee_svfloatx4_t( %tuple) + ret void +} + +; +; svdoublex4_t +; + +define @ret_svdoublex4_t( %unused_z0, %z1, %z2, %z3, %z4) #0 { +; CHECK-LABEL: ret_svdoublex4_t +; CHECK: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z3.d +; CHECK-NEXT: mov z3.d, z4.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64( %z1, %z2, %z3, %z4) + ret %tuple +} + +define void @call_svdoublex4_t( %dummy_z0, %z1, %z2, %dummy_z3, %z4, %z5) #0 { +; CHECK-LABEL: call_svdoublex4_t +; CHECK: mov z3.d, z5.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z2.d, z4.d +; CHECK-NEXT: bl callee_svdoublex4_t + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64( %z1, %z2, %z4, %z5) + call void @callee_svdoublex4_t( %tuple) + ret void +} + +attributes #0 = { nounwind "target-features"="+sve" } + +declare void @callee_svint8x2_t() +declare void @callee_svint16x2_t() +declare void @callee_svint32x2_t() +declare void @callee_svint64x2_t() +declare void @callee_svfloatx2_t() +declare void @callee_svdoublex2_t() + +declare void @callee_svint8x3_t() +declare void @callee_svint16x3_t() +declare void @callee_svint32x3_t() +declare void @callee_svint64x3_t() +declare void @callee_svfloatx3_t() +declare void @callee_svdoublex3_t() + +declare void @callee_svint8x4_t() +declare void @callee_svint16x4_t() +declare void @callee_svint32x4_t() +declare void @callee_svint64x4_t() +declare void @callee_svfloatx4_t() +declare void @callee_svdoublex4_t() + + +; x2 +declare @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(, ) + +; x3 +declare @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(, , ) + +; x4 +declare @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64(, , , ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-create-tuple.ll @@ -0,0 +1,706 @@ +; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=1 < %s | FileCheck %s + +; +; SVCREATE2 (i8) +; + +define @test_svcreate2_s8_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s8_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_s8_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s8_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (i16) +; + +define @test_svcreate2_s16_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_s16_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s16_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (half) +; + +define @test_svcreate2_f16_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_f16_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f16_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (i32) +; + +define @test_svcreate2_s32_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_s32_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s32_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (float) +; + +define @test_svcreate2_f32_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_f32_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f32_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (i64) +; + +define @test_svcreate2_s64_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_s64_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_s64_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE2 (double) +; + +define @test_svcreate2_f64_vec0(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate2_f64_vec1(i1 %p, %z0, %z1) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate2_f64_vec1: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64( %z0, %z1) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64( %tuple, i32 1) + ret %extract +} + +; +; SVCREATE3 (i8) +; + +define @test_svcreate3_s8_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s8_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_s8_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s8_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8( %tuple, i32 2) + ret %extract +} + +; +; SVCREATE3 (i16) +; + +define @test_svcreate3_s16_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_s16_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s16_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16( %tuple, i32 2) + ret %extract +} +; +; SVCREATE3 (half) +; + +define @test_svcreate3_f16_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_f16_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f16_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16( %tuple, i32 2) + ret %extract +} + + +; +; SVCREATE3 (i32) +; + +define @test_svcreate3_s32_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_s32_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s32_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32( %tuple, i32 2) + ret %extract +} + +; +; SVCREATE3 (float) +; + +define @test_svcreate3_f32_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_f32_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f32_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32( %tuple, i32 2) + ret %extract +} + +; +; SVCREATE3 (i64) +; + +define @test_svcreate3_s64_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_s64_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_s64_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64( %tuple, i32 2) + ret %extract +} + +; +; SVCREATE3 (double) +; + +define @test_svcreate3_f64_vec0(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate3_f64_vec2(i1 %p, %z0, %z1, %z2) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate3_f64_vec2: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64( %z0, %z1, %z2) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64( %tuple, i32 2) + ret %extract +} + +; +; SVCREATE4 (i8) +; + +define @test_svcreate4_s8_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s8_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_s8_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s8_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (i16) +; + +define @test_svcreate4_s16_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_s16_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s16_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (half) +; + +define @test_svcreate4_f16_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f16_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_f16_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f16_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (i32) +; + +define @test_svcreate4_s32_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_s32_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s32_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (float) +; + +define @test_svcreate4_f32_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f32_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_f32_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f32_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (i64) +; + +define @test_svcreate4_s64_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_s64_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_s64_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64( %tuple, i32 3) + ret %extract +} + +; +; SVCREATE4 (double) +; + +define @test_svcreate4_f64_vec0(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f64_vec0: +; CHECK: // %L2 +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64( %tuple, i32 0) + ret %extract +} + +define @test_svcreate4_f64_vec3(i1 %p, %z0, %z1, %z2, %z3) local_unnamed_addr #0 { +; CHECK-LABEL: test_svcreate4_f64_vec3: +; CHECK: // %L2 +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64( %z0, %z1, %z2, %z3) + br i1 %p, label %L1, label %L2 +L1: + ret undef +L2: + %extract = tail call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64( %tuple, i32 3) + ret %extract +} + +attributes #0 = { nounwind "target-features"="+sve" } + +declare @llvm.aarch64.sve.tuple.create2.nxv4f64.nxv2f64(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv8f32.nxv4f32(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv16f16.nxv8f16(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv16i16.nxv8i16(, ) +declare @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(, ) + +declare @llvm.aarch64.sve.tuple.create3.nxv6f64.nxv2f64(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv12f32.nxv4f32(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv24f16.nxv8f16(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv24i16.nxv8i16(, , ) +declare @llvm.aarch64.sve.tuple.create3.nxv48i8.nxv16i8(, , ) + +declare @llvm.aarch64.sve.tuple.create4.nxv8f64.nxv2f64 (, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv16f32.nxv4f32(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv32f16.nxv8f16(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv32i16.nxv8i16(, , , ) +declare @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(, , , ) + +declare @llvm.aarch64.sve.tuple.get.nxv16i8.nxv32i8(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv16i8.nxv48i8(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv16i8.nxv64i8(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv8i16.nxv16i16(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv8i16.nxv24i16(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv8i16.nxv32i16(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv4i32.nxv8i32(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv4i32.nxv12i32(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv4i32.nxv16i32(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv2i64.nxv4i64(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv2i64.nxv6i64(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv2i64.nxv8i64(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv8f16.nxv16f16(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv8f16.nxv24f16(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv8f16.nxv32f16(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv4f32.nxv8f32(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv4f32.nxv12f32(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv4f32.nxv16f32(, i32 immarg) + +declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(, i32 immarg) +declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(, i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-insert-extract-tuple.ll @@ -0,0 +1,243 @@ +; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s + +; All these tests create a vector tuple, insert z5 into one of the elements, +; and finally extracts that element from the wide vector to return it. These +; checks ensure that z5 is always the value that is returned. + +; +; Insert into two element tuples +; + +; tuple: { tuple2.res0, tuple2.res1 } +; insert z5: { z5 , tuple2.res1 } +; extract z5: ^^ +define @set_tuple2_nxv8i32_elt0( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple2_nxv8i32_elt0: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z0, %z1) + %ins = call @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32( %tuple, i32 0, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv8i32( %ins, i32 0) + ret %ext +} + +; tuple: { tuple2.res0, tuple2.res1 } +; insert z5: { tuple2.res0, z5 } +; extract z5: ^^ +define @set_tuple2_nxv8i32_elt1( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple2_nxv8i32_elt1: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z0, %z1) + %ins = call @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32( %tuple, i32 1, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv8i32( %ins, i32 1) + ret %ext +} + +; This test checks the elements _not_ being set aren't changed. + +; tuple: { tuple2.res0, tuple2.res1 } +; insert z5: { tuple2.res0, z5 } +; extract z0: ^^ +define @set_tuple2_nxv8i32_elt1_ret_elt0( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple2_nxv8i32_elt1_ret_elt0: + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32( %z0, %z1) + %ins = call @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32( %tuple, i32 1, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv8i32( %ins, i32 0) + ret %ext +} + +; Test extract of tuple passed into function +define @get_tuple2_nxv8i32_elt1( %tuple) #0 { + ; CHECK-LABEL: get_tuple2_nxv8i32_elt1: + ; CHECK-NEXT: mov z0.d, z1.d + ; CHECK-NEXT: ret + %ext = call @llvm.aarch64.sve.tuple.get.nxv8i32( %tuple, i32 1) + ret %ext +} + +; +; Insert into three element tuples +; + +; tuple: { tuple3.res0, tuple3.res1, tuple3.res2 } +; insert z5: { z5 , tuple3.res0, tuple3.res2 } +; extract z5: ^^ +define @set_tuple3_nxv12i32_elt0( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple3_nxv12i32_elt0: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + %ins = call @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32( %tuple, i32 0, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv12i32( %ins, i32 0) + ret %ext +} + +; tuple: { tuple3.res0, tuple3.res1, tuple3.res2 } +; insert z5: { tuple3.res0, z5 , tuple3.res2 } +; extract z5: ^^ +define @set_tuple3_nxv12i32_elt1( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple3_nxv12i32_elt1: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + %ins = call @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32( %tuple, i32 1, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv12i32( %ins, i32 1) + ret %ext +} + +; tuple: { tuple3.res0, tuple3.res1, tuple3.res2 } +; insert z5: { tuple3.res0, tuple3.res1, z5 } +; extract z5: ^^ +define @set_tuple3_nxv12i32_elt2( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple3_nxv12i32_elt2: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + %ins = call @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32( %tuple, i32 2, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv12i32( %ins, i32 2) + ret %ext +} + +; This test checks the elements _not_ being set aren't changed. + +; tuple: { tuple3.res0, tuple3.res1, tuple3.res2 } +; insert z5: { tuple3.res0, z5 , tuple3.res2 } +; extract z2: ^^ +define @set_tuple3_nxv12i32_elt1_ret_elt2( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple3_nxv12i32_elt1_ret_elt2: + ; CHECK-NEXT: mov z0.d, z2.d + ; CHECK-NEXT: ret + %tuple = call @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32( %z0, %z1, %z2) + %ins = call @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32( %tuple, i32 1, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv12i32( %ins, i32 2) + ret %ext +} + +; Test extract of tuple passed into function +define @get_tuple3_nxv12i32_elt2( %z0, %tuple) #0 { + ; CHECK-LABEL: get_tuple3_nxv12i32_elt2: + ; CHECK-NEXT: mov z0.d, z3.d + ; CHECK-NEXT: ret + %ext = call @llvm.aarch64.sve.tuple.get.nxv12i32( %tuple, i32 2) + ret %ext +} + +; +; Insert into four element tuples +; + +; tuple: { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 } +; insert z5: { z5 , tuple4.res1, tuple4.res2, tuple4.res3 } +; extract z5: ^^ +define @set_tuple4_nxv16i32_elt0( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple4_nxv16i32_elt0: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + %ins = call @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32( %tuple, i32 0, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %ins, i32 0) + ret %ext +} + +; tuple: { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 } +; insert z5: { tuple4.res0, z5 , tuple4.res2, tuple4.res3 } +; extract z5: ^^ +define @set_tuple4_nxv16i32_elt1( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple4_nxv16i32_elt1: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + %ins = call @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32( %tuple, i32 1, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %ins, i32 1) + ret %ext +} + +; tuple: { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 } +; insert z5: { tuple4.res0, tuple4.res1, z5 , tuple4.res3 } +; extract z5: ^^ +define @set_tuple4_nxv16i32_elt2( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple4_nxv16i32_elt2: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + %ins = call @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32( %tuple, i32 2, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %ins, i32 2) + ret %ext +} + +; tuple: { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 } +; insert z5: { tuple4.res0, tuple4.res1, tuple4.res2, z5 } +; extract z5: ^^ +define @set_tuple4_nxv16i32_elt3( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple4_nxv16i32_elt3: + ; CHECK-NEXT: mov z0.d, z5.d + ; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + %ins = call @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32( %tuple, i32 3, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %ins, i32 3) + ret %ext +} + +; This test checks the elements _not_ being set aren't changed. + +; tuple: { tuple4.res0, tuple4.res1, tuple4.res2, tuple4.res3 } +; insert z5: { tuple4.res0, tuple4.res1, tuple4.res2, z5 } +; extract z2: ^^ +define @set_tuple4_nxv16i32_elt3_ret_elt2( %z0, %z1, + %z2, %z3, + %z4, %z5) #0 { + ; CHECK-LABEL: set_tuple4_nxv16i32_elt3_ret_elt2: + ; CHECK-NEXT: mov z0.d, z2.d + ; CHECK-NEXT: ret + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32( %z0, %z1, %z2, %z3) + %ins = call @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32( %tuple, i32 3, %z5) + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %ins, i32 2) + ret %ext +} + +; Test extract of tuple passed into function +define @get_tuple4_nxv16i32_elt3( %tuple) #0 { + ; CHECK-LABEL: get_tuple4_nxv16i32_elt3: + ; CHECK-NEXT: mov z0.d, z3.d + ; CHECK-NEXT: ret + %ext = call @llvm.aarch64.sve.tuple.get.nxv16i32( %tuple, i32 3) + ret %ext +} + +attributes #0 = { nounwind "target-features"="+sve" } + +declare @llvm.aarch64.sve.tuple.create2.nxv8i32.nxv4i32(, ) +declare @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32(, i32, ) +declare @llvm.aarch64.sve.tuple.get.nxv8i32(, i32) + +declare @llvm.aarch64.sve.tuple.create3.nxv12i32.nxv4i32(, , ) +declare @llvm.aarch64.sve.tuple.set.nxv12i32.nxv4i32(, i32, ) +declare @llvm.aarch64.sve.tuple.get.nxv12i32(, i32) + +declare @llvm.aarch64.sve.tuple.create4.nxv16i32.nxv4i32(, , , ) +declare @llvm.aarch64.sve.tuple.set.nxv16i32.nxv4i32(, i32, ) +declare @llvm.aarch64.sve.tuple.get.nxv16i32(, i32)