Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -16260,6 +16260,50 @@
 The argument to this intrinsic must be a vector.
 
+'``llvm.experimental.vector.splice``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
+concatenating elements from the first input vector with elements of the second
+input vector, returning a vector of the same type as the input vectors. A
+signed immediate specifies the elements to select from the first vector, where
+the sign of the immediate distinguishes two variants. A positive immediate
+specifies an index into the first vector and a negative immediate specifies the
+number of trailing elements to extract from the first vector.
+
+These intrinsics work for both fixed and scalable vectors. While this intrinsic
+is marked as experimental, the recommended way to express this operation for
+fixed-width vectors is still to use a shufflevector, as that may allow for more
+optimization opportunities.
+
+For example:
+
+.. code-block:: text
+
+ llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1)  ==> <B, C, D, E> ; index
+ llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3) ==> <B, C, D, E> ; trailing elements
+
+
+Arguments:
+""""""""""
+
+The first two operands are vectors with the same type. The third argument
+``imm`` is the start index, modulo VL, where VL is the runtime number of
+elements of the vector. The ``imm`` is a signed integer constant in the range
+``-VL <= imm < VL``; values outside of this range are clamped.
+
 Matrix Intrinsics
 -----------------
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -554,6 +554,19 @@
   /// in terms of the element size of VEC1/VEC2, not in terms of bytes.
   VECTOR_SHUFFLE,
 
+  /// VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as
+  /// VEC1/VEC2 from CONCAT_VECTORS(VEC1, VEC2), based on the IMM in two ways.
+  /// Let the result type be T. If IMM is positive it represents the starting
+  /// element number (an index) from which a subvector of type T is extracted
+  /// from CONCAT_VECTORS(VEC1, VEC2). If IMM is negative it represents a count
+  /// specifying the number of trailing elements to extract from VEC1, where
+  /// the elements of T are selected using the following algorithm:
+  ///   RESULT[i] = CONCAT_VECTORS(VEC1,VEC2)[VEC1.ElementCount - ABS(IMM) + i]
+  /// If T is a fixed-width vector and IMM is out-of-bounds the result vector
+  /// is undefined. If T is a scalable vector, IMM is clamped by the runtime
+  /// scaling factor 'vscale'. IMM is a constant integer.
+  VECTOR_SPLICE,
+
   /// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
   /// scalar value into element 0 of the resultant vector type. The top
   /// elements 1 to N-1 of the N-element vector are undefined. The type
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4504,6 +4504,10 @@
   /// Returns true if the expansion was successful.
   bool expandREM(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Method for building the DAG expansion of ISD::VECTOR_SPLICE. This
+  /// method accepts vectors as its arguments.
+  SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1653,6 +1653,13 @@
                                              [llvm_anyvector_ty, llvm_i64_ty],
                                              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
+//===---------- Named shufflevector intrinsics ------===//
+def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                           [LLVMMatchType<0>,
+                                                            LLVMMatchType<0>,
+                                                            llvm_i32_ty],
+                                                           [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -241,6 +241,9 @@
 def SDTVecShuffle : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
+def SDTVecSlice : SDTypeProfile<1, 3, [     // vector splice
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisInt<3>
+]>;
 def SDTVecExtract : SDTypeProfile<1, 2, [   // vector extract
   SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
 ]>;
@@ -655,6 +658,7 @@
 def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
 def vector_reverse : SDNode<"ISD::VECTOR_REVERSE",
                             SDTVecReverse>;
+def vector_splice : SDNode<"ISD::VECTOR_SPLICE", SDTVecSlice, []>;
 def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
 def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3208,6 +3208,10 @@
     Results.push_back(Tmp1);
     break;
   }
+  case ISD::VECTOR_SPLICE: {
+    Results.push_back(TLI.expandVectorSplice(Node, DAG));
+    break;
+  }
   case ISD::EXTRACT_ELEMENT: {
     EVT OpTy = Node->getOperand(0).getValueType();
     if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
@@ -4714,6 +4718,14 @@
     Results.push_back(Tmp1);
     break;
   }
+  case ISD::VECTOR_SPLICE: {
+    Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(ISD::VECTOR_SPLICE, dl, NVT, Tmp1, Tmp2,
+                       Node->getOperand(2));
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp3));
+    break;
+  }
   case ISD::SETCC:
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS: {
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -100,6 +100,8 @@
     Res = PromoteIntRes_VECTOR_REVERSE(N); break;
   case ISD::VECTOR_SHUFFLE:
     Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+  case ISD::VECTOR_SPLICE:
+    Res = PromoteIntRes_VECTOR_SPLICE(N); break;
   case ISD::INSERT_VECTOR_ELT:
     Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
   case ISD::BUILD_VECTOR:
@@ -4612,6 +4614,15 @@
   return Swap.getValue(1);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
+  SDLoc dl(N);
+
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = GetPromotedInteger(N->getOperand(1));
+  EVT OutVT = V0.getValueType();
+
+  return DAG.getNode(ISD::VECTOR_SPLICE, dl, OutVT, V0, V1, N->getOperand(2));
+}
 SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -300,6 +300,7 @@
   SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
   SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
@@ -838,6 +839,7 @@
   void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
                                   SDValue &Hi);
+  void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
 
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -936,6 +936,9 @@
   case ISD::VECTOR_SHUFFLE:
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
     break;
+  case ISD::VECTOR_SPLICE:
+    SplitVecRes_VECTOR_SPLICE(N, Lo, Hi);
+    break;
   case ISD::VAARG:
     SplitVecRes_VAARG(N, Lo, Hi);
     break;
@@ -5505,3 +5508,25 @@
   Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
   Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
 }
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  auto &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getUnknownStack(MF);
+
+  auto Expanded = cast<LoadSDNode>(TLI.expandVectorSplice(N, DAG));
+  SDValue Chain = Expanded->getChain();
+  SDValue Ptr = Expanded->getOperand(1);
+
+  // Load the lo part of the spliced result
+  Lo = DAG.getLoad(LoVT, DL, Chain, Ptr, PtrInfo);
+  IncrementPointer(cast<LoadSDNode>(Lo), LoVT, PtrInfo, Ptr);
+  // Load the hi part of the spliced result
+  Hi = DAG.getLoad(HiVT, DL, Chain, Ptr, PtrInfo);
+}
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -774,6 +774,7 @@
   void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
   void visitVectorReverse(const CallInst &I);
+  void visitVectorSplice(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7028,6 +7028,9 @@
   case Intrinsic::experimental_vector_reverse:
     visitVectorReverse(I);
     return;
+  case Intrinsic::experimental_vector_splice:
+    visitVectorSplice(I);
+    return;
   }
 }
 
@@ -10879,3 +10882,37 @@
   setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(ValueVTs), Values));
 }
+
+void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+  SDLoc DL = getCurSDLoc();
+  SDValue V1 = getValue(I.getOperand(0));
+  SDValue V2 = getValue(I.getOperand(1));
+  int64_t Imm = cast<ConstantInt>(I.getOperand(2))->getSExtValue();
+
+  // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
+  if (VT.isScalableVector()) {
+    MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+    setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
+                             DAG.getConstant(Imm, DL, IdxVT)));
+    return;
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if ((-Imm > NumElts) || (Imm >= NumElts)) {
+    // Result is undefined if immediate is out-of-bounds.
+    setValue(&I, DAG.getUNDEF(VT));
+    return;
+  }
+
+  uint64_t Idx = (NumElts + (Imm % NumElts)) % NumElts;
+
+  // Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Mask.push_back(Idx + i);
+  setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask));
+}
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -288,6 +288,7 @@
   case ISD::EXTRACT_SUBVECTOR:          return "extract_subvector";
   case ISD::SCALAR_TO_VECTOR:           return "scalar_to_vector";
   case ISD::VECTOR_SHUFFLE:             return "vector_shuffle";
+  case ISD::VECTOR_SPLICE:              return "vector_splice";
   case ISD::SPLAT_VECTOR:               return "splat_vector";
   case ISD::VECTOR_REVERSE:             return "vector_reverse";
   case ISD::CARRY_FALSE:                return "carry_false";
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8598,3 +8598,76 @@
   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
   return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
 }
+
+SDValue TargetLowering::expandVectorSplice(SDNode *Node,
+                                           SelectionDAG &DAG) const {
+  assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!");
+  assert(Node->getValueType(0).isScalableVector() &&
+         "Fixed length vector types expected to use SHUFFLE_VECTOR!");
+
+  EVT VT = Node->getValueType(0);
+  SDValue V1 = Node->getOperand(0);
+  SDValue V2 = Node->getOperand(1);
+  int64_t Imm = cast<ConstantSDNode>(Node->getOperand(2))->getSExtValue();
+  SDLoc DL(Node);
+
+  // Expand through memory thusly:
+  //  Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
+  //  Store V1, Ptr
+  //  Store V2, Ptr + sizeof(V1)
+  //  If (Imm < 0)
+  //    TrailingElts = -Imm
+  //    Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
+  //  else
+  //    Ptr = Ptr + (Imm * sizeof(VT.Elt))
+  //  Res = Load Ptr
+
+  Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+
+  EVT MemVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                       VT.getVectorElementCount() * 2);
+  SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
+  EVT PtrVT = StackPtr.getValueType();
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  // Store the lo part of CONCAT_VECTORS(V1, V2)
+  SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo);
+  // Store the hi part of CONCAT_VECTORS(V1, V2)
+  SDValue OffsetToV2 = DAG.getVScale(
+      DL, PtrVT,
+      APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
+  SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2);
+  SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo);
+
+  if (Imm < 0) {
+    uint64_t TrailingElts = -Imm;
+
+    // NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
+    TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
+    SDValue TrailingBytes =
+        DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
+
+    if (TrailingElts > VT.getVectorMinNumElements()) {
+      SDValue VLBytes =
+          DAG.getVScale(DL, PtrVT,
+                        APInt(PtrVT.getFixedSizeInBits(),
+                              VT.getStoreSize().getKnownMinSize()));
+      TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
+    }
+
+    // Calculate the start address of the spliced result.
+    StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
+
+    // Load the spliced result
+    return DAG.getLoad(VT, DL, StoreV2, StackPtr2,
+                       MachinePointerInfo::getUnknownStack(MF));
+  }
+
+  // Load back the required element.
+  StackPtr = getVectorElementPointer(DAG, StackPtr, VT, Node->getOperand(2));
+  // Load the spliced result
+  return DAG.getLoad(VT, DL, StoreV2, StackPtr,
+                     MachinePointerInfo::getUnknownStack(MF));
+}
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -849,6 +849,9 @@
     setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand);
     setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Expand);
     setOperationAction(ISD::VECREDUCE_SEQ_FMUL, VT, Expand);
+
+    // Named vector shuffles default to expand.
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
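To make the splice semantics concrete, the following standalone C++ sketch mirrors the index computation that visitVectorSplice() uses for fixed-width vectors (a positive immediate is a start index into concat(a, b); a negative immediate selects the trailing abs(imm) elements of the first operand). The helper name spliceRef and the use of std::vector are illustrative assumptions only and are not part of this patch.

#include <cassert>
#include <cstdint>
#include <vector>

// Reference semantics for a fixed-width splice: the result is NumElts
// consecutive elements of concat(a, b) starting at element 'idx', where
// 'idx' folds a negative trailing-element count into a positive start
// index modulo the element count, exactly as in visitVectorSplice().
std::vector<int64_t> spliceRef(const std::vector<int64_t> &a,
                               const std::vector<int64_t> &b, int imm) {
  assert(a.size() == b.size() && "operands must have the same type");
  int numElts = static_cast<int>(a.size());
  // Out-of-bounds immediates give an undefined result for fixed-width
  // vectors; this sketch simply rejects them.
  assert(-imm <= numElts && imm < numElts && "immediate out of range");

  unsigned idx = (numElts + (imm % numElts)) % numElts;

  std::vector<int64_t> result(numElts);
  for (int i = 0; i < numElts; ++i) {
    unsigned srcIdx = idx + i; // index into concat(a, b)
    result[i] = srcIdx < static_cast<unsigned>(numElts)
                    ? a[srcIdx]
                    : b[srcIdx - numElts];
  }
  return result;
}

// Matches the LangRef example: splice(<A,B,C,D>, <E,F,G,H>, 1) and
// splice(<A,B,C,D>, <E,F,G,H>, -3) both yield <B,C,D,E>.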
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1106,6 +1106,7 @@ setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::SMIN, VT, Custom); @@ -1269,6 +1270,11 @@ for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); } + + setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); + setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); + setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); + setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); Index: llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll @@ -0,0 +1,237 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; VECTOR_SPLICE (index) +; + +define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: splice_v16i8_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +define <8 x i16> @splice_v8i16_idx(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-LABEL: splice_v8i16_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> %a, <8 x i16> %b, i32 2) + ret <8 x i16> %res +} + +define <4 x i32> @splice_v4i32_idx(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-LABEL: splice_v4i32_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2) + ret <4 x i32> %res +} + +define <2 x i64> @splice_v2i64_idx(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK-LABEL: splice_v2i64_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> %a, <2 x i64> %b, i32 1) + ret <2 x i64> %res +} + +define <8 x half> @splice_v8f16_idx(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: splice_v8f16_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 1) + ret <8 x half> %res +} + +define <4 x float> @splice_v4f32_idx(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: splice_v4f32_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> %a, <4 x float> %b, i32 1) + ret <4 x float> %res +} + +define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: splice_v2f64_idx: +; CHECK: // %bb.0: +; 
CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1) + ret <2 x double> %res +} + +; Verify promote type legalisation works as expected. +define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 { +; CHECK-LABEL: splice_v2i8_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-NEXT: ret + %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1) + ret <2 x i8> %res +} + +; Verify splitvec type legalisation works as expected. +define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 { +; CHECK-LABEL: splice_v8i32_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4 +; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5) + ret <8 x i32> %res +} + +; Verify splitvec type legalisation works as expected. +define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: splice_v16f32_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12 +; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12 +; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12 +; CHECK-NEXT: ret + %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7) + ret <16 x float> %res +} + +; Verify out-of-bounds index results in undef vector. +define <2 x i64> @splice_v2i64_idx_out_of_bounds(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK-LABEL: splice_v2i64_idx_out_of_bounds: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> %a, <2 x i64> %b, i32 2) + ret <2 x i64> %res +} + +; +; VECTOR_SPLICE (trailing elements) +; + +define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: splice_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15) + ret <16 x i8> %res +} + +define <8 x i16> @splice_v8i16(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-LABEL: splice_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> %a, <8 x i16> %b, i32 -6) + ret <8 x i16> %res +} + +define <4 x i32> @splice_v4i32(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-LABEL: splice_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 -2) + ret <4 x i32> %res +} + +define <2 x i64> @splice_v2i64(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK-LABEL: splice_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> %a, <2 x i64> %b, i32 -1) + ret <2 x i64> %res +} + +define <8 x half> @splice_v8f16(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: splice_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 -7) + ret <8 x half> %res +} + +define <4 x float> @splice_v4f32(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: splice_v4f32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> %a, <4 x float> %b, i32 -3) + ret <4 x float> %res +} + +define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: splice_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1) + ret <2 x double> %res +} + +; Verify promote type legalisation works as expected. +define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 { +; CHECK-LABEL: splice_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-NEXT: ret + %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1) + ret <2 x i8> %res +} + +; Verify splitvec type legalisation works as expected. +define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 { +; CHECK-LABEL: splice_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4 +; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3) + ret <8 x i32> %res +} + +; Verify splitvec type legalisation works as expected. +define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: splice_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12 +; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12 +; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12 +; CHECK-NEXT: ret + %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9) + ret <16 x float> %res +} + +; Verify out-of-bounds trailing element count results in undef vector. 
+define <2 x i64> @splice_v2i64_out_of_bounds(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK-LABEL: splice_v2i64_out_of_bounds: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> %a, <2 x i64> %b, i32 -3) + ret <2 x i64> %res +} + +declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32) +declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32) +declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32) +declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32) +declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32) +declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32) +declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32) +declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32) + +attributes #0 = { nounwind "target-features"="+neon" } Index: llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; VECTOR_SPLICE (index) +; + +define @splice_nxv16i8_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i8_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 0) + ret %res +} + +define @splice_nxv16i8_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i8_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: mov w10, #15 +; CHECK-NEXT: cmp x9, #15 // =15 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 15) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv16i8_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i8_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: cmp x9, #16 // =16 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 16) + ret %res +} + +define @splice_nxv8i16_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i16_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv8i16_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i16_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #7 +; CHECK-NEXT: cmp x10, #7 // =7 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 7) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv8i16_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i16_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #8 +; CHECK-NEXT: cmp x10, #8 // =8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 8) + ret %res +} + +define @splice_nxv4i32_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4i32_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 0) + ret %res +} + +define @splice_nxv4i32_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4i32_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: cmp x10, #3 // =3 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 3) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv4i32_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4i32_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: cmp x10, #4 // =4 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 4) + ret %res +} + +define @splice_nxv2i64_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i64_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 0) + ret %res +} + +define @splice_nxv2i64_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i64_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #1 // =1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 1) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv2i64_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i64_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x10, #2 // =2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 2) + ret %res +} + +define @splice_nxv8f16_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8f16_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv8f16_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8f16_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #7 +; CHECK-NEXT: cmp x10, #7 // =7 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 7) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv8f16_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8f16_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #8 +; CHECK-NEXT: cmp x10, #8 // =8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 8) + ret %res +} + +define @splice_nxv4f32_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f32_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 0) + ret %res +} + +define @splice_nxv4f32_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f32_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: cmp x10, #3 // =3 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 3) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv4f32_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f32_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: cmp x10, #4 // =4 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 4) + ret %res +} + +define @splice_nxv2f64_first_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f64_first_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #0 // =0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 0) + ret %res +} + +define @splice_nxv2f64_last_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f64_last_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #1 // =1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 1) + ret %res +} + +; Ensure index is clamped when we cannot prove it's less than VL-1. +define @splice_nxv2f64_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f64_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x10, #2 // =2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 2) + ret %res +} + +; Ensure predicate based splice is promoted to use ZPRs. +define @splice_nxv2i1_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i1_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cmp x9, #1 // =1 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 1) + ret %res +} + +; Ensure predicate based splice is promoted to use ZPRs. +define @splice_nxv4i1_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4i1_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntw x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x10, #2 // =2 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z0.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 2) + ret %res +} + +; Ensure predicate based splice is promoted to use ZPRs. +define @splice_nxv8i1_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i1_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: cmp x10, #4 // =4 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1h { z0.h }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 4) + ret %res +} + +; Ensure predicate based splice is promoted to use ZPRs. +define @splice_nxv16i1_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i1_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov w10, #8 +; CHECK-NEXT: cmp x9, #8 // =8 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: st1b { z0.b }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 8) + ret %res +} + +; Verify promote type legalisation works as expected. +define @splice_nxv2i8_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i8_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x9, x9, #1 // =1 +; CHECK-NEXT: cmp x9, #1 // =1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #3 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv2i8( %a, %b, i32 1) + ret %res +} + +; Verify splitvec type legalisation works as expected. +define @splice_nxv8i32_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i32_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x10, #2 // =2 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] +; CHECK-NEXT: orr x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv8i32( %a, %b, i32 2) + ret %res +} + +; Verify splitvec type legalisation works as expected. +define @splice_nxv16f32_clamped_idx( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16f32_clamped_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-8 +; CHECK-NEXT: rdvl x10, #1 +; CHECK-NEXT: sub x10, x10, #1 // =1 +; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: cmp x10, #16 // =16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: st1w { z7.s }, p0, [x8, #7, mul vl] +; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl] +; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl] +; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: addvl sp, sp, #8 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.splice.nxv16f32( %a, %b, i32 16) + ret %res +} + +; +; VECTOR_SPLICE (trailing elements) +; + +define @splice_nxv16i8( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv16i8_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+  ret <vscale x 16 x i8> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 16 x i8> @splice_nxv16i8_clamped(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv16i8_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #17
+; CHECK-NEXT:    cmp x9, #17 // =17
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+  ret <vscale x 8 x i16> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 8 x i16> @splice_nxv8i16_clamped(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #18
+; CHECK-NEXT:    cmp x9, #18 // =18
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+  ret <vscale x 4 x i32> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 4 x i32> @splice_nxv4i32_clamped(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #20
+; CHECK-NEXT:    cmp x9, #20 // =20
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+  ret <vscale x 2 x i64> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 2 x i64> @splice_nxv2i64_clamped(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #24
+; CHECK-NEXT:    cmp x9, #24 // =24
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+  ret <vscale x 8 x half> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 8 x half> @splice_nxv8f16_clamped(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #18
+; CHECK-NEXT:    cmp x9, #18 // =18
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+  ret <vscale x 4 x float> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 4 x float> @splice_nxv4f32_clamped(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #20
+; CHECK-NEXT:    cmp x9, #20 // =20
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+  ret <vscale x 2 x double> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 2 x double> @splice_nxv2f64_clamped(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #24
+; CHECK-NEXT:    cmp x9, #24 // =24
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+  ret <vscale x 2 x double> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    and z0.d, z0.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+  ret <vscale x 2 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+  ret <vscale x 4 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+  ret <vscale x 8 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    and z0.b, z0.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+  ret <vscale x 16 x i1> %res
+}
+
+; Verify promote type legalisation works as expected.
+define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
+  ret <vscale x 2 x i8> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
+  ret <vscale x 8 x i32> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <vscale x 16 x float> @splice_nxv16f32_clamped(<vscale x 16 x float> %a, <vscale x 16 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv16f32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-8
+; CHECK-NEXT:    rdvl x9, #4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #68
+; CHECK-NEXT:    cmp x9, #68 // =68
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #4
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #8
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
+  ret <vscale x 16 x float> %res
+}
+
+declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+
+attributes #0 = { nounwind "target-features"="+sve" }