Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -16182,6 +16182,42 @@
 vector index constant type (for most targets this will be an integer pointer
 type).
 
+'``llvm.experimental.vector.splice``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %trailing.elts)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %trailing.elts)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
+concatenating the trailing elements of the first input vector with the
+leading elements of the second input vector, returning a vector of the same
+type as the input vectors.
+
+For example:
+
+.. code-block:: text
+
+  experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 3) ==> <B, C, D, E>
+
+Arguments:
+""""""""""
+
+The first two operands are vectors with the same type. The third argument
+``trailing.elts`` specifies the number of trailing elements to extract from
+the first vector; if this exceeds the known minimum number of elements in the
+first vector, it is clamped.
+
+
 Matrix Intrinsics
 -----------------
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -549,6 +549,11 @@
   /// in terms of the element size of VEC1/VEC2, not in terms of bytes.
   VECTOR_SHUFFLE,
 
+  // VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a vector, of the same type as
+  // VEC1/VEC2, whose elements are shuffled using the following algorithm:
+  //   RESULT[i] = CONCAT_VECTORS(VEC1, VEC2)[VEC1.ElementCount - IMM + i]
+  VECTOR_SPLICE,
+
   /// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
   /// scalar value into element 0 of the resultant vector type. The top
   /// elements 1 to N-1 of the N-element vector are undefined. The type
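As a minimal IR usage sketch of the new intrinsic (the function name is
illustrative; the call mirrors the fixed-length tests added below): with
trailing.elts == 3, the result is the last three elements of %a followed by
the first element of %b, matching the LangRef example above.

  define <4 x i32> @splice_example(<4 x i32> %a, <4 x i32> %b) {
    ; <a0,a1,a2,a3>, <b0,b1,b2,b3>, 3 ==> <a1, a2, a3, b0>
    %res = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
    ret <4 x i32> %res
  }

  declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)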
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1634,6 +1634,13 @@
                                            [llvm_anyvector_ty, llvm_i64_ty],
                                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
+//===---------- Named shufflevector intrinsics ------===//
+def int_experimental_vector_splice : Intrinsic<[llvm_anyvector_ty],
+                                               [LLVMMatchType<0>,
+                                                LLVMMatchType<0>,
+                                                llvm_i32_ty],
+                                               [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -241,6 +241,9 @@
 def SDTVecShuffle : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
+def SDTVecSlice : SDTypeProfile<1, 3, [     // vector splice
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisInt<3>
+]>;
 def SDTVecExtract : SDTypeProfile<1, 2, [   // vector extract
   SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
 ]>;
@@ -651,6 +654,7 @@
                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
+def vector_splice : SDNode<"ISD::VECTOR_SPLICE", SDTVecSlice, []>;
 def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
 def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -188,6 +188,7 @@
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+  SDValue ExpandVectorSpliceThroughStack(SDNode *Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1469,6 +1470,64 @@
   return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo);
 }
 
+SDValue SelectionDAGLegalize::ExpandVectorSpliceThroughStack(SDNode *Node) {
+  assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!");
+  assert(Node->getValueType(0).isScalableVector() &&
+         "Fixed length vector types expected to use SHUFFLE_VECTOR!");
+
+  EVT VT = Node->getValueType(0);
+  SDValue V1 = Node->getOperand(0);
+  SDValue V2 = Node->getOperand(1);
+  uint64_t TrailingElts = Node->getConstantOperandVal(2);
+  SDLoc DL(Node);
+
+  // Expand through memory thusly:
+  //  Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
+  //  Store V1, Ptr
+  //  Store V2, Ptr + sizeof(V1)
+  //  Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
+  //  Res = Load Ptr
+
+  Type *StoreType = VT.getTypeForEVT(*DAG.getContext());
+  Align Alignment = DAG.getDataLayout().getPrefTypeAlign(StoreType);
+
+  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                               VT.getVectorElementCount() * 2);
+  SDValue Ptr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
+  EVT PtrVT = Ptr.getValueType();
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(Ptr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  // Store the lo part of CONCAT_VECTORS(V1, V2)
+  SDValue StoreV1 =
+      DAG.getStore(DAG.getEntryNode(), DL, V1, Ptr, PtrInfo);
+  // Store the hi part of CONCAT_VECTORS(V1, V2)
+  SDValue OffsetToV2 = DAG.getVScale(
+      DL, PtrVT,
+      APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, OffsetToV2);
+  SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, Ptr, PtrInfo);
+
+  // NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
+  TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
+  SDValue TrailingBytes =
+      DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
+
+  if (TrailingElts > VT.getVectorMinNumElements()) {
+    SDValue VLBytes = DAG.getVScale(
+        DL, PtrVT,
+        APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
+    TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
+  }
+
+  // Calculate the start address of the spliced result.
+  Ptr = DAG.getNode(ISD::SUB, DL, PtrVT, Ptr, TrailingBytes);
+
+  // Load the spliced result
+  return DAG.getLoad(VT, DL, StoreV2, Ptr,
+                     MachinePointerInfo::getUnknownStack(MF));
+}
+
 /// Bitcast a floating-point value to an integer value. Only bitcast the part
 /// containing the sign bit if the target has no integer value capable of
 /// holding all bits of the floating-point value.
@@ -3319,6 +3378,10 @@
     Results.push_back(Tmp1);
     break;
   }
+  case ISD::VECTOR_SPLICE: {
+    Results.push_back(ExpandVectorSpliceThroughStack(Node));
+    break;
+  }
   case ISD::EXTRACT_ELEMENT: {
     EVT OpTy = Node->getOperand(0).getValueType();
     if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
@@ -4825,6 +4888,14 @@
     Results.push_back(Tmp1);
     break;
   }
+  case ISD::VECTOR_SPLICE: {
+    Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(ISD::VECTOR_SPLICE, dl, NVT, Tmp1, Tmp2,
+                       Node->getOperand(2));
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp3));
+    break;
+  }
   case ISD::SETCC:
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS: {
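For reference, a rough IR-level equivalent of the expand-through-stack
sequence above for <vscale x 4 x i32> with TrailingElts == 1 (the function
and value names are illustrative, and the clamp is omitted because 1 can
never exceed the minimum element count):

  define <vscale x 4 x i32> @splice_via_stack_sketch(<vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2) {
    ; Stack slot big enough for CONCAT_VECTORS(V1, V2).
    %mem = alloca <vscale x 8 x i32>
    %v1.ptr = bitcast <vscale x 8 x i32>* %mem to <vscale x 4 x i32>*
    store <vscale x 4 x i32> %v1, <vscale x 4 x i32>* %v1.ptr
    ; Ptr + sizeof(V1): the start of V2 within the concatenation.
    %v2.ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %v1.ptr, i64 1
    store <vscale x 4 x i32> %v2, <vscale x 4 x i32>* %v2.ptr
    ; Rewind by TrailingElts (= 1) elements and load the result.
    %v2.elt = bitcast <vscale x 4 x i32>* %v2.ptr to i32*
    %start = getelementptr i32, i32* %v2.elt, i64 -1
    %res.ptr = bitcast i32* %start to <vscale x 4 x i32>*
    %res = load <vscale x 4 x i32>, <vscale x 4 x i32>* %res.ptr
    ret <vscale x 4 x i32> %res
  }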
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -98,6 +98,8 @@
     Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
   case ISD::VECTOR_SHUFFLE:
     Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+  case ISD::VECTOR_SPLICE:
+    Res = PromoteIntRes_VECTOR_SPLICE(N); break;
   case ISD::INSERT_VECTOR_ELT:
     Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
   case ISD::BUILD_VECTOR:
@@ -4587,6 +4589,15 @@
   return Swap.getValue(1);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
+  SDLoc dl(N);
+
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = GetPromotedInteger(N->getOperand(1));
+  EVT OutVT = V0.getValueType();
+
+  return DAG.getNode(ISD::VECTOR_SPLICE, dl, OutVT, V0, V1, N->getOperand(2));
+}
+
 
 SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -299,6 +299,7 @@
   SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
   SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
@@ -835,6 +836,7 @@
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
                                   SDValue &Hi);
+  void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
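The promoted splice preserves the original semantics because the widened high
bits are discarded again on the way back down. A rough IR analogue for
<vscale x 2 x i8> (function name illustrative; the legalizer uses ANY_EXTEND,
zext is chosen here only to keep the sketch deterministic):

  define <vscale x 2 x i8> @splice_promoted_sketch(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
    ; Widen the illegal element type, splice in the wide type, truncate back.
    %a.wide = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
    %b.wide = zext <vscale x 2 x i8> %b to <vscale x 2 x i64>
    %sp = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a.wide, <vscale x 2 x i64> %b.wide, i32 2)
    %res = trunc <vscale x 2 x i64> %sp to <vscale x 2 x i8>
    ret <vscale x 2 x i8> %res
  }

  declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)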
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -933,6 +933,9 @@
   case ISD::VECTOR_SHUFFLE:
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
    break;
+  case ISD::VECTOR_SPLICE:
+    SplitVecRes_VECTOR_SPLICE(N, Lo, Hi);
+    break;
   case ISD::VAARG:
     SplitVecRes_VAARG(N, Lo, Hi);
     break;
@@ -5492,3 +5495,64 @@
     Ops[Idx] = FillVal;
   return DAG.getBuildVector(NVT, dl, Ops);
 }
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  uint64_t TrailingElts = N->getConstantOperandVal(2);
+  SDLoc DL(N);
+
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  // The operation cannot be split in two so expand it instead:
+  //  Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
+  //  Store V1, Ptr
+  //  Store V2, Ptr + sizeof(V1)
+  //  Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
+  //  Lo = Load Ptr
+  //  Hi = Load Ptr + sizeof(Lo)
+
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
+
+  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                               VT.getVectorElementCount() * 2);
+  SDValue Ptr = DAG.CreateStackTemporary(MemVT.getStoreSize(), SmallestAlign);
+  EVT PtrVT = Ptr.getValueType();
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(Ptr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  // Store the lo part of CONCAT_VECTORS(V1, V2)
+  SDValue StoreV1 =
+      DAG.getStore(DAG.getEntryNode(), DL, V1, Ptr, PtrInfo, SmallestAlign);
+  // Store the hi part of CONCAT_VECTORS(V1, V2)
+  IncrementPointer(cast<StoreSDNode>(StoreV1), VT, PtrInfo, Ptr);
+  SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, Ptr, PtrInfo, SmallestAlign);
+
+  // NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
+  TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
+  SDValue TrailingBytes =
+      DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
+  if (TrailingElts > VT.getVectorMinNumElements()) {
+    SDValue VLBytes = DAG.getVScale(
+        DL, PtrVT,
+        APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
+    TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
+  }
+
+  // Calculate the start address of the spliced result.
+  Ptr = DAG.getNode(ISD::SUB, DL, PtrVT, Ptr, TrailingBytes);
+
+  // Load the lo part of the spliced result
+  Lo = DAG.getLoad(LoVT, DL, StoreV2, Ptr,
+                   MachinePointerInfo::getUnknownStack(MF));
+  // Load the hi part of the spliced result
+  MachinePointerInfo MPI = cast<LoadSDNode>(Lo)->getPointerInfo();
+  IncrementPointer(cast<LoadSDNode>(Lo), LoVT, MPI, Ptr);
+  Hi = DAG.getLoad(HiVT, DL, StoreV2, Ptr, MPI);
+}
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -773,6 +773,7 @@
   void visitGCResult(const GCResultInst &I);
 
   void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+  void visitVectorSplice(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6998,6 +6998,9 @@
     setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index));
     return;
   }
+  case Intrinsic::experimental_vector_splice:
+    visitVectorSplice(I);
+    return;
   }
 }
@@ -10822,3 +10825,31 @@
   setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(ValueVTs), Values));
 }
+
+void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+  SDLoc DL = getCurSDLoc();
+  SDValue V1 = getValue(I.getOperand(0));
+  SDValue V2 = getValue(I.getOperand(1));
+  unsigned TrailingElts = cast<ConstantInt>(I.getOperand(2))->getZExtValue();
+
+  // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
+  if (VT.isScalableVector()) {
+    MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+    setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
+                             DAG.getConstant(TrailingElts, DL, IdxVT)));
+    return;
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(TrailingElts <= NumElts && "Invalid number of trailing elements!");
+
+  // Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0; i != NumElts; ++i)
+    Mask.push_back(NumElts - TrailingElts + i);
+
+  setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask));
+}
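For fixed-length vectors the splice is therefore just an ordinary shuffle; as
a sketch (function name illustrative), a <4 x i32> splice with
trailing.elts == 1 uses mask elements NumElts - TrailingElts + i, i.e.
<3, 4, 5, 6>:

  define <4 x i32> @splice_as_shuffle(<4 x i32> %a, <4 x i32> %b) {
    ; Last element of %a followed by the first three elements of %b.
    %res = shufflevector <4 x i32> %a, <4 x i32> %b,
                         <4 x i32> <i32 3, i32 4, i32 5, i32 6>
    ret <4 x i32> %res
  }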
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -288,6 +288,7 @@
   case ISD::EXTRACT_SUBVECTOR:          return "extract_subvector";
   case ISD::SCALAR_TO_VECTOR:           return "scalar_to_vector";
   case ISD::VECTOR_SHUFFLE:             return "vector_shuffle";
+  case ISD::VECTOR_SPLICE:              return "vector_splice";
   case ISD::SPLAT_VECTOR:               return "splat_vector";
   case ISD::CARRY_FALSE:                return "carry_false";
   case ISD::ADDC:                       return "addc";
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -849,6 +849,9 @@
     setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand);
     setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Expand);
     setOperationAction(ISD::VECREDUCE_SEQ_FMUL, VT, Expand);
+
+    // Named vector shuffles default to expand.
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -900,6 +900,7 @@
   SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1090,6 +1090,7 @@
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
       setOperationAction(ISD::UDIV, VT, Custom);
       setOperationAction(ISD::SMIN, VT, Custom);
@@ -1253,6 +1254,15 @@
      for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
    }
+
+    setOperationAction(ISD::VECTOR_SPLICE, MVT::nxv2i1, Promote);
+    AddPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
+    setOperationAction(ISD::VECTOR_SPLICE, MVT::nxv4i1, Promote);
+    AddPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
+    setOperationAction(ISD::VECTOR_SPLICE, MVT::nxv8i1, Promote);
+    AddPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
+    setOperationAction(ISD::VECTOR_SPLICE, MVT::nxv16i1, Promote);
+    AddPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
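An IR-level sketch of the predicate promotion these hooks request (names
illustrative): the i1 elements are materialized as integer vectors, spliced
in the promoted type, then compared back down to a predicate, matching the
mov/and/cmpne sequences in the nxv*i1 tests below.

  define <vscale x 2 x i1> @splice_pred_sketch(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) {
    %a.wide = zext <vscale x 2 x i1> %a to <vscale x 2 x i64>
    %b.wide = zext <vscale x 2 x i1> %b to <vscale x 2 x i64>
    %sp = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a.wide, <vscale x 2 x i64> %b.wide, i32 1)
    %res = icmp ne <vscale x 2 x i64> %sp, zeroinitializer
    ret <vscale x 2 x i1> %res
  }

  declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)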
Index: llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; VECTOR_SPLICE
+;
+
+define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
+; CHECK-LABEL: splice_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 15)
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @splice_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: splice_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
+; CHECK-NEXT:    ret
+  %res = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> %a, <8 x i16> %b, i32 6)
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @splice_v4i32(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK-LABEL: splice_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    ret
+  %res = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @splice_v2i64(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: splice_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    ret
+  %res = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> %a, <2 x i64> %b, i32 1)
+  ret <2 x i64> %res
+}
+
+define <8 x half> @splice_v8f16(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: splice_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #2
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 7)
+  ret <8 x half> %res
+}
+
+define <4 x float> @splice_v4f32(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: splice_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> %a, <4 x float> %b, i32 3)
+  ret <4 x float> %res
+}
+
+define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: splice_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    ret
+  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
+  ret <2 x double> %res
+}
+
+; Verify promote type legalisation works as expected.
+define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
+; CHECK-LABEL: splice_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-NEXT:    ret
+  %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
+  ret <2 x i8> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
+; CHECK-LABEL: splice_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
+; CHECK-NEXT:    ret
+  %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 3)
+  ret <8 x i32> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
+; CHECK-LABEL: splice_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
+; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT:    ret
+  %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 9)
+  ret <16 x float> %res
+}
+
+declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
+declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
+declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
+declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+
+attributes #0 = { nounwind "target-features"="+neon" }
Index: llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -0,0 +1,631 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; VECTOR_SPLICE
+;
+
+define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 16)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv16i8_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
+  ret <vscale x 16 x i8> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 16 x i8> @splice_nxv16i8_clamped(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv16i8_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #17
+; CHECK-NEXT:    cmp x9, #17 // =17
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 17)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 8)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
+  ret <vscale x 8 x i16> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 8 x i16> @splice_nxv8i16_clamped(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: splice_nxv8i16_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #18
+; CHECK-NEXT:    cmp x9, #18 // =18
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 9)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 4)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
+  ret <vscale x 4 x i32> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 4 x i32> @splice_nxv4i32_clamped(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv4i32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #20
+; CHECK-NEXT:    cmp x9, #20 // =20
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 5)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 2)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
+  ret <vscale x 2 x i64> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 2 x i64> @splice_nxv2i64_clamped(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: splice_nxv2i64_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #24
+; CHECK-NEXT:    cmp x9, #24 // =24
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 8)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
+  ret <vscale x 8 x half> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 8 x half> @splice_nxv8f16_clamped(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv8f16_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #18
+; CHECK-NEXT:    cmp x9, #18 // =18
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 9)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 4)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
+  ret <vscale x 4 x float> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 4 x float> @splice_nxv4f32_clamped(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv4f32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #20
+; CHECK-NEXT:    cmp x9, #20 // =20
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 5)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 2)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
+  ret <vscale x 2 x double> %res
+}
+
+; Ensure number of trailing elements is clamped when we cannot prove it's less than VL.
+define <vscale x 2 x double> @splice_nxv2f64_clamped(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: splice_nxv2f64_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #24
+; CHECK-NEXT:    cmp x9, #24 // =24
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
+  ret <vscale x 2 x double> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #8 // =8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    and z0.d, z0.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
+  ret <vscale x 2 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #4 // =4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 1)
+  ret <vscale x 4 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #2 // =2
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 1)
+  ret <vscale x 8 x i1> %res
+}
+
+; Ensure predicate based splice is promoted to use ZPRs.
+define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
+; CHECK-LABEL: splice_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    and z0.b, z0.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 1)
+  ret <vscale x 16 x i1> %res
+}
+
+; Verify promote type legalisation works as expected.
+define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
+; CHECK-LABEL: splice_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16 // =16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 2)
+  ret <vscale x 2 x i8> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: splice_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 8)
+  ret <vscale x 8 x i32> %res
+}
+
+; Verify splitvec type legalisation works as expected.
+define <vscale x 16 x float> @splice_nxv16f32_clamped(<vscale x 16 x float> %a, <vscale x 16 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv16f32_clamped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-8
+; CHECK-NEXT:    rdvl x9, #4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w10, #68
+; CHECK-NEXT:    cmp x9, #68 // =68
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #4
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #8
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 17)
+  ret <vscale x 16 x float> %res
+}
+
+declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+
+attributes #0 = { nounwind "target-features"="+sve" }