diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -17671,6 +17671,75 @@ The argument to this intrinsic must be a vector. +'``llvm.experimental.vector.deinterleave2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1) + declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1) + +Overview: +""""""""" + +The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two +vectors by deinterleaving the even and odd lanes of the input vector. + +This intrinsic works for both fixed and scalable vectors. While this intrinsic +supports all vector types the recommended way to express this operation for +fixed-width vectors is still to use a shufflevector, as that may allow for more +optimization opportunities. + +For example: + +.. code-block:: text + + {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>} + +Arguments: +"""""""""" + +The argument is a vector whose type corresponds to the logical concatenation of +the two result types. + +'``llvm.experimental.vector.interleave2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2) + declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) + +Overview: +""""""""" + +The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector +by interleaving two input vectors. + +This intrinsic works for both fixed and scalable vectors. 
While this intrinsic +supports all vector types the recommended way to express this operation for +fixed-width vectors is still to use a shufflevector, as that may allow for more +optimization opportunities. + +For example: + +.. code-block:: text + + <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3> + +Arguments: +"""""""""" +Both arguments must be vectors of the same type whereby their logical +concatenation matches the result type. + '``llvm.experimental.vector.splice``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -571,6 +571,19 @@ /// vector, but not the other way around. EXTRACT_SUBVECTOR, + /// VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and + /// output vectors having the same type. The first output contains the even + /// indices from CONCAT_VECTORS(VEC1, VEC2), with the second output + /// containing the odd indices. The relative order of elements within an + /// output match that of the concatenated input. + VECTOR_DEINTERLEAVE, + + /// VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and + /// output vectors having the same type. The first output contains the + /// result of interleaving the low half of CONCAT_VECTORS(VEC1, VEC2), with + /// the second output containing the result of interleaving the high half. 
+ VECTOR_INTERLEAVE, + /// VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, /// whose elements are shuffled using the following algorithm: /// RESULT[i] = VECTOR[VECTOR.ElementCount - 1 - i] diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2116,6 +2116,17 @@ [llvm_anyvector_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + +def int_experimental_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, + LLVMHalfElementsVectorType<0>], + [IntrNoMem]>; + +def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>, + LLVMHalfElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -648,6 +648,8 @@ void visitVectorReduce(const CallInst &I, unsigned Intrinsic); void visitVectorReverse(const CallInst &I); void visitVectorSplice(const CallInst &I); + void visitVectorInterleave(const CallInst &I); + void visitVectorDeinterleave(const CallInst &I); void visitStepVector(const CallInst &I); void visitUserOp1(const Instruction &I) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include
"llvm/CodeGen/AssignmentTrackingAnalysis.h" #include "llvm/CodeGen/CodeGenCommonISel.h" @@ -7319,6 +7320,12 @@ case Intrinsic::experimental_vector_splice: visitVectorSplice(I); return; + case Intrinsic::experimental_vector_interleave2: + visitVectorInterleave(I); + return; + case Intrinsic::experimental_vector_deinterleave2: + visitVectorDeinterleave(I); + return; } } @@ -11549,6 +11556,67 @@ setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask)); } +/// Lower llvm.experimental.vector.deinterleave2: split the operand in half and +/// return the even and odd lanes (shuffles if fixed-length, else an ISD node). +void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) { + auto DL = getCurSDLoc(); + SDValue InVec = getValue(I.getOperand(0)); + EVT OutVT = + InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + + unsigned OutNumElts = OutVT.getVectorMinNumElements(); + + // The ISD node takes the input split into two equal halves. Subvector + // indices must use the target's vector index type rather than a fixed i64. + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getVectorIdxConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getVectorIdxConstant(OutNumElts, DL)); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines.
+ if (OutVT.isFixedLengthVector()) { + SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(0, 2, OutNumElts)); + SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(1, 2, OutNumElts)); + SDValue Res = DAG.getMergeValues({Even, Odd}, DL); + setValue(&I, Res); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, + DAG.getVTList(OutVT, OutVT), Lo, Hi); + setValue(&I, Res); +} + +/// Lower llvm.experimental.vector.interleave2: fixed-length vectors become a +/// shuffle; scalable vectors use VECTOR_INTERLEAVE followed by CONCAT_VECTORS. +void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) { + auto DL = getCurSDLoc(); + SDValue InVec0 = getValue(I.getOperand(0)); + SDValue InVec1 = getValue(I.getOperand(1)); + EVT InVT = InVec0.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines. + if (OutVT.isFixedLengthVector()) { + unsigned NumElts = InVT.getVectorMinNumElements(); + SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1); + setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT), + createInterleaveMask(NumElts, 2))); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, + DAG.getVTList(InVT, InVT), InVec0, InVec1); + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0), + Res.getValue(1)); + setValue(&I, Res); +} + void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -293,6 +293,8 @@ case ISD::CONCAT_VECTORS: return "concat_vectors"; case ISD::INSERT_SUBVECTOR: return "insert_subvector"; case
ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; + case ISD::VECTOR_DEINTERLEAVE: return "vector_deinterleave"; + case ISD::VECTOR_INTERLEAVE: return "vector_interleave"; case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; case ISD::VECTOR_SPLICE: return "vector_splice"; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1047,6 +1047,8 @@ SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1208,6 +1208,8 @@ {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); } } @@ -1253,6 +1255,8 @@ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::UMUL_LOHI, VT, Expand); 
setOperationAction(ISD::SMUL_LOHI, VT, Expand); @@ -1394,6 +1398,8 @@ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); @@ -6064,6 +6070,10 @@ return LowerCTTZ(Op, DAG); case ISD::VECTOR_SPLICE: return LowerVECTOR_SPLICE(Op, DAG); + case ISD::VECTOR_DEINTERLEAVE: + return LowerVECTOR_DEINTERLEAVE(Op, DAG); + case ISD::VECTOR_INTERLEAVE: + return LowerVECTOR_INTERLEAVE(Op, DAG); case ISD::STRICT_LROUND: case ISD::STRICT_LLROUND: case ISD::STRICT_LRINT: @@ -23556,6 +23566,34 @@ } } +SDValue +AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OpVT = Op.getValueType(); + assert(OpVT.isScalableVector() && + "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); + SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getMergeValues({Even, Odd}, DL); +} + +SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OpVT = Op.getValueType(); + assert(OpVT.isScalableVector() && + "Expected scalable vector in LowerVECTOR_INTERLEAVE."); + + SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getMergeValues({Lo, Hi}, DL); +} + SDValue AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll new file 
mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v2.h[1], v1.h[0] +; CHECK-NEXT: mov v1.h[0], v0.h[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret + %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) + ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: uzp1 v2.4h, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret + %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) + ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v2.2s, v0.2s, v1.2s +; CHECK-NEXT: zip2 v1.2s, v0.2s, v1.2s +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret + %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) + ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; 
CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) + ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) + ret {<2 x double>, <2 x double>} %retval +} + +; Integers + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.16b, v0.16b, v1.16b +; CHECK-NEXT: uzp2 v1.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) + ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) + ret {<8 x i16>, <8 x 
i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i32_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) + ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) + ret {<2 x i64>, <2 x i64>} %retval +} + + +; Floating declarations +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) + +; Integer declarations +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll new file mode 100644 --- 
/dev/null +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { +; CHECK-LABEL: interleave2_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) + ret <4 x half> %retval +} + +define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) { +; CHECK-LABEL: interleave2_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-NEXT: ret + %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1) + ret <8 x half> %retval +} + +define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) { +; CHECK-LABEL: interleave2_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1) + ret <16 x half> %retval +} + +define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) { +; CHECK-LABEL: interleave2_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: rev64 v1.4s, v0.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1) + ret <4 x 
float> %retval +} + +define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) { +; CHECK-LABEL: interleave2_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1) + ret <8 x float> %retval +} + +define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) { +; CHECK-LABEL: interleave2_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1) + ret <4 x double> %retval +} + +; Integers + +define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) { +; CHECK-LABEL: interleave2_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.16b, v0.16b, v1.16b +; CHECK-NEXT: zip2 v1.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1) + ret <32 x i8> %retval +} + +define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) { +; CHECK-LABEL: interleave2_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1) + ret <16 x i16> %retval +} + +define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) { +; CHECK-LABEL: interleave2_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x 
i32> %vec0, <4 x i32> %vec1) + ret <8 x i32> %retval +} + +define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) { +; CHECK-LABEL: interleave2_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1) + ret <4 x i64> %retval +} + + +; Float declarations +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) + +; Integer declarations +declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s + +define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.d, z0.s 
+; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec) + ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval +} + +define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s +; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec) + ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval +} + +define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.d, z0.s +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec) + ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval +} + +define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv16f16(<vscale x 16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec) + ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval +} + +define {<vscale x 4 x float>, <vscale x 4 x float>} 
@vector_deinterleave_nxv4f32_nxv8f32(<vscale x 8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec) + ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval +} + +define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec) + ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval +} + +; Integers + +define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z0.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec) + ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval +} + +define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec) + ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+} + +define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv8i32(<vscale x 8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval +} + +define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec) + ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval +} + +; Predicated +define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 p2.b, p0.b, p1.b +; CHECK-NEXT: uzp2 p1.b, p0.b, p1.b +; CHECK-NEXT: mov p0.b, p2.b +; CHECK-NEXT: ret + %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec) + ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval +} + +define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: uzp1 p0.h, p2.h, p1.h +; CHECK-NEXT: uzp2 p1.h, p2.h, p1.h +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec) + ret
{<vscale x 8 x i1>, <vscale x 8 x i1>} %retval +} + +define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: uzp1 p0.s, p2.s, p1.s +; CHECK-NEXT: uzp2 p1.s, p2.s, p1.s +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec) + ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval +} + +define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: uzp1 p0.d, p2.d, p1.d +; CHECK-NEXT: uzp2 p1.d, p2.d, p1.d +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec) + ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval +} + + +; Floating declarations +declare {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>) +declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>) +declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>) +declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>) +declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>) +declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>) + +; Integer declarations +declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale
x 32 x i8>) +declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>) +declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>) +declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>) + +; Predicated declarations +declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>) +declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>) +declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>) +declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s + +define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) { +; CHECK-LABEL: interleave2_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.d, z0.d, z1.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: ret + %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) + ret <vscale x 4 x half> %retval +} + +define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) { +; CHECK-LABEL: interleave2_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + 
%retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) + ret <vscale x 8 x half> %retval +} + +define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) { +; CHECK-LABEL: interleave2_nxv16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.h, z0.h, z1.h +; CHECK-NEXT: zip2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) + ret <vscale x 16 x half> %retval +} + +define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) { +; CHECK-LABEL: interleave2_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.d, z0.d, z1.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: ret + %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) + ret <vscale x 4 x float> %retval +} + +define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) { +; CHECK-LABEL: interleave2_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.s, z0.s, z1.s +; CHECK-NEXT: zip2 z1.s, z0.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) + ret <vscale x 8 x float> %retval +} + +define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1) { +; CHECK-LABEL: interleave2_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.d, z0.d, z1.d +; CHECK-NEXT: zip2 z1.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2
x double> %vec1) + ret <vscale x 4 x double> %retval +} + +; Integers + +define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) { +; CHECK-LABEL: interleave2_nxv32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.b, z0.b, z1.b +; CHECK-NEXT: zip2 z1.b, z0.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) + ret <vscale x 32 x i8> %retval +} + +define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) { +; CHECK-LABEL: interleave2_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.h, z0.h, z1.h +; CHECK-NEXT: zip2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) + ret <vscale x 16 x i16> %retval +} + +define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) { +; CHECK-LABEL: interleave2_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.s, z0.s, z1.s +; CHECK-NEXT: zip2 z1.s, z0.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) + ret <vscale x 8 x i32> %retval +} + +define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) { +; CHECK-LABEL: interleave2_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 z2.d, z0.d, z1.d +; CHECK-NEXT: zip2 z1.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) + ret <vscale x 4 x i64> %retval +} + +; Predicated + +define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x 
i1> %vec1) { +; CHECK-LABEL: interleave2_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p0.b, p1.b +; CHECK-NEXT: zip2 p1.b, p0.b, p1.b +; CHECK-NEXT: mov p0.b, p2.b +; CHECK-NEXT: ret + %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) + ret <vscale x 32 x i1> %retval +} + +define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1) { +; CHECK-LABEL: interleave2_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p2.h, p0.h, p1.h +; CHECK-NEXT: zip1 p0.h, p0.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p2.b +; CHECK-NEXT: ret + %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1) + ret <vscale x 16 x i1> %retval +} + +define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1) { +; CHECK-LABEL: interleave2_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h +; CHECK-NEXT: ret + %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1) + ret <vscale x 8 x i1> %retval +} + +define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1) { +; CHECK-LABEL: interleave2_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p2.d, p0.d, p1.d +; CHECK-NEXT: zip1 p0.d, p0.d, p1.d +; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s +; CHECK-NEXT: ret + %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1) + ret <vscale x 4 x i1> %retval +} + + +; Float declarations +declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>) +declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>) +declare <vscale 
x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>) +declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>) +declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>) +declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>) + +; Integer declarations +declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>) +declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>) +declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>) + +; Predicated +declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>) +declare <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>) +declare <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>) +declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)