diff --git a/llvm/0001-IR-Add-new-intrinsics-interleave-and-deinterleave-ve.patch b/llvm/0001-IR-Add-new-intrinsics-interleave-and-deinterleave-ve.patch
new file mode 100644
--- /dev/null
+++ b/llvm/0001-IR-Add-new-intrinsics-interleave-and-deinterleave-ve.patch
@@ -0,0 +1,1068 @@
+From 901c81fc6fc763d6539c7537c585e60952fc666f Mon Sep 17 00:00:00 2001
+From: Caroline Concatto <caroline.concatto@arm.com>
+Date: Tue, 17 Jan 2023 13:19:47 +0000
+Subject: [PATCH] [IR] Add new intrinsics interleave and deinterleave vectors
+
+This patch adds 2 new intrinsics:
+
+  ; Interleave two vectors into a wider vector
+  <vscale x 4 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 2 x i64> %even, <vscale x 2 x i64> %odd)
+
+  ; Deinterleave the odd and even lanes from a wider vector
+  {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv2i64(<vscale x 4 x i64> %vec)
+
+The main motivator for adding these intrinsics is to support vectorization of
+complex types using scalable vectors.
+
+The intrinsics are kept simple by only supporting a stride of 2, which makes
+them easy to lower and type-legalize. A stride of 2 is sufficient to handle
+complex types, which only have a real/imaginary component.
+
+The format of the intrinsics matches how `shufflevector` is used in
+LoopVectorize. For example:
+
+  using cf = std::complex<float>;
+
+  void foo(cf * dst, int N) {
+    for (int i = 0; i < N; ++i)
+      dst[i] += cf(1.f, 2.f);
+  }
+
+For this loop, the vectorizer generates IR that:
+  (1) Loads a wide vector (e.g. <8 x float>)
+  (2) Extracts odd lanes using shufflevector (leading to <4 x float>)
+  (3) Extracts even lanes using shufflevector (leading to <4 x float>)
+  (4) Performs the addition
+  (5) Interleaves the two <4 x float> vectors into a single <8 x float> using
+      shufflevector
+  (6) Stores the wide vector.
+
+In this example, we can 1-1 replace the shufflevectors in (2) and (3) with the
+deinterleave intrinsic, and replace the shufflevector in (5) with the
+interleave intrinsic.
+
+The SelectionDAG nodes might be extended to support higher strides (3, 4, etc)
+as well in the future.
+
+Similar to what was done for vector.splice and vector.reverse, the intrinsic
+is lowered to a shufflevector when the type is fixed width, so as to benefit
+from existing code that was written to recognize/optimize shufflevector
+patterns.
+
+Note that this approach does not prevent us from adding new intrinsics for
+other strides, or adding a more generic shuffle intrinsic in the future. It
+just solves the immediate problem of being able to vectorize loops with
+complex math.
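+
+As an illustrative sketch (not code from this patch: the function name is
+hypothetical, the splats of 1.f/2.f are passed in as arguments for brevity,
+and intrinsic declarations are omitted), the vector body of the loop above
+can be expressed with the new intrinsics like this:
+
+  define void @complex_add(ptr %dst, <vscale x 4 x float> %re.splat, <vscale x 4 x float> %im.splat) {
+    %wide = load <vscale x 8 x float>, ptr %dst, align 4
+    ; split the wide vector into real (even) and imaginary (odd) lanes
+    %halves = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %wide)
+    %re = extractvalue {<vscale x 4 x float>, <vscale x 4 x float>} %halves, 0
+    %im = extractvalue {<vscale x 4 x float>, <vscale x 4 x float>} %halves, 1
+    %re.add = fadd <vscale x 4 x float> %re, %re.splat
+    %im.add = fadd <vscale x 4 x float> %im, %im.splat
+    ; interleave the updated lanes back into a wide vector and store it
+    %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %re.add, <vscale x 4 x float> %im.add)
+    store <vscale x 8 x float> %res, ptr %dst, align 4
+    ret void
+  }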
+
+Differential Revision: https://reviews.llvm.org/D141924
+---
+ llvm/docs/LangRef.rst                         |  69 +++++++
+ llvm/include/llvm/CodeGen/ISDOpcodes.h        |  13 ++
+ llvm/include/llvm/IR/Intrinsics.td            |  11 ++
+ .../SelectionDAG/SelectionDAGBuilder.cpp      |  64 ++++++
+ .../SelectionDAG/SelectionDAGBuilder.h        |   2 +
+ .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +
+ .../Target/AArch64/AArch64ISelLowering.cpp    |  38 ++++
+ llvm/lib/Target/AArch64/AArch64ISelLowering.h |   2 +
+ .../AArch64/fixed-vector-deinterleave.ll      | 136 +++++++++++++
+ .../AArch64/fixed-vector-interleave.ll        | 133 +++++++++++++
+ .../AArch64/sve-vector-deinterleave.ll        | 186 ++++++++++++++++++
+ .../CodeGen/AArch64/sve-vector-interleave.ll  | 181 +++++++++++++++++
+ 12 files changed, 837 insertions(+)
+ create mode 100644 llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+ create mode 100644 llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+ create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+ create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+
+diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
+index 79589c21275b..d796a34899cf 100644
+--- a/llvm/docs/LangRef.rst
++++ b/llvm/docs/LangRef.rst
+@@ -17671,6 +17671,75 @@ Arguments:
+ 
+ The argument to this intrinsic must be a vector.
+ 
++'``llvm.experimental.vector.deinterleave2``' Intrinsic
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++Syntax:
++"""""""
++This is an overloaded intrinsic.
++
++::
++
++      declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1)
++      declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
++
++Overview:
++"""""""""
++
++The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two
++vectors by deinterleaving the even and odd lanes of the input vector.
++
++This intrinsic works for both fixed and scalable vectors. While this intrinsic
++is marked as experimental, the recommended way to express this operation for
++fixed-width vectors is still to use a shufflevector, as that may allow for more
++optimization opportunities.
++
++For example:
++
++.. code-block:: text
++
++  {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
++
++Arguments:
++""""""""""
++
++The argument to this intrinsic must be a vector that is twice the length of
++each result vector.
++
++'``llvm.experimental.vector.interleave2``' Intrinsic
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++Syntax:
++"""""""
++This is an overloaded intrinsic.
++
++::
++
++      declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
++      declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
++
++Overview:
++"""""""""
++
++The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector
++by interleaving two input vectors.
++
++This intrinsic works for both fixed and scalable vectors. While this intrinsic
++is marked as experimental, the recommended way to express this operation for
++fixed-width vectors is still to use a shufflevector, as that may allow for more
++optimization opportunities.
++
++For example:
++
++.. code-block:: text
++
++  <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
++
++Arguments:
++""""""""""
++Both arguments must be vectors of the same type whereby their logical
++concatenation matches the result type.
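++
++As a sketch of the fixed-width equivalence mentioned above (illustrative IR,
++not taken from this patch; value names are hypothetical), interleaving and
++deinterleaving two ``<2 x i64>`` values can be expressed directly with
++``shufflevector``:
++
++.. code-block:: text
++
++  ; interleave %lo and %hi into %vec
++  %vec  = shufflevector <2 x i64> %lo, <2 x i64> %hi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
++  ; deinterleave %vec back into its even and odd lanes
++  %even = shufflevector <4 x i64> %vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
++  %odd  = shufflevector <4 x i64> %vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>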
++
+ '``llvm.experimental.vector.splice``' Intrinsic
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
+index f5e543710026..97891c70d0e5 100644
+--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
++++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
+@@ -571,6 +571,19 @@ enum NodeType {
+   /// vector, but not the other way around.
+   EXTRACT_SUBVECTOR,
+ 
++  /// VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and
++  /// output vectors having the same type. The first output contains the even
++  /// indices from CONCAT_VECTORS(VEC1, VEC2), with the second output
++  /// containing the odd indices. The relative order of elements within an
++  /// output matches that of the concatenated input.
++  VECTOR_DEINTERLEAVE,
++
++  /// VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and
++  /// output vectors having the same type. The first output contains the
++  /// result of interleaving the low half of CONCAT_VECTORS(VEC1, VEC2), with
++  /// the second output containing the result of interleaving the high half.
++  VECTOR_INTERLEAVE,
++
+   /// VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR,
+   /// whose elements are shuffled using the following algorithm:
+   ///   RESULT[i] = VECTOR[VECTOR.ElementCount - 1 - i]
+diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
+index e0fd727607ce..90e0aa8acefa 100644
+--- a/llvm/include/llvm/IR/Intrinsics.td
++++ b/llvm/include/llvm/IR/Intrinsics.td
+@@ -2116,6 +2116,17 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                [llvm_anyvector_ty, llvm_i64_ty],
+                                                [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
+ 
++
++def int_experimental_vector_interleave2   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
++                                                                  [LLVMHalfElementsVectorType<0>,
++                                                                   LLVMHalfElementsVectorType<0>],
++                                                                  [IntrNoMem]>;
++
++def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
++                                                                   LLVMHalfElementsVectorType<0>],
++                                                                  [llvm_anyvector_ty],
++                                                                  [IntrNoMem]>;
++
+ //===----------------- Pointer Authentication Intrinsics ------------------===//
+ //
+ 
+diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+index 96bf1943444d..5f9f85b0bcac 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+@@ -29,6 +29,7 @@
+ #include "llvm/Analysis/MemoryLocation.h"
+ #include "llvm/Analysis/TargetLibraryInfo.h"
+ #include "llvm/Analysis/ValueTracking.h"
++#include "llvm/Analysis/VectorUtils.h"
+ #include "llvm/CodeGen/Analysis.h"
+ #include "llvm/CodeGen/AssignmentTrackingAnalysis.h"
+ #include "llvm/CodeGen/CodeGenCommonISel.h"
+@@ -7319,6 +7320,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
+   case Intrinsic::experimental_vector_splice:
+     visitVectorSplice(I);
+     return;
++  case Intrinsic::experimental_vector_interleave2:
++    visitVectorInterleave(I);
++    return;
++  case Intrinsic::experimental_vector_deinterleave2:
++    visitVectorDeinterleave(I);
++    return;
+   }
+ }
+ 
+@@ -11549,6 +11556,63 @@ void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) {
+   setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask));
+ }
+ 
++void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) {
++  auto DL = getCurSDLoc();
++  SDValue InVec = getValue(I.getOperand(0));
++  EVT OutVT =
++      InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
++
++  unsigned OutNumElts = OutVT.getVectorMinNumElements();
++
++  // ISD Node needs the input vectors split into two equal parts
++  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
++                           DAG.getConstant(0, DL, MVT::i64));
++  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
++                           DAG.getConstant(OutNumElts, DL, MVT::i64));
++
++  // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
++  // legalisation and combines.
++  if (OutVT.isFixedLengthVector()) {
++    SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
++                                        createStrideMask(0, 2, OutNumElts));
++    SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
++                                       createStrideMask(1, 2, OutNumElts));
++    SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc());
++    setValue(&I, Res);
++    return;
++  }
++
++  SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
++                            DAG.getVTList(OutVT, OutVT), Lo, Hi);
++  setValue(&I, Res);
++  return;
++}
++
++void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) {
++  auto DL = getCurSDLoc();
++  EVT InVT = getValue(I.getOperand(0)).getValueType();
++  SDValue InVec0 = getValue(I.getOperand(0));
++  SDValue InVec1 = getValue(I.getOperand(1));
++  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
++  EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
++
++  // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
++  // legalisation and combines.
++  if (OutVT.isFixedLengthVector()) {
++    unsigned NumElts = InVT.getVectorMinNumElements();
++    SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1);
++    setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT),
++                                      createInterleaveMask(NumElts, 2)));
++    return;
++  }
++
++  SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
++                            DAG.getVTList(InVT, InVT), InVec0, InVec1);
++  Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0),
++                    Res.getValue(1));
++  setValue(&I, Res);
++  return;
++}
++
+ void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
+   SmallVector<EVT, 3> ValueVTs;
+   ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+index bf2111013461..74054ce89b17 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+@@ -648,6 +648,8 @@ private:
+   void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+   void visitVectorReverse(const CallInst &I);
+   void visitVectorSplice(const CallInst &I);
++  void visitVectorInterleave(const CallInst &I);
++  void visitVectorDeinterleave(const CallInst &I);
+   void visitStepVector(const CallInst &I);
+ 
+   void visitUserOp1(const Instruction &I) {
+diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+index 85b20af97a97..f31147bb30a8 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+@@ -293,6 +293,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
+   case ISD::CONCAT_VECTORS:             return "concat_vectors";
+   case ISD::INSERT_SUBVECTOR:           return "insert_subvector";
+   case ISD::EXTRACT_SUBVECTOR:          return "extract_subvector";
++  case ISD::VECTOR_DEINTERLEAVE:        return "vector_deinterleave";
++  case ISD::VECTOR_INTERLEAVE:          return "vector_interleave";
+   case ISD::SCALAR_TO_VECTOR:
return "scalar_to_vector"; + case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; + case ISD::VECTOR_SPLICE: return "vector_splice"; +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index 324d1cf0d007..b36e2bbb6d71 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -1208,6 +1208,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); ++ setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); ++ setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + } + } + +@@ -1253,6 +1255,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); ++ setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); ++ setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); +@@ -1394,6 +1398,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); ++ setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); ++ setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); +@@ -6064,6 +6070,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + return LowerCTTZ(Op, DAG); + case ISD::VECTOR_SPLICE: + return LowerVECTOR_SPLICE(Op, DAG); ++ case ISD::VECTOR_DEINTERLEAVE: ++ return LowerVECTOR_DEINTERLEAVE(Op, DAG); ++ case ISD::VECTOR_INTERLEAVE: ++ return LowerVECTOR_INTERLEAVE(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: +@@ -23556,6 +23566,34 @@ AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op, + } + } + ++SDValue ++AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, ++ SelectionDAG &DAG) const { ++ SDLoc DL(Op); ++ EVT OpVT = Op.getValueType(); ++ assert(OpVT.isScalableVector() && ++ "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); ++ SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0), ++ Op.getOperand(1)); ++ SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0), ++ Op.getOperand(1)); ++ return DAG.getMergeValues({Even, Odd}, DL); ++} ++ ++SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, ++ SelectionDAG &DAG) const { ++ SDLoc DL(Op); ++ EVT OpVT = Op.getValueType(); ++ assert(OpVT.isScalableVector() && ++ "Expected scalable vector in LowerVECTOR_INTERLEAVE."); ++ ++ SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0), ++ Op.getOperand(1)); ++ SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0), ++ Op.getOperand(1)); ++ return DAG.getMergeValues({Lo, Hi}, DL); ++} ++ + SDValue + AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, + SelectionDAG &DAG) const { +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +index 3731c5ae2408..6841c8bf42af 100644 +--- 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.h ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +@@ -1047,6 +1047,8 @@ private: + SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; +diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +new file mode 100644 +index 000000000000..a6e803ddd84f +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +@@ -0,0 +1,136 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s ++ ++define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { ++; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: dup v1.2s, v0.s[1] ++; CHECK-NEXT: mov v2.16b, v0.16b ++; CHECK-NEXT: mov v2.h[1], v1.h[0] ++; CHECK-NEXT: mov v1.h[0], v0.h[1] ++; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ++; CHECK-NEXT: fmov d0, d2 ++; CHECK-NEXT: ret ++%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) ++ret {<2 x half>, <2 x half>} %retval ++} ++ ++define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { ++; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ++; CHECK-NEXT: uzp1 v2.4h, v0.4h, v1.4h ++; CHECK-NEXT: uzp2 v1.4h, v0.4h, v1.4h ++; CHECK-NEXT: fmov d0, d2 ++; CHECK-NEXT: ret ++%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) ++ret {<4 x half>, <4 x half>} %retval ++} ++ ++define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { ++; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: ++; CHECK: // %bb.0: ++; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ++; CHECK-NEXT: zip1 v2.2s, v0.2s, v1.2s ++; CHECK-NEXT: zip2 v1.2s, v0.2s, v1.2s ++; CHECK-NEXT: fmov d0, d2 ++; CHECK-NEXT: ret ++%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) ++ret {<2 x float>, <2 x float>} %retval ++} ++ ++define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { ++; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h ++; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) ++ret {<8 x half>, <8 x half>} %retval ++} ++ ++define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { ++; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: ++; CHECK: // %bb.0: ++; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s ++; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) ++ret {<4 x float>, 
<4 x float>} %retval ++} ++ ++define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { ++; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d ++; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) ++ret {<2 x double>, <2 x double>} %retval ++} ++ ++; Integers ++ ++define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { ++; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: uzp1 v2.16b, v0.16b, v1.16b ++; CHECK-NEXT: uzp2 v1.16b, v0.16b, v1.16b ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) ++ret {<16 x i8>, <16 x i8>} %retval ++} ++ ++define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { ++; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h ++; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) ++ret {<8 x i16>, <8 x i16>} %retval ++} ++ ++define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 x i32> %vec) { ++; CHECK-LABEL: vector_deinterleave_v4i32_v8i32: ++; CHECK: // %bb.0: ++; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s ++; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) ++ret {<4 x i32>, <4 x i32>} %retval ++} ++ ++define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { ++; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d ++; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) ++ret {<2 x i64>, <2 x i64>} %retval ++} ++ ++ ++; Floating declarations ++declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) ++declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) ++declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) ++declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) ++declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) ++declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) ++ ++; Integer declarations ++declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) ++declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) ++declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) ++declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) ++ +diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +new file mode 100644 +index 000000000000..f9e56633195d +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +@@ -0,0 +1,133 @@ ++; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s ++ ++define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { ++; CHECK-LABEL: interleave2_v4f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h ++; CHECK-NEXT: ret ++ %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ++ ret <4 x half> %retval ++} ++ ++define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) { ++; CHECK-LABEL: interleave2_v8f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: adrp x8, .LCPI1_0 ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: mov v0.d[1], v1.d[0] ++; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ++; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b ++; CHECK-NEXT: ret ++ %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1) ++ ret <8 x half> %retval ++} ++ ++define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) { ++; CHECK-LABEL: interleave2_v16f16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h ++; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++ %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1) ++ ret <16 x half> %retval ++} ++ ++define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) { ++; CHECK-LABEL: interleave2_v4f32: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: mov v0.d[1], v1.d[0] ++; CHECK-NEXT: rev64 v1.4s, v0.4s ++; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++ %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1) ++ ret <4 x float> %retval ++} ++ ++define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) { ++; CHECK-LABEL: interleave2_v8f32: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s ++; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++ %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1) ++ ret <8 x float> %retval ++} ++ ++define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) { ++; CHECK-LABEL: interleave2_v4f64: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d ++; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++ %retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1) ++ ret <4 x double> %retval ++} ++ ++; Integers ++ ++define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) { ++; CHECK-LABEL: interleave2_v32i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.16b, v0.16b, v1.16b ++; CHECK-NEXT: zip2 v1.16b, v0.16b, v1.16b ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++ %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1) ++ ret <32 x i8> %retval ++} ++ ++define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) { ++; CHECK-LABEL: interleave2_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h ++; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++ %retval = call <16 x i16> 
@llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
++  ret <16 x i16> %retval
++}
++
++define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
++; CHECK-LABEL: interleave2_v8i32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
++; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
++; CHECK-NEXT:    mov v0.16b, v2.16b
++; CHECK-NEXT:    ret
++  %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
++  ret <8 x i32> %retval
++}
++
++define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
++; CHECK-LABEL: interleave2_v4i64:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 v2.2d, v0.2d, v1.2d
++; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
++; CHECK-NEXT:    mov v0.16b, v2.16b
++; CHECK-NEXT:    ret
++  %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
++  ret <4 x i64> %retval
++}
++
++
++; Float declarations
++declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
++declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
++declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
++declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
++declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
++declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
++
++; Integer declarations
++declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
++declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
++declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
++declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
++
+diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+new file mode 100644
+index 000000000000..41164f922324
+--- /dev/null
++++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+@@ -0,0 +1,186 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s
++
++define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uunpkhi z1.d, z0.s
++; CHECK-NEXT:    uunpklo z2.d, z0.s
++; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
++; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
++ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
++}
++
++define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uunpkhi z1.s, z0.h
++; CHECK-NEXT:    uunpklo z2.s, z0.h
++; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
++; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
++ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
++}
++
++define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uunpkhi z1.d, z0.s
++; CHECK-NEXT:    uunpklo z2.d, z0.s
++; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
++; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
++ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
++}
++
++define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv16f16(<vscale x 16 x half> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
++; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
++ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
++}
++
++define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv8f32(<vscale x 8 x float> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
++; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
++ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
++}
++
++define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
++; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
++ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
++}
++
++; Integers
++
++define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.b, z0.b, z1.b
++; CHECK-NEXT:    uzp2 z1.b, z0.b, z1.b
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
++ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
++}
++
++define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
++; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
++ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
++}
++
++define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv8i32(<vscale x 8 x i32> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
++; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
++ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
++}
++
++define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
++; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
++ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
++}
++
++; Predicated
++define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    uzp1 p2.b, p0.b, p1.b
++; CHECK-NEXT:    uzp2 p1.b, p0.b, p1.b
++; CHECK-NEXT:    mov p0.b, p2.b
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
++ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
++}
++
++define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    punpkhi p1.h, p0.b
++; CHECK-NEXT:    punpklo p2.h, p0.b
++; CHECK-NEXT:    uzp1 p0.h, p2.h, p1.h
++; CHECK-NEXT:    uzp2 p1.h, p2.h, p1.h
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
++ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
++}
++
++define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    punpkhi p1.h, p0.b
++; CHECK-NEXT:    punpklo p2.h, p0.b
++; CHECK-NEXT:    uzp1 p0.s, p2.s, p1.s
++; CHECK-NEXT:    uzp2 p1.s, p2.s, p1.s
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
++ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
++}
++
++define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
++; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    punpkhi p1.h, p0.b
++; CHECK-NEXT:    punpklo p2.h, p0.b
++; CHECK-NEXT:    uzp1 p0.d, p2.d, p1.d
++; CHECK-NEXT:    uzp2 p1.d, p2.d, p1.d
++; CHECK-NEXT:    ret
++%retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
++ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval
++}
++
++
++; Floating declarations
++declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
++declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
++declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
++declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
++declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
++declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
++
++; Integer declarations
++declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
++declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
++declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
++declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
++
++; Predicated declarations
++declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
++declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
++declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
++declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
+diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+new file mode 100644
+index 000000000000..931f165c541f
+--- /dev/null
++++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+@@ -0,0 +1,181 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
++
++define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) {
++; CHECK-LABEL: interleave2_nxv4f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
++; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
++; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
++  ret <vscale x 4 x half> %retval
++}
++
++define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) {
++; CHECK-LABEL: interleave2_nxv8f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
++; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
++; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
++  ret <vscale x 8 x half> %retval
++}
++
++define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) {
++; CHECK-LABEL: interleave2_nxv16f16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
++; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
++  ret <vscale x 16 x half> %retval
++}
++
++define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) {
++; CHECK-LABEL: interleave2_nxv4f32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
++; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
++; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
++  ret <vscale x 4 x float> %retval
++}
++
++define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
++; CHECK-LABEL: interleave2_nxv8f32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
++; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
++  ret <vscale x 8 x float> %retval
++}
++
++define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1) {
++; CHECK-LABEL: interleave2_nxv4f64:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
++; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
++  ret <vscale x 4 x double> %retval
++}
++
++; Integers
++
++define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) {
++; CHECK-LABEL: interleave2_nxv32i8:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.b, z0.b, z1.b
++; CHECK-NEXT:    zip2 z1.b, z0.b, z1.b
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
++  ret <vscale x 32 x i8> %retval
++}
++
++define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) {
++; CHECK-LABEL: interleave2_nxv16i16:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
++; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
++  ret <vscale x 16 x i16> %retval
++}
++
++define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) {
++; CHECK-LABEL: interleave2_nxv8i32:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
++; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
++  ret <vscale x 8 x i32> %retval
++}
++
++define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) {
++; CHECK-LABEL: interleave2_nxv4i64:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
++; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
++; CHECK-NEXT:    mov z0.d, z2.d
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
++  ret <vscale x 4 x i64> %retval
++}
++
++; Predicated
++
++define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) {
++; CHECK-LABEL: interleave2_nxv32i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip1 p2.b, p0.b, p1.b
++; CHECK-NEXT:    zip2 p1.b, p0.b, p1.b
++; CHECK-NEXT:    mov p0.b, p2.b
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
++  ret <vscale x 32 x i1> %retval
++}
++
++define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1) {
++; CHECK-LABEL: interleave2_nxv16i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
++; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
++; CHECK-NEXT:    uzp1 p0.b, p0.b, p2.b
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
++  ret <vscale x 16 x i1> %retval
++}
++
++define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1) {
++; CHECK-LABEL: interleave2_nxv8i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
++; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
++; CHECK-NEXT:    uzp1 p0.h, p0.h, p2.h
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
++  ret <vscale x 8 x i1> %retval
++}
++
++define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1) {
++; CHECK-LABEL: interleave2_nxv4i1:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    zip2 p2.d, p0.d, p1.d
++; CHECK-NEXT:    zip1 p0.d, p0.d, p1.d
++; CHECK-NEXT:    uzp1 p0.s, p0.s, p2.s
++; CHECK-NEXT:    ret
++  %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
++  ret <vscale x 4 x i1> %retval
++}
++
++
++; Float declarations
++declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
++declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
++declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
++declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
++declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
++declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
@llvm.experimental.vector.interleave2.nxv4f64(, ) ++ ++; Integer declarations ++declare @llvm.experimental.vector.interleave2.nxv32i8(, ) ++declare @llvm.experimental.vector.interleave2.nxv16i16(, ) ++declare @llvm.experimental.vector.interleave2.nxv8i32(, ) ++declare @llvm.experimental.vector.interleave2.nxv4i64(, ) ++ ++; Predicated ++declare @llvm.experimental.vector.interleave2.nxv32i1(, ) ++declare @llvm.experimental.vector.interleave2.nxv16i1(, ) ++declare @llvm.experimental.vector.interleave2.nxv8i1(, ) ++declare @llvm.experimental.vector.interleave2.nxv4i1(, ) +-- +2.25.1 + diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -17662,7 +17662,7 @@ The '``llvm.experimental.vector.reverse.*``' intrinsics reverse a vector. The intrinsic takes a single vector and returns a vector of matching type but with the original lane order reversed. These intrinsics work for both fixed -and scalable vectors. While this intrinsic is marked as experimental the +and scalable vectors. While this intrinsic supports all vector types the recommended way to express reverse operations for fixed-width vectors is still to use a shufflevector, as that may allow for more optimization opportunities. @@ -17671,6 +17671,75 @@ The argument to this intrinsic must be a vector. +'``llvm.experimental.vector.deinterleave2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1) + declare {, } @llvm.experimental.vector.deinterleave2.nxv8i32( %vec1) + +Overview: +""""""""" + +The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two +vectors by deinterleaving the even and odd lanes of the input vector. + +This intrinsic works for both fixed and scalable vectors. While this intrinsic +supports all vector types the recommended way to express this operation for +fixed-width vectors is still to use a shufflevector, as that may allow for more +optimization opportunities. + +For example: + +.. code-block:: text + + {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> ); ==> {<2 x i64> , <2 x i64> } + +Arguments: +"""""""""" + +Both arguments must be vectors of the same type whereby their logical +concatenation matches the result type. + +'``llvm.experimental.vector.interleave2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2) + declare @llvm.experimental.vector.interleave2.nxv8i32( %vec1, %vec2) + +Overview: +""""""""" + +The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector +by interleaving two input vectors. + +This intrinsic works for both fixed and scalable vectors. While this intrinsic +is marked as experimental, the recommended way to express this operation for +fixed-width vectors is still to use a shufflevector, as that may allow for more +optimization opportunities. + +For example: + +.. code-block:: text + + <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> , <2 x i64> ); ==> <4 x i64> + +Arguments: +"""""""""" +The argument to this intrinsic must be two vectors where each vector must be +half the size of the output. 
+ '``llvm.experimental.vector.splice``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -571,6 +571,19 @@ /// vector, but not the other way around. EXTRACT_SUBVECTOR, + /// VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and + /// output vectors having the same type. The first output contains the even + /// indices from CONCAT_VECTORS(VEC1, VEC2), with the second output + /// containing the odd indices. The relative order of elements within an + /// output match that of the concatenated input. + VECTOR_DEINTERLEAVE, + + /// VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and + /// output vectors having the same type. The first output contains the + /// result of interleaving the low half of CONCAT_VECTORS(VEC1, VEC2), with + /// the second output containing the result of interleaving the high half. + VECTOR_INTERLEAVE, + /// VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, /// whose elements are shuffled using the following algorithm: /// RESULT[i] = VECTOR[VECTOR.ElementCount - 1 - i] diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2116,6 +2116,17 @@ [llvm_anyvector_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, ImmArg>]>; + +def int_experimental_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, + LLVMHalfElementsVectorType<0>], + [IntrNoMem]>; + +def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>, + LLVMHalfElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -648,6 +648,8 @@ void visitVectorReduce(const CallInst &I, unsigned Intrinsic); void visitVectorReverse(const CallInst &I); void visitVectorSplice(const CallInst &I); + void visitVectorInterleave(const CallInst &I); + void visitVectorDeinterleave(const CallInst &I); void visitStepVector(const CallInst &I); void visitUserOp1(const Instruction &I) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" #include "llvm/CodeGen/CodeGenCommonISel.h" @@ -7319,6 +7320,12 @@ case Intrinsic::experimental_vector_splice: visitVectorSplice(I); return; + case Intrinsic::experimental_vector_interleave2: + visitVectorInterleave(I); + return; + case Intrinsic::experimental_vector_deinterleave2: + visitVectorDeinterleave(I); + return; } } @@ -11549,6 +11556,64 @@ setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask)); } +void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst 
&I) { + auto DL = getCurSDLoc(); + SDValue InVec = getValue(I.getOperand(0)); + EVT OutVT = + InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + + unsigned OutNumElts = OutVT.getVectorMinNumElements(); + + // ISD Node needs the input vectors split into two equal parts + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getConstant(0, DL, MVT::i64)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getConstant(OutNumElts, DL, MVT::i64)); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines. + if (OutVT.isFixedLengthVector()) { + SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(0, 2, OutNumElts)); + SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(1, 2, OutNumElts)); + SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc()); + setValue(&I, Res); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, + DAG.getVTList(OutVT, OutVT), Lo, Hi); + setValue(&I, Res); + return; +} + +void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) { + auto DL = getCurSDLoc(); + EVT InVT = getValue(I.getOperand(0)).getValueType(); + SDValue InVec0 = getValue(I.getOperand(0)); + SDValue InVec1 = getValue(I.getOperand(1)); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines. + if (OutVT.isFixedLengthVector()) { + unsigned NumElts = InVT.getVectorMinNumElements(); + SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1); + setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT), + createInterleaveMask(NumElts, 2))); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, + DAG.getVTList(InVT, InVT), InVec0, InVec1); + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0), + Res.getValue(1)); + setValue(&I, Res); + return; +} + void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) { SmallVector ValueVTs; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -293,6 +293,8 @@ case ISD::CONCAT_VECTORS: return "concat_vectors"; case ISD::INSERT_SUBVECTOR: return "insert_subvector"; case ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; + case ISD::VECTOR_DEINTERLEAVE: return "vector_deinterleave"; + case ISD::VECTOR_INTERLEAVE: return "vector_interleave"; case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; case ISD::VECTOR_SPLICE: return "vector_splice"; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1047,6 +1047,8 @@ SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV(SDValue Op, 
SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1208,6 +1208,8 @@ {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); } } @@ -1253,6 +1255,8 @@ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); @@ -1394,6 +1398,8 @@ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); @@ -6064,6 +6070,10 @@ return LowerCTTZ(Op, DAG); case ISD::VECTOR_SPLICE: return LowerVECTOR_SPLICE(Op, DAG); + case ISD::VECTOR_DEINTERLEAVE: + return LowerVECTOR_DEINTERLEAVE(Op, DAG); + case ISD::VECTOR_INTERLEAVE: + return LowerVECTOR_INTERLEAVE(Op, DAG); case ISD::STRICT_LROUND: case ISD::STRICT_LLROUND: case ISD::STRICT_LRINT: @@ -23556,6 +23566,34 @@ } } +SDValue +AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OpVT = Op.getValueType(); + assert(OpVT.isScalableVector() && + "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); + SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getMergeValues({Even, Odd}, DL); +} + +SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OpVT = Op.getValueType(); + assert(OpVT.isScalableVector() && + "Expected scalable vector in LowerVECTOR_INTERLEAVE."); + + SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getMergeValues({Lo, Hi}, DL); +} + SDValue AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: mov v2.16b, 
v0.16b +; CHECK-NEXT: mov v2.h[1], v1.h[0] +; CHECK-NEXT: mov v1.h[0], v0.h[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: uzp1 v2.4h, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v2.2s, v0.2s, v1.2s +; CHECK-NEXT: zip2 v1.2s, v0.2s, v1.2s +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +; Integers + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.16b, v0.16b, v1.16b +; CHECK-NEXT: uzp2 v1.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 
+; CHECK-LABEL: vector_deinterleave_v4i32_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+  ret {<4 x i32>, <4 x i32>} %retval
+}
+
+define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+  ret {<2 x i64>, <2 x i64>} %retval
+}
+
+
+; Floating declarations
+declare {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+
+; Integer declarations
+declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
+; CHECK-LABEL: interleave2_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
+  ret <4 x half> %retval
+}
+
+define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
+; CHECK-LABEL: interleave2_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
+  ret <8 x half> %retval
+}
+
+define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
+; CHECK-LABEL: interleave2_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
+  ret <16 x half> %retval
+}
+
+define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
+; CHECK-LABEL: interleave2_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    rev64 v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
+  ret <4 x float> %retval
+}
+
+define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) {
+; CHECK-LABEL: interleave2_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
+  ret <8 x float> %retval
+}
+
+define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) {
+; CHECK-LABEL: interleave2_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
+  ret <4 x double> %retval
+}
+
+; Integers
+
+define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) {
+; CHECK-LABEL: interleave2_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    zip2 v1.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
+  ret <32 x i8> %retval
+}
+
+define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) {
+; CHECK-LABEL: interleave2_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
+  ret <16 x i16> %retval
+}
+
+define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
+; CHECK-LABEL: interleave2_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
+  ret <8 x i32> %retval
+}
+
+define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
+; CHECK-LABEL: interleave2_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
+  ret <4 x i64> %retval
+}
+
+
+; Float declarations
+declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+
+; Integer declarations
+declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s
+
+define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.d, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
+; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
+; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.d, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
+; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv16f16(<vscale x 16 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv8f32(<vscale x 8 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+; Integers
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp2 z1.b, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv8i32(<vscale x 8 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+; Predicated
+define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 p2.b, p0.b, p1.b
+; CHECK-NEXT:    uzp2 p1.b, p0.b, p1.b
+; CHECK-NEXT:    mov p0.b, p2.b
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+  ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    uzp1 p0.h, p2.h, p1.h
+; CHECK-NEXT:    uzp2 p1.h, p2.h, p1.h
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
+  ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
+}
+
+define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    uzp1 p0.s, p2.s, p1.s
+; CHECK-NEXT:    uzp2 p1.s, p2.s, p1.s
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
+  ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
+}
+
+define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    uzp1 p0.d, p2.d, p1.d
+; CHECK-NEXT:    uzp2 p1.d, p2.d, p1.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
+  ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval
+}
+
+
+; Floating declarations
+declare {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+
+; Integer declarations
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+
+; Predicated declarations
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
+declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
+declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) {
+; CHECK-LABEL: interleave2_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
+  ret <vscale x 4 x half> %retval
+}
+
+define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) {
+; CHECK-LABEL: interleave2_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
+  ret <vscale x 8 x half> %retval
+}
+
+define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) {
+; CHECK-LABEL: interleave2_nxv16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
+; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
+  ret <vscale x 16 x half> %retval
+}
+
+define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) {
+; CHECK-LABEL: interleave2_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
+  ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
+; CHECK-LABEL: interleave2_nxv8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
+; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
+  ret <vscale x 8 x float> %retval
+}
+
+define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1) {
+; CHECK-LABEL: interleave2_nxv4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
+  ret <vscale x 4 x double> %retval
+}
+
+; Integers
+
+define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) {
+; CHECK-LABEL: interleave2_nxv32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.b, z0.b, z1.b
+; CHECK-NEXT:    zip2 z1.b, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
+  ret <vscale x 32 x i8> %retval
+}
+
+define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) {
+; CHECK-LABEL: interleave2_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
+; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
+  ret <vscale x 16 x i16> %retval
+}
+
+define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) {
+; CHECK-LABEL: interleave2_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
+; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
+  ret <vscale x 8 x i32> %retval
+}
+
+define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) {
+; CHECK-LABEL: interleave2_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
+  ret <vscale x 4 x i64> %retval
+}
+
+; Predicated
+
+define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) {
+; CHECK-LABEL: interleave2_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 p2.b, p0.b, p1.b
+; CHECK-NEXT:    zip2 p1.b, p0.b, p1.b
+; CHECK-NEXT:    mov p0.b, p2.b
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
+  ret <vscale x 32 x i1> %retval
+}
+
+define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1) {
+; CHECK-LABEL: interleave2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT:    uzp1 p0.b, p0.b, p2.b
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
+  ret <vscale x 16 x i1> %retval
+}
+
+define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1) {
+; CHECK-LABEL: interleave2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p2.h
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
+  ret <vscale x 8 x i1> %retval
+}
+
+define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1) {
+; CHECK-LABEL: interleave2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 p2.d, p0.d, p1.d
+; CHECK-NEXT:    zip1 p0.d, p0.d, p1.d
+; CHECK-NEXT:    uzp1 p0.s, p0.s, p2.s
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
+  ret <vscale x 4 x i1> %retval
+}
+
+
+; Float declarations
+declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+
+; Integer declarations
+declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+; Predicated
+declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)