diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -270,6 +270,8 @@
   VWSUB_W_VL,
   VWSUBU_W_VL,
+  // Narrowing logical shift right.
+  // Operands are (source, shift, passthru, mask, vl)
   VNSRL_VL,
   // Vector compare producing a mask. Fourth operand is input mask. Fifth
@@ -698,6 +700,8 @@
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -600,6 +600,10 @@
       setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT,
                           ISD::VP_TRUNCATE, ISD::VP_SETCC},
                          VT, Custom);
+
+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);

       setOperationPromotedToType(
@@ -691,6 +695,9 @@
                          VT, Expand);
      }

+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       // Splice
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
@@ -772,6 +779,9 @@
           {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR},
           VT, Custom);
+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom);
       setOperationAction(FloatingPointVPOps, VT, Custom);
@@ -4092,6 +4102,10 @@
     return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::VECTOR_DEINTERLEAVE:
+    return lowerVECTOR_DEINTERLEAVE(Op, DAG);
+  case ISD::VECTOR_INTERLEAVE:
+    return lowerVECTOR_INTERLEAVE(Op, DAG);
   case ISD::STEP_VECTOR:
     return lowerSTEP_VECTOR(Op, DAG);
   case ISD::VECTOR_REVERSE:
@@ -6478,6 +6492,167 @@
   return DAG.getBitcast(Op.getSimpleValueType(), Slidedown);
 }
+
+// Widen a vector's operands to i8, then truncate its results back to the
+// original type, typically i1. All operand and result types must be the same.
+static SDValue wideVectorOpToi8(SDValue N, SDLoc &DL, SelectionDAG &DAG) {
+  MVT VT = N.getSimpleValueType();
+  MVT WideVT = VT.changeVectorElementType(MVT::i8);
+  SmallVector<SDValue> WideOps;
+  for (SDValue Op : N.getNode()->ops()) {
+    assert(Op.getSimpleValueType() == VT &&
+           "Operands and result must be same type");
+    WideOps.push_back(DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op));
+  }
+
+  unsigned NumVals = N.getNode()->getNumValues();
+
+  SDVTList VTs = DAG.getVTList(SmallVector<EVT>(
+      NumVals, N.getValueType().changeVectorElementType(MVT::i8)));
+  SDValue WideN = DAG.getNode(N.getOpcode(), DL, VTs, WideOps);
+  SmallVector<SDValue> TruncVals;
+  for (unsigned I = 0; I < WideN.getNode()->getNumValues(); I++) {
+    TruncVals.push_back(
+        DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(WideN.getNode(), I)));
+  }
+
+  if (TruncVals.size() > 1)
+    return DAG.getMergeValues(TruncVals, DL);
+  return TruncVals.front();
+}
+
+SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  // 1-bit element vectors need to be widened to e8
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return wideVectorOpToi8(Op, DL, DAG);
+
+  // Reconstruct the concatenated array to deinterleave
+  MVT WideVT = MVT::getScalableVectorVT(VecVT.getVectorElementType(),
+                                        VecVT.getVectorMinNumElements() * 2);
+  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Op.getOperand(0),
+                             Op.getOperand(1));
+
+  auto [Mask, VL] = getDefaultScalableVLOps(WideVT, DL, DAG, Subtarget);
+
+  // If the element type is smaller than i64, then we can deinterleave
+  // through vnsrl.wi
+  if (VecVT.getScalarSizeInBits() < 64) {
+    // Bitcast the concatenated vector from <vscale x n x ty> ->
+    // <vscale x n/2 x ty*2>. This also casts FPs to ints.
+    MVT WideEltVT = MVT::getIntegerVT(WideVT.getScalarSizeInBits() * 2);
+    WideVT = MVT::getVectorVT(
+        WideEltVT, WideVT.getVectorElementCount().divideCoefficientBy(2));
+    Wide = DAG.getBitcast(WideVT, Wide);
+
+    MVT NarrowVT = VecVT.changeVectorElementTypeToInteger();
+    auto [Mask, VL] = getDefaultScalableVLOps(NarrowVT, DL, DAG, Subtarget);
+
+    SDValue Passthru = DAG.getUNDEF(VecVT);
+
+    SDValue Even = DAG.getNode(
+        RISCVISD::VNSRL_VL, DL, NarrowVT, Wide,
+        DAG.getSplatVector(NarrowVT, DL, DAG.getConstant(0, DL, XLenVT)),
+        Passthru, Mask, VL);
+    SDValue Odd = DAG.getNode(
+        RISCVISD::VNSRL_VL, DL, NarrowVT, Wide,
+        DAG.getSplatVector(
+            NarrowVT, DL,
+            DAG.getConstant(VecVT.getScalarSizeInBits(), DL, XLenVT)),
+        Passthru, Mask, VL);
+
+    // Bitcast the results back in case they were cast from an FP vector
+    return DAG.getMergeValues(
+        {DAG.getBitcast(VecVT, Even), DAG.getBitcast(VecVT, Odd)}, DL);
+  }
+
+  MVT IdxVT = WideVT.changeVectorElementTypeToInteger();
+  // Create a vector of even indices {0, 2, 4, ...}
+  SDValue EvenIdx =
+      DAG.getStepVector(DL, IdxVT, APInt(WideVT.getScalarSizeInBits(), 2));
+  // Create a vector of odd indices {1, 3, 5, ...}
+  SDValue OddIdx =
+      DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
+
+  // Gather the even and odd elements into two separate vectors
+  SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide,
+                                 EvenIdx, DAG.getUNDEF(WideVT), Mask, VL);
+  SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide,
+                                OddIdx, DAG.getUNDEF(WideVT), Mask, VL);
+
+  SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
+                             DAG.getConstant(0, DL, XLenVT));
+  SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
+                            DAG.getConstant(0, DL, XLenVT));
+
+  return DAG.getMergeValues({Even, Odd}, DL);
+}
+
+SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+
+  // i1 vectors need to be widened to i8
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return wideVectorOpToi8(Op, DL, DAG);
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  auto [_, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+
+  MVT ConcatVT =
+      MVT::getVectorVT(VecVT.getVectorElementType(),
+                       VecVT.getVectorElementCount().multiplyCoefficientBy(2));
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
+                               Op.getOperand(0), Op.getOperand(1));
+
+  MVT IdxVT = MVT::getVectorVT(MVT::i16, ConcatVT.getVectorElementCount());
+
+  SDValue StepVec = DAG.getStepVector(DL, IdxVT);
+
+  SDValue One = DAG.getConstant(1, DL, XLenVT);
+
+  // ... 0 1 0 1 0 1 0 1
+  SDValue OddMask = DAG.getNode(ISD::AND, DL, IdxVT, StepVec,
+                                DAG.getSplatVector(IdxVT, DL, One));
+  // Convert it to a mask vector type (nxmxi16 -> nxmxi1)
+  // vmsne.vi v0, oddmask, 0
+  OddMask = DAG.getSetCC(
+      DL, getMaskTypeFor(ConcatVT), OddMask,
+      DAG.getSplatVector(IdxVT, DL, DAG.getConstant(0, DL, XLenVT)),
+      ISD::CondCode::SETNE);
+
+  SDValue VLMAX =
+      DAG.getNode(ISD::VSCALE, DL, XLenVT,
+                  getVLOp(VecVT.getVectorMinNumElements(), DL, DAG, Subtarget));
+
+  // Build up the index vector for interleaving the concatenated array
+  // ... 3 3 2 2 1 1 0 0
+  SDValue Idx = DAG.getNode(ISD::SRL, DL, IdxVT, StepVec,
+                            DAG.getSplatVector(IdxVT, DL, One));
+  // ... n+3 3 n+2 2 n+1 1 n 0
+  Idx = DAG.getNode(RISCVISD::ADD_VL, DL, IdxVT, Idx,
+                    DAG.getSplatVector(IdxVT, DL, VLMAX), DAG.getUNDEF(IdxVT),
+                    OddMask, VL);
+
+  // Perform the interleaving
+  SDValue Interleaved =
+      DAG.getNode(RISCVISD::VRGATHEREI16_VV_VL, DL, ConcatVT, Concat, Idx,
+                  DAG.getUNDEF(ConcatVT), OddMask, VL);
+
+  // Extract the two halves from the interleaved result
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
+                           DAG.getVectorIdxConstant(0, DL));
+  SDValue Hi = DAG.getNode(
+      ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
+      DAG.getVectorIdxConstant(VecVT.getVectorMinNumElements(), DL));
+
+  return DAG.getMergeValues({Lo, Hi}, DL);
+}
+
 // Lower step_vector to the vid instruction. Any non-identity step value must
 // be accounted for by manual expansion.
SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -0,0 +1,166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { +%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec) +ret {<16 x i1>, <16 x i1>} %retval +} + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} 
@vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare 
{<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {, } @vector_deinterleave_nxv16i1_nxv32i1( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v12, 8 +; CHECK-NEXT: vand.vi v10, v8, 1 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv16i8_nxv32i8( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i8( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8i16_nxv16i16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16i16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4i32_nxvv8i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8i32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2i64_nxv4i64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4i64( %vec) +ret {, } %retval +} + +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i1() +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16i16() 
+declare {, } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4i64() + +; Floats + +define {, } @vector_deinterleave_nxv2f16_nxv4f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f16_nxv8f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f32_nxv4f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8f16_nxv16f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f32_nxv8f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f64_nxv4f64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f64( %vec) +ret {, } %retval +} + +declare {,} @llvm.experimental.vector.deinterleave2.nxv4f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f64() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { + %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b) + ret <32 x i1> %res +} + +define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vector_interleave_v16i16_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) + ret <16 x i16> %res +} + +define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vector_interleave_v8i32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI2_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) + ret <8 x i32> %res +} + +define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { + %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b) + ret <4 x i64> %res +} + +declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + +; Floats + +define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: vector_interleave_v4f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) + ret <4 x half> %res +} + +define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: vector_interleave_v8f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI5_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + ret <8 x half> %res +} + +define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: vector_interleave_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI6_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + ret <4 x float> %res +} + +define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: vector_interleave_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + ret <16 x half> %res +} + +define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: vector_interleave_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + ret <8 x float> %res +} + +define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> %b) { + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + ret <4 x double> %res +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> 
@llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define @vector_interleave_nxv32i1_nxv16i1( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vand.vi v24, v16, 1 +; CHECK-NEXT: vmsne.vi v10, v24, 0 +; CHECK-NEXT: vsrl.vi v16, v16, 1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vadd.vx v16, v16, a1, v0.t +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v26, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v24, v12, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vand.vi v12, v10, 1 +; CHECK-NEXT: vmsne.vi v14, v12, 0 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v14, a0 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) + ret %res +} + +define @vector_interleave_nxv16i16_nxv8i16( %a, %b) { + %res = call @llvm.experimental.vector.interleave2.nxv16i16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8i32_nxv4i32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v14, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v14, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8i32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4i64_nxv2i64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, 
v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4i64( %a, %b) + ret %res +} + +declare @llvm.experimental.vector.interleave2.nxv32i1(, ) +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +; Floats + +define @vector_interleave_nxv4f16_nxv2f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vand.vi v11, v10, 1 +; CHECK-NEXT: vmsne.vi v0, v11, 0 +; CHECK-NEXT: vsrl.vi v10, v10, 1 +; CHECK-NEXT: vadd.vx v10, v10, a0, v0.t +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v9, v8, v10, v0.t +; CHECK-NEXT: vslidedown.vx v8, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f16_nxv4f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8m2 def $v8m2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vand.vi v12, v10, 1 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vsrl.vi v10, v10, 1 +; CHECK-NEXT: vadd.vx v12, v10, a0, v0.t +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m2 def $v8m2 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f32_nxv2f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8m2 def $v8m2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vand.vi v11, v10, 1 +; CHECK-NEXT: vmsne.vi v0, v11, 0 +; CHECK-NEXT: vsrl.vi v10, v10, 1 +; CHECK-NEXT: vadd.vx v12, v10, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m2 def $v8m2 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { + %res = call @llvm.experimental.vector.interleave2.nxv16f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v14, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v14, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; 
CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f64( %a, %b) + ret %res +} + + +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) +declare @llvm.experimental.vector.interleave2.nxv4f64(, )