diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -705,6 +705,8 @@
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -616,6 +616,10 @@
       setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT,
                           ISD::VP_TRUNCATE, ISD::VP_SETCC},
                          VT, Custom);
+
+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);

       setOperationPromotedToType(
@@ -707,6 +711,9 @@
                          VT, Expand);
     }

+    setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+    setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
     // Splice
     setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);

@@ -788,6 +795,9 @@
           {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR},
           VT, Custom);

+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom);

       setOperationAction(FloatingPointVPOps, VT, Custom);
@@ -4166,6 +4176,10 @@
     return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::VECTOR_DEINTERLEAVE:
+    return lowerVECTOR_DEINTERLEAVE(Op, DAG);
+  case ISD::VECTOR_INTERLEAVE:
+    return lowerVECTOR_INTERLEAVE(Op, DAG);
   case ISD::STEP_VECTOR:
     return lowerSTEP_VECTOR(Op, DAG);
   case ISD::VECTOR_REVERSE:
@@ -6552,6 +6566,182 @@
   return DAG.getBitcast(Op.getSimpleValueType(), Slidedown);
 }

+// Widen a vector's operands to i8, then truncate its results back to the
+// original type, typically i1. All operand and result types must be the same.
+static SDValue widenVectorOpsToi8(SDValue N, SDLoc &DL, SelectionDAG &DAG) {
+  MVT VT = N.getSimpleValueType();
+  MVT WideVT = VT.changeVectorElementType(MVT::i8);
+  SmallVector<SDValue, 4> WideOps;
+  for (SDValue Op : N->ops()) {
+    assert(Op.getSimpleValueType() == VT &&
+           "Operands and result must be same type");
+    WideOps.push_back(DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op));
+  }
+
+  unsigned NumVals = N->getNumValues();
+
+  SDVTList VTs = DAG.getVTList(SmallVector<EVT>(
+      NumVals, N.getValueType().changeVectorElementType(MVT::i8)));
+  SDValue WideN = DAG.getNode(N.getOpcode(), DL, VTs, WideOps);
+  SmallVector<SDValue, 4> TruncVals;
+  for (unsigned I = 0; I < NumVals; I++) {
+    TruncVals.push_back(DAG.getNode(ISD::TRUNCATE, DL,
+                                    N->getSimpleValueType(I),
+                                    SDValue(WideN.getNode(), I)));
+  }
+
+  if (TruncVals.size() > 1)
+    return DAG.getMergeValues(TruncVals, DL);
+  return TruncVals.front();
+}
+
+SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  assert(VecVT.isScalableVector() &&
+         "vector_deinterleave on non-scalable vector!");
+
+  // i1 vectors need to be widened to i8.
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return widenVectorOpsToi8(Op, DL, DAG);
+
+  // Concatenate the two operands into one vector to deinterleave.
+  MVT ConcatVT =
+      MVT::getVectorVT(VecVT.getVectorElementType(),
+                       VecVT.getVectorElementCount().multiplyCoefficientBy(2));
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
+                               Op.getOperand(0), Op.getOperand(1));
+
+  // We want to operate on all lanes, so get the mask and VL for the
+  // concatenated vector.
+  auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget);
+  SDValue Passthru = DAG.getUNDEF(ConcatVT);
+
+  // If the element type is smaller than ELEN, then we can deinterleave
+  // through vnsrl.wi.
+  if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
+    // Bitcast the concatenated vector from <2*n x ty> to <n x i(2*size(ty))>,
+    // so each pair of adjacent elements becomes one wider element. This also
+    // casts FPs to ints.
+    MVT WideVT = MVT::getVectorVT(
+        MVT::getIntegerVT(ConcatVT.getScalarSizeInBits() * 2),
+        ConcatVT.getVectorElementCount().divideCoefficientBy(2));
+    SDValue Wide = DAG.getBitcast(WideVT, Concat);
+
+    MVT NarrowVT = VecVT.changeVectorElementTypeToInteger();
+    SDValue Passthru = DAG.getUNDEF(VecVT);
+
+    // The even elements are the low halves of the wide elements (shift by 0),
+    // the odd elements are the high halves (shift by the element width).
+    SDValue Even = DAG.getNode(
+        RISCVISD::VNSRL_VL, DL, NarrowVT, Wide,
+        DAG.getSplatVector(NarrowVT, DL, DAG.getConstant(0, DL, XLenVT)),
+        Passthru, Mask, VL);
+    SDValue Odd = DAG.getNode(
+        RISCVISD::VNSRL_VL, DL, NarrowVT, Wide,
+        DAG.getSplatVector(
+            NarrowVT, DL,
+            DAG.getConstant(VecVT.getScalarSizeInBits(), DL, XLenVT)),
+        Passthru, Mask, VL);
+
+    // Bitcast the results back in case they were cast from an FP vector.
+    return DAG.getMergeValues(
+        {DAG.getBitcast(VecVT, Even), DAG.getBitcast(VecVT, Odd)}, DL);
+  }
+
+  // Otherwise the elements are already ELEN wide, so gather the even and odd
+  // elements directly. For the indices, use the same SEW to avoid an extra
+  // vsetvli.
+  MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
+  // Create a vector of even indices {0, 2, 4, ...}.
+  SDValue EvenIdx =
+      DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2));
+  // Create a vector of odd indices {1, 3, 5, ...}.
+  SDValue OddIdx =
+      DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
+
+  // Gather the even and odd elements into two separate vectors.
+  SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
+                                 Concat, EvenIdx, Passthru, Mask, VL);
+  SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
+                                Concat, OddIdx, Passthru, Mask, VL);
+
+  // The gathered elements land in the low half of each wide result, so
+  // extract that half for the even and odd vectors.
+  SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
+                             DAG.getConstant(0, DL, XLenVT));
+  SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
+                            DAG.getConstant(0, DL, XLenVT));
+
+  return DAG.getMergeValues({Even, Odd}, DL);
+}
+
+SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+
+  assert(VecVT.isScalableVector() &&
+         "vector_interleave on non-scalable vector!");
+
+  // i1 vectors need to be widened to i8.
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return widenVectorOpsToi8(Op, DL, DAG);
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  SDValue VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+  SDValue Interleaved;
+
+  // If the element type is smaller than ELEN, then we can interleave with
+  // vwaddu.vv and vwmaccu.vx.
+  if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
+    Interleaved = getWideningInterleave(Op.getOperand(0), Op.getOperand(1), DL,
+                                        DAG, Subtarget);
+  } else {
+    // Otherwise, fall back to using vrgatherei16.vv.
+    MVT ConcatVT =
+        MVT::getVectorVT(VecVT.getVectorElementType(),
+                         VecVT.getVectorElementCount().multiplyCoefficientBy(2));
+    SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
+                                 Op.getOperand(0), Op.getOperand(1));
+
+    MVT IdxVT = ConcatVT.changeVectorElementType(MVT::i16);
+
+    // 0 1 2 3 4 5 6 7 ...
+    SDValue StepVec = DAG.getStepVector(DL, IdxVT);
+
+    // 1 1 1 1 1 1 1 1 ...
+    SDValue Ones =
+        DAG.getSplatVector(IdxVT, DL, DAG.getConstant(1, DL, XLenVT));
+
+    // 1 0 1 0 1 0 1 0 ...
+    SDValue OddMask = DAG.getNode(ISD::AND, DL, IdxVT, StepVec, Ones);
+    OddMask = DAG.getSetCC(
+        DL, IdxVT.changeVectorElementType(MVT::i1), OddMask,
+        DAG.getSplatVector(IdxVT, DL, DAG.getConstant(0, DL, XLenVT)),
+        ISD::CondCode::SETNE);
+
+    SDValue VLMax =
+        DAG.getSplatVector(IdxVT, DL, computeVLMax(VecVT, DL, DAG));
+
+    // Build up the index vector for interleaving the concatenated vector:
+    // 0 0 1 1 2 2 3 3 ...
+    SDValue Idx = DAG.getNode(ISD::SRL, DL, IdxVT, StepVec, Ones);
+    // 0 n 1 n+1 2 n+2 3 n+3 ...
+    Idx =
+        DAG.getNode(RISCVISD::ADD_VL, DL, IdxVT, Idx, VLMax, Idx, OddMask, VL);
+
+    // Then perform the interleave:
+    // v[0] v[n] v[1] v[n+1] v[2] v[n+2] v[3] v[n+3] ...
+    Interleaved = DAG.getNode(RISCVISD::VRGATHEREI16_VV_VL, DL, ConcatVT,
+                              Concat, Idx, DAG.getUNDEF(ConcatVT), OddMask, VL);
+  }
+
+  // Extract the two halves of the interleaved result.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
+                           DAG.getVectorIdxConstant(0, DL));
+  SDValue Hi = DAG.getNode(
+      ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
+      DAG.getVectorIdxConstant(VecVT.getVectorMinNumElements(), DL));
+
+  return DAG.getMergeValues({Lo, Hi}, DL);
+}
+
 // Lower step_vector to the vid instruction. Any non-identity step value must
 // be accounted for by manual expansion.
SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -0,0 +1,405 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV64 %s + +; Integers + +define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { +; RV32-LABEL: vector_deinterleave_v16i1_v32i1: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sb a0, 16(sp) +; RV32-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV32-NEXT: vmv.x.s a0, v0 +; RV32-NEXT: slli a1, a0, 17 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 23(sp) +; RV32-NEXT: slli a1, a0, 19 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 22(sp) +; RV32-NEXT: slli a1, a0, 21 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 21(sp) +; RV32-NEXT: slli a1, a0, 23 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 20(sp) +; RV32-NEXT: slli a1, a0, 25 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 19(sp) +; RV32-NEXT: slli a1, a0, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 18(sp) +; RV32-NEXT: slli a1, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 17(sp) +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a1, v8 +; RV32-NEXT: seqz a1, a1 +; RV32-NEXT: sb a1, 24(sp) +; RV32-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: slli a2, a1, 17 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 31(sp) +; RV32-NEXT: slli a2, a1, 19 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 30(sp) +; RV32-NEXT: slli a2, a1, 21 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 29(sp) +; RV32-NEXT: slli a2, a1, 23 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 28(sp) +; RV32-NEXT: slli a2, a1, 25 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 27(sp) +; RV32-NEXT: slli a2, a1, 27 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 26(sp) +; RV32-NEXT: slli a2, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 25(sp) +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 7(sp) +; RV32-NEXT: slli a2, a0, 18 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 6(sp) +; RV32-NEXT: slli a2, a0, 20 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 5(sp) +; RV32-NEXT: slli a2, a0, 22 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 4(sp) +; RV32-NEXT: slli a2, a0, 24 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 3(sp) +; RV32-NEXT: slli a2, a0, 26 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 2(sp) +; RV32-NEXT: slli a2, a0, 28 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 1(sp) +; RV32-NEXT: slli a0, a0, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 0(sp) +; RV32-NEXT: slli a0, a1, 16 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: slli a0, a1, 18 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: slli a0, a1, 20 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: slli 
a0, a1, 22 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: slli a0, a1, 24 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: slli a0, a1, 26 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: slli a1, a1, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 8(sp) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vle8.v v8, (a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vand.vi v8, v8, 1 +; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: vand.vi v8, v9, 1 +; RV32-NEXT: vmsne.vi v8, v8, 0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_v16i1_v32i1: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sb a0, 16(sp) +; RV64-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV64-NEXT: vmv.x.s a0, v0 +; RV64-NEXT: slli a1, a0, 49 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 23(sp) +; RV64-NEXT: slli a1, a0, 51 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 22(sp) +; RV64-NEXT: slli a1, a0, 53 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 21(sp) +; RV64-NEXT: slli a1, a0, 55 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 20(sp) +; RV64-NEXT: slli a1, a0, 57 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 19(sp) +; RV64-NEXT: slli a1, a0, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 18(sp) +; RV64-NEXT: slli a1, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 17(sp) +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v8, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a1, v8 +; RV64-NEXT: seqz a1, a1 +; RV64-NEXT: sb a1, 24(sp) +; RV64-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: slli a2, a1, 49 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 31(sp) +; RV64-NEXT: slli a2, a1, 51 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 30(sp) +; RV64-NEXT: slli a2, a1, 53 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 29(sp) +; RV64-NEXT: slli a2, a1, 55 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 28(sp) +; RV64-NEXT: slli a2, a1, 57 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 27(sp) +; RV64-NEXT: slli a2, a1, 59 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 26(sp) +; RV64-NEXT: slli a2, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 25(sp) +; RV64-NEXT: slli a2, a0, 48 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 7(sp) +; RV64-NEXT: slli a2, a0, 50 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 6(sp) +; RV64-NEXT: slli a2, a0, 52 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 5(sp) +; RV64-NEXT: slli a2, a0, 54 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 4(sp) +; RV64-NEXT: slli a2, a0, 56 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 3(sp) +; RV64-NEXT: slli a2, a0, 58 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 2(sp) +; RV64-NEXT: slli a2, a0, 60 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 1(sp) +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 0(sp) +; RV64-NEXT: slli a0, a1, 48 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 15(sp) +; RV64-NEXT: slli a0, a1, 50 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 14(sp) +; RV64-NEXT: slli a0, a1, 52 +; RV64-NEXT: srli 
a0, a0, 63 +; RV64-NEXT: sb a0, 13(sp) +; RV64-NEXT: slli a0, a1, 54 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 12(sp) +; RV64-NEXT: slli a0, a1, 56 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 11(sp) +; RV64-NEXT: slli a0, a1, 58 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 10(sp) +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 9(sp) +; RV64-NEXT: slli a1, a1, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 8(sp) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vle8.v v8, (a0) +; RV64-NEXT: mv a0, sp +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vand.vi v8, v8, 1 +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: vand.vi v8, v9, 1 +; RV64-NEXT: vmsne.vi v8, v8, 0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret +%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec) +ret {<16 x i1>, <16 x i1>} %retval +} + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x 
i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} 
@llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {, } @vector_deinterleave_nxv16i1_nxv32i1( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v12, 8 +; CHECK-NEXT: vand.vi v10, v8, 1 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv16i8_nxv32i8( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i8( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8i16_nxv16i16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16i16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4i32_nxvv8i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8i32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2i64_nxv4i64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4i64( %vec) +ret {, } %retval +} + +declare {, } 
@llvm.experimental.vector.deinterleave2.nxv32i1() +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4i64() + +; Floats + +define {, } @vector_deinterleave_nxv2f16_nxv4f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f16_nxv8f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f32_nxv4f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8f16_nxv16f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f32_nxv8f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f64_nxv4f64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f64( %vec) +ret {, } %retval +} + +declare {,} @llvm.experimental.vector.deinterleave2.nxv4f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f64() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll new 
file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -0,0 +1,437 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck -check-prefixes=CHECK,RV64 %s + +; Integers + +define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { +; RV32-LABEL: vector_interleave_v32i1_v16i1: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: .cfi_def_cfa_offset 64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -32 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sb a0, 1(sp) +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sb a0, 0(sp) +; RV32-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: slli a1, a0, 16 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 31(sp) +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: slli a2, a1, 16 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 30(sp) +; RV32-NEXT: slli a2, a0, 17 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 29(sp) +; RV32-NEXT: slli a2, a1, 17 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 28(sp) +; RV32-NEXT: slli a2, a0, 18 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 27(sp) +; RV32-NEXT: slli a2, a1, 18 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 26(sp) +; RV32-NEXT: slli a2, a0, 19 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 25(sp) +; RV32-NEXT: slli a2, a1, 19 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 24(sp) +; RV32-NEXT: slli a2, a0, 20 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 23(sp) +; RV32-NEXT: slli a2, a1, 20 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 22(sp) +; RV32-NEXT: slli a2, a0, 21 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 21(sp) +; RV32-NEXT: slli a2, a1, 21 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 20(sp) +; RV32-NEXT: slli a2, a0, 22 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 19(sp) +; RV32-NEXT: slli a2, a1, 22 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 18(sp) +; RV32-NEXT: slli a2, a0, 23 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 17(sp) +; RV32-NEXT: slli a2, a1, 23 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 16(sp) +; RV32-NEXT: slli a2, a0, 24 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 15(sp) +; RV32-NEXT: slli a2, a1, 24 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 14(sp) +; RV32-NEXT: slli a2, a0, 25 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 13(sp) +; RV32-NEXT: slli a2, a1, 25 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 12(sp) +; RV32-NEXT: slli a2, a0, 26 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 11(sp) +; RV32-NEXT: slli a2, a1, 26 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 10(sp) +; RV32-NEXT: slli a2, a0, 27 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 9(sp) +; RV32-NEXT: slli a2, a1, 27 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 8(sp) +; RV32-NEXT: slli a2, a0, 28 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 7(sp) +; RV32-NEXT: slli a2, a1, 28 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 6(sp) +; RV32-NEXT: slli a2, a0, 29 +; RV32-NEXT: srli a2, 
a2, 31 +; RV32-NEXT: sb a2, 5(sp) +; RV32-NEXT: slli a2, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 4(sp) +; RV32-NEXT: slli a0, a0, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 3(sp) +; RV32-NEXT: slli a1, a1, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 2(sp) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vand.vi v8, v8, 1 +; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: addi sp, s0, -64 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_v32i1_v16i1: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: .cfi_def_cfa_offset 64 +; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -32 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sb a0, 1(sp) +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sb a0, 0(sp) +; RV64-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: slli a1, a0, 48 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 31(sp) +; RV64-NEXT: vmv.x.s a1, v0 +; RV64-NEXT: slli a2, a1, 48 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 30(sp) +; RV64-NEXT: slli a2, a0, 49 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 29(sp) +; RV64-NEXT: slli a2, a1, 49 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 28(sp) +; RV64-NEXT: slli a2, a0, 50 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 27(sp) +; RV64-NEXT: slli a2, a1, 50 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 26(sp) +; RV64-NEXT: slli a2, a0, 51 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 25(sp) +; RV64-NEXT: slli a2, a1, 51 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 24(sp) +; RV64-NEXT: slli a2, a0, 52 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 23(sp) +; RV64-NEXT: slli a2, a1, 52 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 22(sp) +; RV64-NEXT: slli a2, a0, 53 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 21(sp) +; RV64-NEXT: slli a2, a1, 53 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 20(sp) +; RV64-NEXT: slli a2, a0, 54 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 19(sp) +; RV64-NEXT: slli a2, a1, 54 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 18(sp) +; RV64-NEXT: slli a2, a0, 55 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 17(sp) +; RV64-NEXT: slli a2, a1, 55 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 16(sp) +; RV64-NEXT: slli a2, a0, 56 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 15(sp) +; RV64-NEXT: slli a2, a1, 56 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 14(sp) +; RV64-NEXT: slli a2, a0, 57 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 13(sp) +; RV64-NEXT: slli a2, a1, 57 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 12(sp) +; RV64-NEXT: slli a2, a0, 58 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 11(sp) +; RV64-NEXT: slli a2, a1, 58 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 10(sp) +; RV64-NEXT: slli a2, a0, 59 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 9(sp) +; RV64-NEXT: slli a2, a1, 59 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 8(sp) +; RV64-NEXT: slli a2, a0, 60 +; RV64-NEXT: srli a2, a2, 63 
+; RV64-NEXT: sb a2, 7(sp) +; RV64-NEXT: slli a2, a1, 60 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 6(sp) +; RV64-NEXT: slli a2, a0, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 5(sp) +; RV64-NEXT: slli a2, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 4(sp) +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 3(sp) +; RV64-NEXT: slli a1, a1, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 2(sp) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vand.vi v8, v8, 1 +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: addi sp, s0, -64 +; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: ret + %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b) + ret <32 x i1> %res +} + +define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vector_interleave_v16i16_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) + ret <16 x i16> %res +} + +define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vector_interleave_v8i32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) + ret <8 x i32> %res +} + +define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { +; RV32-LABEL: vector_interleave_v4i64_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v10, v9 +; RV32-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; RV32-NEXT: vslideup.vi v12, v8, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, ma +; RV32-NEXT: vslideup.vi v12, v10, 2 +; RV32-NEXT: lui a0, %hi(.LCPI3_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI3_0) +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vle16.v v10, (a0) +; RV32-NEXT: vrgatherei16.vv v8, v12, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_v4i64_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v10, v9 +; RV64-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; RV64-NEXT: vslideup.vi v12, v8, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, ma +; RV64-NEXT: vslideup.vi v12, v10, 2 +; RV64-NEXT: lui a0, %hi(.LCPI3_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI3_0) +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vle64.v v10, (a0) +; RV64-NEXT: vrgather.vv v8, v12, v10 +; RV64-NEXT: ret + %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b) + ret <4 x i64> %res +} + +declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> 
@llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + +; Floats + +define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: vector_interleave_v4f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) + ret <4 x half> %res +} + +define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: vector_interleave_v8f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + ret <8 x half> %res +} + +define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: vector_interleave_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + ret <4 x float> %res +} + +define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: vector_interleave_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + ret <16 x half> %res +} + +define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: vector_interleave_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + ret <8 x float> %res +} + +define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> %b) { +; RV32-LABEL: vector_interleave_v4f64_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v10, v9 +; RV32-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; RV32-NEXT: vslideup.vi v12, v8, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, ma +; RV32-NEXT: vslideup.vi v12, v10, 2 +; RV32-NEXT: lui a0, %hi(.LCPI9_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI9_0) +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vle16.v v10, (a0) +; RV32-NEXT: vrgatherei16.vv v8, v12, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_v4f64_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v10, v9 +; RV64-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; RV64-NEXT: vslideup.vi 
v12, v8, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, ma +; RV64-NEXT: vslideup.vi v12, v10, 2 +; RV64-NEXT: lui a0, %hi(.LCPI9_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI9_0) +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vle64.v v10, (a0) +; RV64-NEXT: vrgather.vv v8, v12, v10 +; RV64-NEXT: ret + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + ret <4 x double> %res +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define @vector_interleave_nxv32i1_nxv16i1( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vwaddu.vv v16, v8, v12 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v16, a0, v12 +; CHECK-NEXT: vand.vi v8, v18, 1 +; CHECK-NEXT: vmsne.vi v10, v8, 0 +; CHECK-NEXT: vand.vi v8, v16, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v10, a0 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) + ret %res +} + +define @vector_interleave_nxv16i16_nxv8i16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16i16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8i32_nxv4i32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8i32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4i64_nxv2i64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 
1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v16, v12, 1 +; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4i64( %a, %b) + ret %res +} + +declare @llvm.experimental.vector.interleave2.nxv32i1(, ) +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +; Floats + +define @vector_interleave_nxv4f16_nxv2f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v10, a0 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f16_nxv4f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f32_nxv2f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v16, v12, 1 +; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed 
$v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f64( %a, %b) + ret %res +} + + +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) +declare @llvm.experimental.vector.interleave2.nxv4f64(, )
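
For reference, below is a minimal, illustrative sketch (not part of the patch) of how the two intrinsics lowered above are used from LLVM IR. The function and value names are invented for illustration; only the intrinsic declarations follow the tests above. With -mattr=+v the i32 element type is below ELEN, so the deinterleave should select the vnsrl.wi path and the interleave the vwaddu.vv/vwmaccu.vx path added in this change.

; Hypothetical usage example, not part of this patch.
define <vscale x 4 x i32> @deinterleave_add_reinterleave(<vscale x 4 x i32> %v) {
  ; Split %v into its even-indexed and odd-indexed lanes.
  %de = call {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %v)
  %even = extractvalue {<vscale x 2 x i32>, <vscale x 2 x i32>} %de, 0
  %odd = extractvalue {<vscale x 2 x i32>, <vscale x 2 x i32>} %de, 1
  ; Operate on each half independently.
  %sum = add <vscale x 2 x i32> %even, %odd
  ; Zip the halves back together: sum[0], odd[0], sum[1], odd[1], ...
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %sum, <vscale x 2 x i32> %odd)
  ret <vscale x 4 x i32> %res
}

declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)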