diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -270,6 +270,8 @@ VWSUB_W_VL, VWSUBU_W_VL, + // Narrowing logical shift right. + // Operands are (source, shift, passthru, mask, vl) VNSRL_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth @@ -698,6 +700,8 @@ SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -600,6 +600,10 @@ setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT, ISD::VP_TRUNCATE, ISD::VP_SETCC}, VT, Custom); + + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); setOperationPromotedToType( @@ -691,6 +695,9 @@ VT, Expand); } + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + // Splice setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); @@ -772,6 +779,9 @@ {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT, Custom); + setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); + setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); + setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); setOperationAction(FloatingPointVPOps, VT, Custom); @@ -4092,6 +4102,10 @@ return lowerINSERT_SUBVECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return lowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::VECTOR_DEINTERLEAVE: + return lowerVECTOR_DEINTERLEAVE(Op, DAG); + case ISD::VECTOR_INTERLEAVE: + return lowerVECTOR_INTERLEAVE(Op, DAG); case ISD::STEP_VECTOR: return lowerSTEP_VECTOR(Op, DAG); case ISD::VECTOR_REVERSE: @@ -6478,6 +6492,227 @@ return DAG.getBitcast(Op.getSimpleValueType(), Slidedown); } +// Convert a vector to a mask vector ( -> ), +// treating the integer values as booleans. +// This produces vmsne.vi v0, oddmask, 0 +static SDValue convertToMask(SDValue N, SDLoc &DL, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + MVT VT = N.getSimpleValueType(); + return DAG.getSetCC( + DL, getMaskTypeFor(VT), N, + DAG.getSplatVector(VT, DL, DAG.getConstant(0, DL, Subtarget.getXLenVT())), + ISD::CondCode::SETNE); +} + +// Widen a vector's operands to i8, then truncate its results back to the +// original type, typically i8. All operand and result types must be the same. +static SDValue wideVectorOpToi8(SDValue N, SDLoc &DL, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + MVT VT = N.getSimpleValueType(); + MVT WideVT = VT.changeVectorElementType(MVT::i8); + SmallVector WideOps; + for (SDValue Op : N.getNode()->ops()) { + assert(Op.getSimpleValueType() == VT && + "Operands and result must be same type"); + WideOps.push_back(DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op)); + } + + unsigned NumVals = N.getNode()->getNumValues(); + + SDVTList VTs = DAG.getVTList(SmallVector( + NumVals, N.getValueType().changeVectorElementType(MVT::i8))); + SDValue WideN = DAG.getNode(N.getOpcode(), DL, VTs, WideOps); + SmallVector TruncVals; + for (unsigned I = 0; I < WideN.getNode()->getNumValues(); I++) { + TruncVals.push_back( + convertToMask(SDValue(WideN.getNode(), I), DL, DAG, Subtarget)); + } + + if (TruncVals.size() > 1) + return DAG.getMergeValues(TruncVals, DL); + return TruncVals.front(); +} + +SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VecVT = Op.getSimpleValueType(); + + assert(VecVT.isScalableVector() && + "vector_interleave on non-scalable vector!"); + + // 1 bit element vectors need to be widened to e8 + if (VecVT.getVectorElementType() == MVT::i1) + return wideVectorOpToi8(Op, DL, DAG, Subtarget); + + // Concatenate the two vectors as one vector to deinterleave + MVT WideVT = MVT::getScalableVectorVT(VecVT.getVectorElementType(), + VecVT.getVectorMinNumElements() * 2); + SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Op.getOperand(0), + Op.getOperand(1)); + + // We want to operate on all the elements in the vector, so define + // the VL and mask to do so + MVT XLenVT = Subtarget.getXLenVT(); + SDValue VL = DAG.getRegister(RISCV::X0, XLenVT); + SDValue TrueMask = getAllOnesMask(WideVT, VL, DL, DAG); + + // If the element type is smaller than ELEN, then we can deinterleave + // through vnsrl.wi + if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) { + // Bitcast the concatenated vector from -> + // This is also casts FPs to ints + MVT WideEltVT = MVT::getIntegerVT(WideVT.getScalarSizeInBits() * 2); + WideVT = MVT::getVectorVT( + WideEltVT, WideVT.getVectorElementCount().divideCoefficientBy(2)); + Wide = DAG.getBitcast(WideVT, Wide); + + MVT NarrowVT = VecVT.changeVectorElementTypeToInteger(); + + SDValue Passthru = DAG.getUNDEF(VecVT); + + SDValue Even = DAG.getNode( + RISCVISD::VNSRL_VL, DL, NarrowVT, Wide, + DAG.getSplatVector(NarrowVT, DL, DAG.getConstant(0, DL, XLenVT)), + Passthru, TrueMask, VL); + SDValue Odd = DAG.getNode( + RISCVISD::VNSRL_VL, DL, NarrowVT, Wide, + DAG.getSplatVector( + NarrowVT, DL, + DAG.getConstant(VecVT.getScalarSizeInBits(), DL, XLenVT)), + Passthru, TrueMask, VL); + + // Bitcast the results back in case it was casted from an FP vector + return DAG.getMergeValues( + {DAG.getBitcast(VecVT, Even), DAG.getBitcast(VecVT, Odd)}, DL); + } + + MVT IdxVT = WideVT.changeVectorElementTypeToInteger(); + // Create a vector of even indices {0, 2, 4, ...} + SDValue EvenIdx = + DAG.getStepVector(DL, IdxVT, APInt(WideVT.getScalarSizeInBits(), 2)); + // Create a vector of odd indices {1, 3, 5, ... } + SDValue OddIdx = + DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT)); + + // Gather the even and odd elements into two separate vectors + SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide, + EvenIdx, DAG.getUNDEF(WideVT), TrueMask, VL); + SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide, + OddIdx, DAG.getUNDEF(WideVT), TrueMask, VL); + + // Extract the result half of the gather for even and odd + SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide, + DAG.getConstant(0, DL, XLenVT)); + SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide, + DAG.getConstant(0, DL, XLenVT)); + + return DAG.getMergeValues({Even, Odd}, DL); +} + +SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VecVT = Op.getSimpleValueType(); + + assert(VecVT.isScalableVector() && + "vector_interleave on non-scalable vector!"); + + // i1 vectors need to be widened to i8 + if (VecVT.getVectorElementType() == MVT::i1) + return wideVectorOpToi8(Op, DL, DAG, Subtarget); + + MVT XLenVT = Subtarget.getXLenVT(); + SDValue VL = DAG.getRegister(RISCV::X0, XLenVT); + + MVT ConcatVT = + MVT::getVectorVT(VecVT.getVectorElementType(), + VecVT.getVectorElementCount().multiplyCoefficientBy(2)); + + SDValue Interleaved; + + // If the element type is smaller than ELEN, then we can interleave with + // vwaddu.vv and vwmacc.vx + if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) { + // We're working with a vector of the same size as the resulting + // interleaved vector, but with half the number of elements and + // twice the SEW (Hence the restriction on not using the maximum + // ELEN) + MVT WideVT = + MVT::getVectorVT(MVT::getIntegerVT(VecVT.getScalarSizeInBits() * 2), + VecVT.getVectorElementCount()); + MVT IntVecVT = VecVT.changeTypeToInteger(); + + // Bitcast the input vectors in case they are FP + SDValue Evens = DAG.getBitcast(IntVecVT, Op.getOperand(0)); + SDValue Odds = DAG.getBitcast(IntVecVT, Op.getOperand(1)); + + SDValue Passthru = DAG.getUNDEF(WideVT); + SDValue TrueMask = getAllOnesMask(WideVT, VL, DL, DAG); + + // First add the odds and evens together + Interleaved = DAG.getNode(RISCVISD::VWADD_VL, DL, WideVT, Evens, Odds, + Passthru, TrueMask, VL); + // Then get the odds multiplied by 2^(VecVT.getScalarSizeInBits() - 1) + SDValue AllOnesVec = + DAG.getSplatVector(IntVecVT, DL, DAG.getAllOnesConstant(DL, XLenVT)); + SDValue OddsMul = DAG.getNode(RISCVISD::VWMUL_VL, DL, WideVT, Odds, + AllOnesVec, Passthru, TrueMask, VL); + + // Add the two together so we get + // (odds * 0xff...ff) + (odds + evens) + // = (odds * 0x100...00) + evens + // = (odds << VecVT.getScalarSizeInBits()) + evens + // Note use RISCV::ADD_VL here instead of ISD::ADD so that this + // and Op1Mul get selected as vwmacc.vx + Interleaved = DAG.getNode(RISCVISD::ADD_VL, DL, WideVT, Interleaved, + OddsMul, Passthru, TrueMask, VL); + // Bitcast it back to the final type and count of elements + Interleaved = DAG.getBitcast(ConcatVT, Interleaved); + } else { + // Otherwise, fallback to using vrgathere16.vv + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, + Op.getOperand(0), Op.getOperand(1)); + + MVT IdxVT = MVT::getVectorVT(MVT::i16, ConcatVT.getVectorElementCount()); + + SDValue StepVec = DAG.getStepVector(DL, IdxVT); + + SDValue One = DAG.getConstant(1, DL, XLenVT); + + // ... 0 1 0 1 0 1 0 1 + SDValue OddMask = DAG.getNode(ISD::AND, DL, IdxVT, StepVec, + DAG.getSplatVector(IdxVT, DL, One)); + OddMask = convertToMask(OddMask, DL, DAG, Subtarget); + + SDValue VLMAX = DAG.getNode( + ISD::VSCALE, DL, XLenVT, + getVLOp(VecVT.getVectorMinNumElements(), DL, DAG, Subtarget)); + + // Build up the index vector for interleaving the concatenated vector + // ... 3 3 2 2 1 1 0 0 + SDValue Idx = DAG.getNode(ISD::SRL, DL, IdxVT, StepVec, + DAG.getSplatVector(IdxVT, DL, One)); + // ... n+3 3 n+2 2 n+1 1 n 0 + Idx = DAG.getNode(RISCVISD::ADD_VL, DL, IdxVT, Idx, + DAG.getSplatVector(IdxVT, DL, VLMAX), DAG.getUNDEF(IdxVT), + OddMask, VL); + + // Perform the interleaving + Interleaved = DAG.getNode(RISCVISD::VRGATHEREI16_VV_VL, DL, ConcatVT, + Concat, Idx, DAG.getUNDEF(ConcatVT), OddMask, VL); + } + + // Extract the two halves from the interleaved result + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved, + DAG.getVectorIdxConstant(0, DL)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved, + DAG.getVectorIdxConstant(VecVT.getVectorMinNumElements(), DL)); + + return DAG.getMergeValues({Lo, Hi}, DL); +} + // Lower step_vector to the vid instruction. Any non-identity step value must // be accounted for my manual expansion. SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -0,0 +1,405 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV64 %s + +; Integers + +define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { +; RV32-LABEL: vector_deinterleave_v16i1_v32i1: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sb a0, 16(sp) +; RV32-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV32-NEXT: vmv.x.s a0, v0 +; RV32-NEXT: slli a1, a0, 17 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 23(sp) +; RV32-NEXT: slli a1, a0, 19 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 22(sp) +; RV32-NEXT: slli a1, a0, 21 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 21(sp) +; RV32-NEXT: slli a1, a0, 23 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 20(sp) +; RV32-NEXT: slli a1, a0, 25 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 19(sp) +; RV32-NEXT: slli a1, a0, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 18(sp) +; RV32-NEXT: slli a1, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 17(sp) +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a1, v8 +; RV32-NEXT: seqz a1, a1 +; RV32-NEXT: sb a1, 24(sp) +; RV32-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: slli a2, a1, 17 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 31(sp) +; RV32-NEXT: slli a2, a1, 19 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 30(sp) +; RV32-NEXT: slli a2, a1, 21 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 29(sp) +; RV32-NEXT: slli a2, a1, 23 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 28(sp) +; RV32-NEXT: slli a2, a1, 25 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 27(sp) +; RV32-NEXT: slli a2, a1, 27 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 26(sp) +; RV32-NEXT: slli a2, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 25(sp) +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 7(sp) +; RV32-NEXT: slli a2, a0, 18 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 6(sp) +; RV32-NEXT: slli a2, a0, 20 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 5(sp) +; RV32-NEXT: slli a2, a0, 22 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 4(sp) +; RV32-NEXT: slli a2, a0, 24 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 3(sp) +; RV32-NEXT: slli a2, a0, 26 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 2(sp) +; RV32-NEXT: slli a2, a0, 28 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: sb a2, 1(sp) +; RV32-NEXT: slli a0, a0, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 0(sp) +; RV32-NEXT: slli a0, a1, 16 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: slli a0, a1, 18 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: slli a0, a1, 20 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: slli a0, a1, 22 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: slli a0, a1, 24 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: slli a0, a1, 26 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: slli a1, a1, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: sb a1, 8(sp) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vle8.v v8, (a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vand.vi v8, v8, 1 +; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: vand.vi v8, v9, 1 +; RV32-NEXT: vmsne.vi v8, v8, 0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_v16i1_v32i1: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sb a0, 16(sp) +; RV64-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV64-NEXT: vmv.x.s a0, v0 +; RV64-NEXT: slli a1, a0, 49 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 23(sp) +; RV64-NEXT: slli a1, a0, 51 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 22(sp) +; RV64-NEXT: slli a1, a0, 53 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 21(sp) +; RV64-NEXT: slli a1, a0, 55 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 20(sp) +; RV64-NEXT: slli a1, a0, 57 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 19(sp) +; RV64-NEXT: slli a1, a0, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 18(sp) +; RV64-NEXT: slli a1, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 17(sp) +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v8, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a1, v8 +; RV64-NEXT: seqz a1, a1 +; RV64-NEXT: sb a1, 24(sp) +; RV64-NEXT: vsetivli zero, 0, e16, mf4, ta, ma +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: slli a2, a1, 49 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 31(sp) +; RV64-NEXT: slli a2, a1, 51 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 30(sp) +; RV64-NEXT: slli a2, a1, 53 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 29(sp) +; RV64-NEXT: slli a2, a1, 55 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 28(sp) +; RV64-NEXT: slli a2, a1, 57 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 27(sp) +; RV64-NEXT: slli a2, a1, 59 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 26(sp) +; RV64-NEXT: slli a2, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 25(sp) +; RV64-NEXT: slli a2, a0, 48 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 7(sp) +; RV64-NEXT: slli a2, a0, 50 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 6(sp) +; RV64-NEXT: slli a2, a0, 52 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 5(sp) +; RV64-NEXT: slli a2, a0, 54 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 4(sp) +; RV64-NEXT: slli a2, a0, 56 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 3(sp) +; RV64-NEXT: slli a2, a0, 58 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 2(sp) +; RV64-NEXT: slli a2, a0, 60 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sb a2, 1(sp) +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 0(sp) +; RV64-NEXT: slli a0, a1, 48 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 15(sp) +; RV64-NEXT: slli a0, a1, 50 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 14(sp) +; RV64-NEXT: slli a0, a1, 52 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 13(sp) +; RV64-NEXT: slli a0, a1, 54 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 12(sp) +; RV64-NEXT: slli a0, a1, 56 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 11(sp) +; RV64-NEXT: slli a0, a1, 58 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 10(sp) +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: sb a0, 9(sp) +; RV64-NEXT: slli a1, a1, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 8(sp) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vle8.v v8, (a0) +; RV64-NEXT: mv a0, sp +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vand.vi v8, v8, 1 +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: vand.vi v8, v9, 1 +; RV64-NEXT: vmsne.vi v8, v8, 0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret +%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec) +ret {<16 x i1>, <16 x i1>} %retval +} + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {, } @vector_deinterleave_nxv16i1_nxv32i1( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv16i8_nxv32i8( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i8( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8i16_nxv16i16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16i16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4i32_nxvv8i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8i32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2i64_nxv4i64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4i64( %vec) +ret {, } %retval +} + +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i1() +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4i64() + +; Floats + +define {, } @vector_deinterleave_nxv2f16_nxv4f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f16_nxv8f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f32_nxv4f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8f16_nxv16f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vnsrl.wi v14, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv.v.v v10, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f32_nxv8f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v12, v8, a0 +; CHECK-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v14 +; CHECK-NEXT: vmv.v.v v10, v12 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f64_nxv4f64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f64( %vec) +ret {, } %retval +} + +declare {,} @llvm.experimental.vector.deinterleave2.nxv4f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f64() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { + %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b) + ret <32 x i1> %res +} + +define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vector_interleave_v16i16_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) + ret <16 x i16> %res +} + +define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vector_interleave_v8i32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI2_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) + ret <8 x i32> %res +} + +define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { + %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b) + ret <4 x i64> %res +} + +declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + +; Floats + +define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: vector_interleave_v4f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) + ret <4 x half> %res +} + +define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: vector_interleave_v8f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI5_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + ret <8 x half> %res +} + +define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: vector_interleave_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI6_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + ret <4 x float> %res +} + +define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: vector_interleave_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + ret <16 x half> %res +} + +define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: vector_interleave_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + ret <8 x float> %res +} + +define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> %b) { + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + ret <4 x double> %res +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define @vector_interleave_nxv32i1_nxv16i1( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vwadd.vv v16, v8, v12 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v16, a0, v12 +; CHECK-NEXT: vmsne.vi v8, v18, 0 +; CHECK-NEXT: vmsne.vi v0, v16, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v8, a0 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) + ret %res +} + +define @vector_interleave_nxv16i16_nxv8i16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vwadd.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16i16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8i32_nxv4i32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vwadd.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8i32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4i64_nxv2i64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4i64( %a, %b) + ret %res +} + +declare @llvm.experimental.vector.interleave2.nxv32i1(, ) +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +; Floats + +define @vector_interleave_nxv4f16_nxv2f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwadd.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v10, a0, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v10, a0 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f16_nxv4f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vwadd.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f32_nxv2f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vwadd.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v10, a0, v9 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vwadd.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vwadd.vv v12, v8, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmacc.vx v12, a0, v10 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vand.vi v13, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v13, 0 +; CHECK-NEXT: vsrl.vi v12, v12, 1 +; CHECK-NEXT: vadd.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f64( %a, %b) + ret %res +} + + +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) +declare @llvm.experimental.vector.interleave2.nxv4f64(, )