diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6681,33 +6681,102 @@
   return TruncVals.front();
 }
 
+// (concat_vectors (extract_subvector x, 0),
+//                 (extract_subvector x, n),
+//                 (extract_subvector x, n*2),
+//                 undef*)
+//   -> (insert_subvector undef, x, 0)
+static SDValue getInsertOfExtractsFromConcat(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue Src;
+  for (unsigned i = 0; i < Op.getNumOperands(); i++) {
+    SDValue V = Op.getOperand(i);
+    MVT VVT = V.getSimpleValueType();
+    if (V.isUndef())
+      continue;
+    if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return SDValue();
+    if (V.getConstantOperandVal(1) != i * VVT.getVectorMinNumElements())
+      return SDValue();
+    SDValue ExtractSrc = V.getOperand(0);
+    if (Src && Src != ExtractSrc)
+      return SDValue();
+    // Ensure the concatenated extracts cover the source vector exactly
+    if (VVT.getVectorElementCount() * Op.getNumOperands() !=
+        ExtractSrc.getSimpleValueType().getVectorElementCount())
+      return SDValue();
+    Src = ExtractSrc;
+  }
+  if (!Src)
+    return SDValue();
+  // We can only insert the source if it's no larger than the result type
+  if (Src.getSimpleValueType().getVectorMinNumElements() >
+      VT.getVectorMinNumElements())
+    return SDValue();
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Src,
+                     DAG.getVectorIdxConstant(0, DL));
+}
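
As an aside from the patch itself, the sketch below is a minimal standalone model (plain C++, not the SelectionDAG code above; all names are illustrative) of the conditions getInsertOfExtractsFromConcat checks: every non-undef concat operand must be an in-order EXTRACT_SUBVECTOR of one common source, and together the operands must span that source exactly. The real combine additionally requires the source to be no wider than the concat result type before emitting the insert_subvector.

#include <cstdio>
#include <optional>
#include <vector>

// One concat operand: either undef, or an extract of SubElts elements
// starting at Offset from source vector SrcId.
struct ConcatOp {
  bool IsUndef;
  int SrcId;
  unsigned Offset;
};

// Returns the common source if (concat_vectors op0, op1, ...) can be folded
// to (insert_subvector undef, src, 0); std::nullopt otherwise.
static std::optional<int>
matchInsertOfExtracts(const std::vector<ConcatOp> &Ops, unsigned SubElts,
                      unsigned SrcElts) {
  std::optional<int> Src;
  for (unsigned i = 0; i < Ops.size(); i++) {
    if (Ops[i].IsUndef)
      continue;
    if (Ops[i].Offset != i * SubElts) // extracts must appear in order
      return std::nullopt;
    if (Src && *Src != Ops[i].SrcId) // and all read the same source
      return std::nullopt;
    if (SubElts * Ops.size() != SrcElts) // and span the source exactly
      return std::nullopt;
    Src = Ops[i].SrcId;
  }
  return Src;
}

int main() {
  // e.g. four 4-element extracts of one 16-element source at offsets 0, 4, 8, 12.
  std::vector<ConcatOp> InOrder = {
      {false, 0, 0}, {false, 0, 4}, {false, 0, 8}, {false, 0, 12}};
  // The same extracts concatenated out of order do not match.
  std::vector<ConcatOp> OutOfOrder = {
      {false, 0, 4}, {false, 0, 0}, {false, 0, 8}, {false, 0, 12}};
  std::printf("in order:     %s\n",
              matchInsertOfExtracts(InOrder, 4, 16) ? "folds" : "no fold");
  std::printf("out of order: %s\n",
              matchInsertOfExtracts(OutOfOrder, 4, 16) ? "folds" : "no fold");
}
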
+
+// Concatenate the vectors together into the next legal vector type.
+static SDValue getExtendedConcatVector(SmallVector<SDValue> Vecs, SDLoc &DL,
+                                       SelectionDAG &DAG) {
+  MVT VecVT = Vecs[0].getSimpleValueType();
+
+  ElementCount ConcatCount =
+      VecVT.getVectorElementCount().multiplyCoefficientBy(Vecs.size());
+  MVT ConcatVT = MVT::getVectorVT(VecVT.getVectorElementType(), ConcatCount);
+
+  // ConcatVT may not be a valid MVT, usually because of an unsupported element
+  // count like nxv6i32. In that case we attempt to legalize it by padding it
+  // out to the next power of two with undefs.
+  SmallVector<SDValue> ConcatOps(Vecs);
+  if (!ConcatVT.isValid()) {
+    MVT NewConcatVT = MVT::getVectorVT(VecVT.getVectorElementType(),
+                                       ConcatCount.coefficientNextPowerOf2());
+    unsigned NumUndefs = (NewConcatVT.getVectorMinNumElements() -
+                          ConcatCount.getKnownMinValue()) /
+                         VecVT.getVectorMinNumElements();
+    for (unsigned i = 0; i < NumUndefs; i++)
+      ConcatOps.push_back(DAG.getUNDEF(VecVT));
+    ConcatVT = NewConcatVT;
+  }
+
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, ConcatOps);
+  // If coming from vector_interleave/vector_deinterleave, Vecs will often be
+  // extract_subvectors of the same source vector. In that case we can just
+  // extend the source vector with insert_subvector into an undef vector.
+  if (SDValue Insert = getInsertOfExtractsFromConcat(Concat, DAG))
+    return Insert;
+  return Concat;
+}
+
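
To make the padding step above concrete, here is a small standalone illustration (plain arithmetic, not the MVT/ElementCount APIs) of how many undef operands are appended when NF parts do not add up to a power-of-two element count; the 4-element part size is just an assumed example.

#include <cstdio>

static unsigned nextPowerOf2(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P *= 2;
  return P;
}

int main() {
  const unsigned EltsPerPart = 4; // e.g. each operand is an nxv4i8-like part
  for (unsigned NF : {2u, 3u, 4u, 5u}) {
    unsigned ConcatElts = NF * EltsPerPart;          // e.g. 12 elts for NF=3
    unsigned PaddedElts = nextPowerOf2(ConcatElts);  // round up to a legal count
    unsigned NumUndefs = (PaddedElts - ConcatElts) / EltsPerPart;
    std::printf("NF=%u: %u elts -> pad to %u elts with %u undef part(s)\n", NF,
                ConcatElts, PaddedElts, NumUndefs);
  }
}
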
 SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
                                                       SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MVT VecVT = Op.getSimpleValueType();
   MVT XLenVT = Subtarget.getXLenVT();
+  unsigned NF = Op.getNumOperands();
   assert(VecVT.isScalableVector() &&
-         "vector_interleave on non-scalable vector!");
+         "vector_deinterleave on non-scalable vector!");
 
   // 1 bit element vectors need to be widened to e8
   if (VecVT.getVectorElementType() == MVT::i1)
     return widenVectorOpsToi8(Op, DL, DAG);
 
   // Concatenate the two vectors as one vector to deinterleave
-  MVT ConcatVT =
-      MVT::getVectorVT(VecVT.getVectorElementType(),
-                       VecVT.getVectorElementCount().multiplyCoefficientBy(2));
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
-                               Op.getOperand(0), Op.getOperand(1));
+  SDValue Concat =
+      getExtendedConcatVector(SmallVector<SDValue>(Op->ops()), DL, DAG);
+  MVT ConcatVT = Concat.getSimpleValueType();
 
   // We want to operate on all lanes, so get the mask and VL and mask for it
   auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget);
   SDValue Passthru = DAG.getUNDEF(ConcatVT);
 
-  // We can deinterleave through vnsrl.wi if the element type is smaller than
-  // ELEN
-  if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
+  // We can deinterleave through vnsrl.wi if NF=2 and the element type is
+  // smaller than ELEN
+  if (NF == 2 && VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
     SDValue Even =
         getDeinterleaveViaVNSRL(DL, VecVT, Concat, true, Subtarget, DAG);
     SDValue Odd =
@@ -6717,30 +6786,46 @@
 
   // For the indices, use the same SEW to avoid an extra vsetvli
   MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
-  // Create a vector of even indices {0, 2, 4, ...}
-  SDValue EvenIdx =
-      DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2));
-  // Create a vector of odd indices {1, 3, 5, ... }
-  SDValue OddIdx =
-      DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
-
-  // Gather the even and odd elements into two separate vectors
-  SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
-                                 Concat, EvenIdx, Passthru, Mask, VL);
-  SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
-                                Concat, OddIdx, Passthru, Mask, VL);
-
-  // Extract the result half of the gather for even and odd
-  SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
-                             DAG.getConstant(0, DL, XLenVT));
-  SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
-                            DAG.getConstant(0, DL, XLenVT));
+  unsigned EltSize = ConcatVT.getVectorElementType().getSizeInBits();
+  unsigned MinSize = ConcatVT.getSizeInBits().getKnownMinValue();
+  unsigned VectorBitsMax = Subtarget.getRealMaxVLen();
+  unsigned MaxVLMAX =
+      RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
+  unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+
+  // If this is SEW=8 and VLMAX is potentially more than 256, we need
+  // to use vrgatherei16.vv.
+  // TODO: Share this logic with lowerVECTOR_REVERSE
+  if (MaxVLMAX > 256 && EltSize == 8) {
+    GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+    IdxVT = IdxVT.changeVectorElementType(MVT::i16);
+  }
+
+  SmallVector<SDValue> Results;
 
-  return DAG.getMergeValues({Even, Odd}, DL);
+  SDValue Idx =
+      DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), NF));
+
+  // Generate indices of the following and gather them:
+  // NF=2: <0, 2, 4, 6, ...>, <1, 3, 5, 7, ...>
+  // NF=3: <0, 3, 6, 9, ...>, <1, 4, 7, 10, ...>, <2, 5, 8, 11, ...>
+  // etc.
+  for (unsigned i = 0; i < NF; i++) {
+    SDValue Wide =
+        DAG.getNode(GatherOpc, DL, ConcatVT, Concat, Idx, Passthru, Mask, VL);
+    // Extract out the deinterleaved vector from the start of the gather result
+    SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Wide,
+                              DAG.getConstant(0, DL, XLenVT));
+    Results.push_back(Res);
+    Idx = DAG.getNode(ISD::ADD, DL, IdxVT, Idx, DAG.getConstant(1, DL, IdxVT));
+  }
+  return DAG.getMergeValues(Results, DL);
 }
 
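
To show the two index schemes side by side, the following standalone sketch (plain C++; the NF and VLMAX values are arbitrary examples) models the indices involved: the deinterleave loop above gathers result j with the strided indices j, j+NF, j+2*NF, ..., while lowerVECTOR_INTERLEAVE further down builds the index (i / NF) + (i % NF) * VLMAX out of udiv/urem/mul/add nodes so that interleaved lane i reads element i/NF of input i%NF.

#include <cstdio>
#include <vector>

int main() {
  const unsigned NF = 3, VLMAX = 4; // VLMAX = elements per input vector

  // The concatenation of the NF inputs: input v holds v*10+0 .. v*10+VLMAX-1.
  std::vector<int> Concat;
  for (unsigned v = 0; v < NF; v++)
    for (unsigned i = 0; i < VLMAX; i++)
      Concat.push_back(v * 10 + i);

  // Interleave: lane i of the result is element i/NF of input i%NF.
  std::vector<int> Interleaved;
  for (unsigned i = 0; i < NF * VLMAX; i++)
    Interleaved.push_back(Concat[i / NF + (i % NF) * VLMAX]);

  std::printf("interleaved:    ");
  for (int E : Interleaved)
    std::printf("%d ", E);
  std::printf("\n");

  // Deinterleave: result j gathers lanes j, j+NF, j+2*NF, ... and recovers
  // the original inputs.
  for (unsigned j = 0; j < NF; j++) {
    std::printf("deinterleave %u: ", j);
    for (unsigned i = j; i < NF * VLMAX; i += NF)
      std::printf("%d ", Interleaved[i]);
    std::printf("\n");
  }
}
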
 SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
+  unsigned NF = Op.getNumOperands();
   SDLoc DL(Op);
   MVT VecVT = Op.getSimpleValueType();
@@ -6751,62 +6836,64 @@
   if (VecVT.getVectorElementType() == MVT::i1)
     return widenVectorOpsToi8(Op, DL, DAG);
 
-  MVT XLenVT = Subtarget.getXLenVT();
-  SDValue VL = DAG.getRegister(RISCV::X0, XLenVT);
-
   SDValue Interleaved;
 
-  // If the element type is smaller than ELEN, then we can interleave with
-  // vwaddu.vv and vwmaccu.vx
-  if (VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
+  // If NF = 2 and the element type is smaller than ELEN, then we can
+  // interleave with vwaddu.vv and vwmaccu.vx
+  if (NF == 2 && VecVT.getScalarSizeInBits() < Subtarget.getELEN()) {
     Interleaved = getWideningInterleave(Op.getOperand(0), Op.getOperand(1), DL,
                                         DAG, Subtarget);
   } else {
-    // Otherwise, fallback to using vrgathere16.vv
-    MVT ConcatVT =
-        MVT::getVectorVT(VecVT.getVectorElementType(),
-                         VecVT.getVectorElementCount().multiplyCoefficientBy(2));
-    SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
-                                 Op.getOperand(0), Op.getOperand(1));
+    SDValue Concat =
+        getExtendedConcatVector(SmallVector<SDValue>(Op->op_values()), DL, DAG);
+    MVT ConcatVT = Concat.getSimpleValueType();
     MVT IdxVT = ConcatVT.changeVectorElementType(MVT::i16);
 
-    // 0 1 2 3 4 5 6 7 ...
-    SDValue StepVec = DAG.getStepVector(DL, IdxVT);
-
-    // 1 1 1 1 1 1 1 1 ...
-    SDValue Ones = DAG.getSplatVector(IdxVT, DL, DAG.getConstant(1, DL, XLenVT));
-
-    // 1 0 1 0 1 0 1 0 ...
-    SDValue OddMask = DAG.getNode(ISD::AND, DL, IdxVT, StepVec, Ones);
-    OddMask = DAG.getSetCC(
-        DL, IdxVT.changeVectorElementType(MVT::i1), OddMask,
-        DAG.getSplatVector(IdxVT, DL, DAG.getConstant(0, DL, XLenVT)),
-        ISD::CondCode::SETNE);
-
-    SDValue VLMax = DAG.getSplatVector(IdxVT, DL, computeVLMax(VecVT, DL, DAG));
-
-    // Build up the index vector for interleaving the concatenated vector
-    // 0 0 1 1 2 2 3 3 ...
-    SDValue Idx = DAG.getNode(ISD::SRL, DL, IdxVT, StepVec, Ones);
-    // 0 n 1 n+1 2 n+2 3 n+3 ...
-    Idx =
-        DAG.getNode(RISCVISD::ADD_VL, DL, IdxVT, Idx, VLMax, Idx, OddMask, VL);
+    // The length of one of the input vectors
+    SDValue VLMax = computeVLMax(VecVT, DL, DAG);
 
-    // Then perform the interleave
-    // v[0] v[n] v[1] v[n+1] v[2] v[n+2] v[3] v[n+3] ...
-    Interleaved = DAG.getNode(RISCVISD::VRGATHEREI16_VV_VL, DL, ConcatVT,
-                              Concat, Idx, DAG.getUNDEF(ConcatVT), OddMask, VL);
+    // Compute the indices for interleaving:
+    //
+    // 0 1 2 3 4 5                  <- step
+    // for NF = 2
+    // 0 0 1 1 2 2                  <- (/ step 2)
+    // 0 1 0 1 0 1                  <- (rem step 2)
+    // 0 n*1 0 n*1 0 n*1            <- (* (rem step 2) vlmax)
+    // 0 n*1 1 n*1+1 2 n*1+2        <- (+ (* (rem step 2) vlmax) (/ step 2))
+    //
+    // for NF = 3
+    // 0 0 0 1 1 1                  <- (/ step 3)
+    // 0 1 2 0 1 2                  <- (rem step 3)
+    // 0 n*1 n*2 0 n*1 n*2          <- (* (rem step 3) vlmax)
+    // 0 n*1 n*2 1 n*1+1 n*2+1      <- (+ (* (rem step 3) vlmax) (/ step 3))
+    SDValue StepVec = DAG.getStepVector(DL, IdxVT);
+    SDValue NFSplat = DAG.getSplatVector(
+        IdxVT, DL, DAG.getConstant(NF, DL, Subtarget.getXLenVT()));
+    SDValue Div = DAG.getNode(ISD::UDIV, DL, IdxVT, StepVec, NFSplat);
+    SDValue Idx = DAG.getNode(ISD::UREM, DL, IdxVT, StepVec, NFSplat);
+    Idx = DAG.getNode(ISD::MUL, DL, IdxVT, Idx,
+                      DAG.getSplatVector(IdxVT, DL, VLMax));
+    Idx = DAG.getNode(ISD::ADD, DL, IdxVT, Div, Idx);
+
+    // Perform the interleave
+    auto [TrueMask, VL] =
+        getDefaultVLOps(ConcatVT, ConcatVT, DL, DAG, Subtarget);
+    Interleaved =
+        DAG.getNode(RISCVISD::VRGATHEREI16_VV_VL, DL, ConcatVT, Concat, Idx,
+                    DAG.getUNDEF(ConcatVT), TrueMask, VL);
   }
 
-  // Extract the two halves from the interleaved result
-  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
-                           DAG.getVectorIdxConstant(0, DL));
-  SDValue Hi = DAG.getNode(
-      ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved,
-      DAG.getVectorIdxConstant(VecVT.getVectorMinNumElements(), DL));
-
-  return DAG.getMergeValues({Lo, Hi}, DL);
+  // Extract the NF subvectors from the interleaved result
+  SmallVector<SDValue> SubVecs;
+  for (unsigned i = 0; i < NF; i++) {
+    SDValue Idx =
+        DAG.getVectorIdxConstant(VecVT.getVectorMinNumElements() * i, DL);
+    SDValue SubVec =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Interleaved, Idx);
+    SubVecs.push_back(SubVec);
  }
+  return DAG.getMergeValues(SubVecs, DL);
 }
 
 // Lower step_vector to the vid instruction.
Any non-identity step value must diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zfh,+experimental-zvfh | FileCheck --check-prefixes=CHECK,RV64 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zfh,+experimental-zvfh -riscv-v-vector-bits-max=256 | FileCheck --check-prefixes=CHECK,256BIT-VEC %s ; Integers @@ -41,6 +42,146 @@ ret {, } %retval } + +define {, , } @vector_deinterleave_nxv4i8_nxv12i8( %vec) { +; RV32-LABEL: vector_deinterleave_nxv4i8_nxv12i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: li a0, 3 +; RV32-NEXT: vmul.vx v16, v12, a0 +; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RV32-NEXT: vadd.vi v20, v16, 1 +; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV32-NEXT: vrgatherei16.vv v14, v8, v20 +; RV32-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RV32-NEXT: vadd.vi v16, v16, 2 +; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV32-NEXT: vrgatherei16.vv v10, v8, v16 +; RV32-NEXT: vmv1r.v v8, v12 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: # kill: def $v10 killed $v10 killed $v10m2 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_nxv4i8_nxv12i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; RV64-NEXT: vid.v v12 +; RV64-NEXT: li a0, 3 +; RV64-NEXT: vmul.vx v16, v12, a0 +; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v12, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RV64-NEXT: vadd.vi v20, v16, 1 +; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v14, v8, v20 +; RV64-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RV64-NEXT: vadd.vi v16, v16, 2 +; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v8, v16 +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: # kill: def $v10 killed $v10 killed $v10m2 +; RV64-NEXT: ret +; +; 256BIT-VEC-LABEL: vector_deinterleave_nxv4i8_nxv12i8: +; 256BIT-VEC: # %bb.0: +; 256BIT-VEC-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; 256BIT-VEC-NEXT: vid.v v10 +; 256BIT-VEC-NEXT: li a0, 3 +; 256BIT-VEC-NEXT: vmul.vx v10, v10, a0 +; 256BIT-VEC-NEXT: vrgather.vv v12, v8, v10 +; 256BIT-VEC-NEXT: vadd.vi v14, v10, 1 +; 256BIT-VEC-NEXT: vrgather.vv v16, v8, v14 +; 256BIT-VEC-NEXT: vadd.vi v14, v10, 2 +; 256BIT-VEC-NEXT: vrgather.vv v10, v8, v14 +; 256BIT-VEC-NEXT: vmv1r.v v8, v12 +; 256BIT-VEC-NEXT: vmv1r.v v9, v16 +; 256BIT-VEC-NEXT: # kill: def $v10 killed $v10 killed $v10m2 +; 256BIT-VEC-NEXT: ret +%retval = call {, , } @llvm.experimental.vector.deinterleave.nxv4i8.nxv12i8( %vec) +ret {, , } %retval +} + + +define {, , , , } @vector_deinterleave_nxv1i8_nxv5i8( %vec) { +; RV32-LABEL: vector_deinterleave_nxv1i8_nxv5i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: li a0, 5 +; RV32-NEXT: vmul.vx v14, v10, a0 
+; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vrgatherei16.vv v13, v8, v14 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v10, v14, 1 +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vrgatherei16.vv v9, v8, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v16, v14, 2 +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vrgatherei16.vv v10, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v16, v14, 3 +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vrgatherei16.vv v11, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v14, v14, 4 +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vrgatherei16.vv v12, v8, v14 +; RV32-NEXT: vmv.v.v v8, v13 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_nxv1i8_nxv5i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; RV64-NEXT: vid.v v10 +; RV64-NEXT: li a0, 5 +; RV64-NEXT: vmul.vx v14, v10, a0 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-NEXT: vrgatherei16.vv v13, v8, v14 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v10, v14, 1 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-NEXT: vrgatherei16.vv v9, v8, v10 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v16, v14, 2 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v16, v14, 3 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-NEXT: vrgatherei16.vv v11, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v14, v14, 4 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-NEXT: vrgatherei16.vv v12, v8, v14 +; RV64-NEXT: vmv.v.v v8, v13 +; RV64-NEXT: ret +; +; 256BIT-VEC-LABEL: vector_deinterleave_nxv1i8_nxv5i8: +; 256BIT-VEC: # %bb.0: +; 256BIT-VEC-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; 256BIT-VEC-NEXT: vid.v v9 +; 256BIT-VEC-NEXT: li a0, 5 +; 256BIT-VEC-NEXT: vmul.vx v12, v9, a0 +; 256BIT-VEC-NEXT: vrgather.vv v13, v8, v12 +; 256BIT-VEC-NEXT: vadd.vi v10, v12, 1 +; 256BIT-VEC-NEXT: vrgather.vv v9, v8, v10 +; 256BIT-VEC-NEXT: vadd.vi v11, v12, 2 +; 256BIT-VEC-NEXT: vrgather.vv v10, v8, v11 +; 256BIT-VEC-NEXT: vadd.vi v14, v12, 3 +; 256BIT-VEC-NEXT: vrgather.vv v11, v8, v14 +; 256BIT-VEC-NEXT: vadd.vi v14, v12, 4 +; 256BIT-VEC-NEXT: vrgather.vv v12, v8, v14 +; 256BIT-VEC-NEXT: vmv.v.v v8, v13 +; 256BIT-VEC-NEXT: ret +%retval = call {, , , , } @llvm.experimental.vector.deinterleave.nxv1i8.nxv5i8( %vec) +ret {, , , , } %retval +} + define {, } @vector_deinterleave_nxv8i16_nxv16i16( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: ; CHECK: # %bb.0: @@ -54,8 +195,52 @@ ret {, } %retval } -define {, } @vector_deinterleave_nxv4i32_nxvv8i32( %vec) { -; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32: + + +define {, , , } @vector_deinterleave_nxv16i16_nxv4i16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i16_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vsll.vi v16, v12, 2 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v20, v16, 1 +; CHECK-NEXT: vrgather.vv v24, v8, v20 +; CHECK-NEXT: vadd.vi v20, v16, 2 +; CHECK-NEXT: vrgather.vv v28, v8, v20 +; CHECK-NEXT: vadd.vi v16, v16, 3 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v24 +; 
CHECK-NEXT: vmv1r.v v10, v28 +; CHECK-NEXT: vmv1r.v v11, v20 +; CHECK-NEXT: ret +%retval = call {, , , } @llvm.experimental.vector.deinterleave.nxv4i16.nxv16i16( %vec) +ret {, , , } %retval +} + +define {, , } @vector_deinterleave_nxv2i32_nxv6i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i32_nxv6i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 3 +; CHECK-NEXT: vmul.vx v16, v12, a0 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v20, v16, 1 +; CHECK-NEXT: vrgather.vv v24, v8, v20 +; CHECK-NEXT: vadd.vi v16, v16, 2 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v24 +; CHECK-NEXT: vmv1r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, , } @llvm.experimental.vector.deinterleave.nxv2i32.nxv6i32( %vec) +ret {, , } %retval +} + +define {, } @vector_deinterleave_nxv4i32_nxv8i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -57,21 +57,66 @@ ret %res } +define @vector_interleave_nxv6i32_nxv2i32( %a, %b, %c) { +; CHECK-LABEL: vector_interleave_nxv6i32_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v10 killed $v10 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 3 +; CHECK-NEXT: vdivu.vx v14, v12, a0 +; CHECK-NEXT: vremu.vx v16, v12, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vmadd.vx v16, a0, v14 +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call (...) @llvm.experimental.vector.interleave.nxv6i32.nxv2i32( %a, %b, %c) + ret %res +} + +define @vector_interleave_nxv8i32_nxv2i32( %a, %b, %c, %d) { +; CHECK-LABEL: vector_interleave_nxv8i32_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v11 killed $v11 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vdivu.vx v14, v12, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vand.vi v16, v12, 3 +; CHECK-NEXT: # kill: def $v10 killed $v10 killed $v8m4 def $v8m4 +; CHECK-NEXT: vmadd.vx v16, a0, v14 +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m4 def $v8m4 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = call (...) 
@llvm.experimental.vector.interleave.nxv8i32.nxv2i32( %a, %b, %c, %d) + ret %res +} + define @vector_interleave_nxv4i64_nxv2i64( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vdivu.vx v13, v12, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vand.vi v13, v12, 1 -; CHECK-NEXT: vmsne.vi v0, v13, 0 -; CHECK-NEXT: vsrl.vi v16, v12, 1 -; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t +; CHECK-NEXT: vand.vi v16, v12, 1 +; CHECK-NEXT: vmadd.vx v16, a0, v13 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %res = call (...) @llvm.experimental.vector.interleave.nxv4i64.nxv2i64( %a, %b) @@ -81,6 +126,7 @@ declare @llvm.experimental.vector.interleave.nxv32i1.nxv16i1(...) declare @llvm.experimental.vector.interleave.nxv16i16.nxv8i16(...) declare @llvm.experimental.vector.interleave.nxv8i32.nxv4i32(...) +declare @llvm.experimental.vector.interleave.nxv8i32.nxv2i32(...) declare @llvm.experimental.vector.interleave.nxv6i32.nxv2i32(...) declare @llvm.experimental.vector.interleave.nxv4i64.nxv2i64(...) @@ -162,17 +208,17 @@ ; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m4 def $v8m4 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vdivu.vx v13, v12, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vand.vi v13, v12, 1 -; CHECK-NEXT: vmsne.vi v0, v13, 0 -; CHECK-NEXT: vsrl.vi v16, v12, 1 -; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t +; CHECK-NEXT: vand.vi v16, v12, 1 +; CHECK-NEXT: vmadd.vx v16, a0, v13 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 def $v8m4 -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %res = call (...) @llvm.experimental.vector.interleave.nxv4f64.nxv2f64( %a, %b)