diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1453,6 +1453,74 @@ return SDValue(); } +// Called by type legalization to handle splat of i64 on RV32. +// FIXME: We can optimize this when the type has sign or zero bits in one +// of the halves. +static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar, + SDValue VL, SelectionDAG &DAG) { + SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, + DAG.getConstant(0, DL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, + DAG.getConstant(1, DL, MVT::i32)); + + // vmv.v.x vX, hi + // vsll.vx vX, vX, /*32*/ + // vmv.v.x vY, lo + // vsll.vx vY, vY, /*32*/ + // vsrl.vx vY, vY, /*32*/ + // vor.vv vX, vX, vY + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); + SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL); + Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL); + Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL); + + Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL); + Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL); + + return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL); +} + +// This function lowers a splat of a scalar operand Splat with the vector +// length VL. It ensures the final sequence is type legal, which is useful when +// lowering a splat after type legalization. +static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL, + SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (VT.isFloatingPoint()) + return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL); + + MVT XLenVT = Subtarget.getXLenVT(); + + // Simplest case is that the operand needs to be promoted to XLenVT. + if (Scalar.getValueType().bitsLE(XLenVT)) { + // If the operand is a constant, sign extend to increase our chances + // of being able to use a .vi instruction. ANY_EXTEND would become a + // a zero extend and the simm5 check in isel would fail. + // FIXME: Should we ignore the upper bits in isel instead? + unsigned ExtOpc = + isa(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND; + Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL); + } + + assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 && + "Unexpected scalar for splat lowering!"); + + // If this is a sign-extended 32-bit constant, we can truncate it and rely + // on the instruction to sign-extend since SEW>XLEN. + if (auto *CVal = dyn_cast(Scalar)) { + if (isInt<32>(CVal->getSExtValue())) + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, + DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32), + VL); + } + + // Otherwise use the more complicated splatting algorithm. + return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG); +} + static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue V1 = Op.getOperand(0); @@ -1463,48 +1531,130 @@ unsigned NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); + MVT ContainerVT = + RISCVTargetLowering::getContainerForFixedLengthVector(DAG, VT, Subtarget); + + SDValue TrueMask, VL; + std::tie(TrueMask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); if (Lane >= 0) { - MVT ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector( - DAG, VT, Subtarget); - V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); assert(Lane < (int)NumElts && "Unexpected lane!"); - - SDValue Mask, VL; - std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1, - DAG.getConstant(Lane, DL, XLenVT), Mask, VL); + DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL); return convertFromScalableVector(VT, Gather, DAG, Subtarget); } } - // Detect shuffles which can be re-expressed as vector selects. - SmallVector MaskVals; - // By default we preserve the original operand order, and select LHS as true - // and RHS as false. However, since RVV vector selects may feature splats but - // only on the LHS, we may choose to invert our mask and instead select - // between RHS and LHS. - bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); - + // Detect shuffles which can be re-expressed as vector selects; these are + // shuffles in which each element in the destination is taken from an element + // at the corresponding index in either source vectors. bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) { int MaskIndex = MaskIdx.value(); - bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ SwapOps; - MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts; }); - if (IsSelect) { - assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); - MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); - SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); - return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SwapOps ? V2 : V1, - SwapOps ? V1 : V2); + assert(!V1.isUndef() && "Unexpected shuffle canonicalization"); + + SmallVector MaskVals; + // As a backup, shuffles can be lowered via a vrgather instruction, possibly + // merged with a second vrgather. + SmallVector GatherIndicesLHS, GatherIndicesRHS; + + // By default we preserve the original operand order, and use a mask to + // select LHS as true and RHS as false. However, since RVV vector selects may + // feature splats but only on the LHS, we may choose to invert our mask and + // instead select between RHS and LHS. + bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); + bool InvertMask = IsSelect == SwapOps; + + // Now construct the mask that will be used by the vselect or blended + // vrgather operation. For vrgathers, construct the appropriate indices into + // each vector. + for (int MaskIndex : SVN->getMask()) { + bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask; + MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); + if (!IsSelect) { + bool IsLHS = MaskIndex < (int)NumElts; + // For "undef" elements of -1, shuffle in element 0 instead. + GatherIndicesLHS.push_back( + DAG.getConstant(IsLHS ? std::max(MaskIndex, 0) : 0, DL, XLenVT)); + // TODO: If we're masking out unused elements anyway, it might produce + // better code if we use the most-common element index instead of 0. + GatherIndicesRHS.push_back( + DAG.getConstant(IsLHS ? 0 : MaskIndex - NumElts, DL, XLenVT)); + } } - return SDValue(); + if (SwapOps) { + std::swap(V1, V2); + std::swap(GatherIndicesLHS, GatherIndicesRHS); + } + + assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); + + if (IsSelect) + return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2); + + if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) { + // On such a large vector we're unable to use i8 as the index type. + // FIXME: We could promote the index to i16 and use vrgatherei16, but that + // may involve vector splitting if we're already at LMUL=8, or our + // user-supplied maximum fixed-length LMUL. + return SDValue(); + } + + unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL; + MVT IndexVT = VT.changeTypeToInteger(); + // Since we can't introduce illegal index types at this stage, use i16 and + // vrgatherei16 if the corresponding index type for plain vrgather is greater + // than XLenVT. + if (IndexVT.getScalarType().bitsGT(XLenVT)) { + GatherOpc = RISCVISD::VRGATHEREI16_VV_VL; + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + } + + MVT IndexContainerVT = + ContainerVT.changeVectorElementType(IndexVT.getScalarType()); + + SDValue Gather; + // TODO: This doesn't trigger for i64 vectors on RV32, since there we + // encounter a bitcasted BUILD_VECTOR with low/high i32 values. + if (SDValue SplatValue = DAG.getSplatValue(V1)) { + Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget); + } else { + SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); + LHSIndices = + convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); + + V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); + Gather = + DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL); + } + + // If a second vector operand is used by this shuffle, blend it in with an + // additional vrgather. + if (!V2.isUndef()) { + MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); + SelectMask = + convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); + + SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); + RHSIndices = + convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); + + V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); + V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL); + Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2, + Gather, VL); + } + + return convertFromScalableVector(VT, Gather, DAG, Subtarget); } static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT, @@ -2778,35 +2928,6 @@ return DAG.getNode(ISD::TRUNCATE, DL, EltVT, Elt0); } -// Called by type legalization to handle splat of i64 on RV32. -// FIXME: We can optimize this when the type has sign or zero bits in one -// of the halves. -static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar, - SDValue VL, SelectionDAG &DAG) { - SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT); - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, - DAG.getConstant(0, DL, MVT::i32)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, - DAG.getConstant(1, DL, MVT::i32)); - - // vmv.v.x vX, hi - // vsll.vx vX, vX, /*32*/ - // vmv.v.x vY, lo - // vsll.vx vY, vY, /*32*/ - // vsrl.vx vY, vY, /*32*/ - // vor.vv vX, vX, vY - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); - Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL); - Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL); - Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL); - - Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL); - Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL); - - return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL); -} - // Some RVV intrinsics may claim that they want an integer operand to be // promoted or expanded. static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, @@ -2904,31 +3025,9 @@ assert(Op.getValueType() == XLenVT && "Unexpected VT!"); return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(), Op.getOperand(1)); - case Intrinsic::riscv_vmv_v_x: { - SDValue Scalar = Op.getOperand(1); - if (Scalar.getValueType().bitsLE(XLenVT)) { - unsigned ExtOpc = - isa(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND; - Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar); - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(), Scalar, - Op.getOperand(2)); - } - - assert(Scalar.getValueType() == MVT::i64 && "Unexpected scalar VT!"); - - // If this is a sign-extended 32-bit constant, we can truncate it and rely - // on the instruction to sign-extend since SEW>XLEN. - if (auto *CVal = dyn_cast(Scalar)) { - if (isInt<32>(CVal->getSExtValue())) - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(), - DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32), - Op.getOperand(2)); - } - - // Otherwise use the more complicated splatting algorithm. - return splatSplitI64WithVL(DL, Op.getSimpleValueType(), Scalar, - Op.getOperand(2), DAG); - } + case Intrinsic::riscv_vmv_v_x: + return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2), + Op.getSimpleValueType(), DL, DAG, Subtarget); case Intrinsic::riscv_vfmv_v_f: return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1089,6 +1089,18 @@ (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX) vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.SEW)>; + def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm), + (riscv_vrgather_vv_vl + vti.RegClass:$rs2, + vti.RegClass:$rs1, + (vti.Mask true_mask), + VLOpFrag), + vti.RegClass:$merge, + VLOpFrag)), + (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, + vti.Mask:$vm, GPR:$vl, vti.SEW)>; + // emul = lmul * 16 / sew defvar vlmul = vti.LMul; defvar octuple_lmul = octuple_from_str.ret; @@ -1103,6 +1115,18 @@ VLOpFrag)), (!cast(inst) vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.SEW)>; + + def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm), + (riscv_vrgatherei16_vv_vl + vti.RegClass:$rs2, + (ivti.Vector ivti.RegClass:$rs1), + (vti.Mask true_mask), + VLOpFrag), + vti.RegClass:$merge, + VLOpFrag)), + (!cast(inst#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1, + vti.Mask:$vm, GPR:$vl, vti.SEW)>; } } @@ -1136,6 +1160,18 @@ (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX) vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.SEW)>; + def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm), + (riscv_vrgather_vv_vl + vti.RegClass:$rs2, + (ivti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), + VLOpFrag), + vti.RegClass:$merge, + VLOpFrag)), + (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, + vti.Mask:$vm, GPR:$vl, vti.SEW)>; + defvar vlmul = vti.LMul; defvar octuple_lmul = octuple_from_str.ret; defvar octuple_emul = !srl(!mul(octuple_lmul, 16), shift_amount.val); @@ -1149,6 +1185,18 @@ VLOpFrag)), (!cast(inst) vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.SEW)>; + + def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm), + (riscv_vrgatherei16_vv_vl + vti.RegClass:$rs2, + (ivti.Vector ivti.RegClass:$rs1), + (vti.Mask true_mask), + VLOpFrag), + vti.RegClass:$merge, + VLOpFrag)), + (!cast(inst#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1, + vti.Mask:$vm, GPR:$vl, vti.SEW)>; } } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -61,76 +61,23 @@ define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_permute_shuffle_vu_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 1 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV32-NEXT: vfmv.f.s ft1, v8 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.s.f v25, ft1 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vse64.v v25, (a0) -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 2 -; RV32-NEXT: vfmv.f.s ft1, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft1 -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: vse64.v v25, (sp) +; RV32-NEXT: lui a0, %hi(.LCPI4_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI4_0) +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) ; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: vrgatherei16.vv v26, v8, v25 +; RV32-NEXT: vmv2r.v v8, v26 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_permute_shuffle_vu_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 1 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV64-NEXT: vfmv.f.s ft1, v8 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.s.f v25, ft1 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vse64.v v25, (a0) -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 2 -; RV64-NEXT: vfmv.f.s ft1, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft1 -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: vse64.v v25, (sp) -; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: lui a0, %hi(.LCPI4_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI4_0) +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vle64.v v28, (a0) +; RV64-NEXT: vrgather.vv v26, v8, v28 +; RV64-NEXT: vmv2r.v v8, v26 ; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> ret <4 x double> %s @@ -139,76 +86,23 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_permute_shuffle_uv_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 1 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV32-NEXT: vfmv.f.s ft1, v8 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.s.f v25, ft1 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vse64.v v25, (a0) -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 2 -; RV32-NEXT: vfmv.f.s ft1, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft1 -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: vse64.v v25, (sp) +; RV32-NEXT: lui a0, %hi(.LCPI5_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI5_0) +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) ; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: vrgatherei16.vv v26, v8, v25 +; RV32-NEXT: vmv2r.v v8, v26 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_permute_shuffle_uv_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 1 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV64-NEXT: vfmv.f.s ft1, v8 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.s.f v25, ft1 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vse64.v v25, (a0) -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 2 -; RV64-NEXT: vfmv.f.s ft1, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft1 -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: vse64.v v25, (sp) -; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: lui a0, %hi(.LCPI5_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI5_0) +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vle64.v v28, (a0) +; RV64-NEXT: vrgather.vv v26, v8, v28 +; RV64-NEXT: vmv2r.v v8, v26 ; RV64-NEXT: ret %s = shufflevector <4 x double> undef, <4 x double> %x, <4 x i32> ret <4 x double> %s @@ -217,84 +111,45 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) { ; RV32-LABEL: vrgather_shuffle_vv_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v10, 1 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vse64.v v25, (a0) -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 2 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 1 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: vse64.v v25, (sp) +; RV32-NEXT: addi a0, zero, 1 +; RV32-NEXT: addi a1, zero, 8 +; RV32-NEXT: vsetivli a2, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vmv.s.x v25, a0 +; RV32-NEXT: vmv.v.i v28, 0 +; RV32-NEXT: vsetivli a0, 4, e16,m1,tu,mu +; RV32-NEXT: vslideup.vi v28, v25, 3 +; RV32-NEXT: lui a0, %hi(.LCPI6_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI6_0) +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) ; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: vrgatherei16.vv v26, v8, v25 +; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV32-NEXT: vrgatherei16.vv v26, v10, v28, v0.t +; RV32-NEXT: vmv2r.v v8, v26 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vv_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v10, 1 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vse64.v v25, (a0) -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 2 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 1 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: vse64.v v25, (sp) -; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, zero, 1 +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vmv.s.x v26, a0 +; RV64-NEXT: vmv.v.i v28, 0 +; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV64-NEXT: vslideup.vi v28, v26, 3 +; RV64-NEXT: addi a0, zero, 8 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: lui a0, %hi(.LCPI6_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI6_0) +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vle64.v v30, (a0) +; RV64-NEXT: vrgather.vv v26, v8, v30 +; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV64-NEXT: vrgather.vv v26, v10, v28, v0.t +; RV64-NEXT: vmv2r.v v8, v26 ; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> ret <4 x double> %s @@ -303,72 +158,37 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_shuffle_xv_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 1 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 2 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: lui a0, %hi(.LCPI7_0) -; RV32-NEXT: fld ft1, %lo(.LCPI7_0)(a0) -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vse64.v v25, (a0) -; RV32-NEXT: fsd ft1, 8(sp) -; RV32-NEXT: fsd ft1, 0(sp) +; RV32-NEXT: addi a0, zero, 12 +; RV32-NEXT: lui a1, %hi(.LCPI7_0) +; RV32-NEXT: fld ft0, %lo(.LCPI7_0)(a1) +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: vfmv.v.f v26, ft0 +; RV32-NEXT: lui a0, %hi(.LCPI7_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI7_1) +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV32-NEXT: vrgatherei16.vv v26, v8, v25, v0.t +; RV32-NEXT: vmv2r.v v8, v26 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_xv_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 1 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 2 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu +; RV64-NEXT: addi a0, zero, 12 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, %hi(.LCPI7_0) -; RV64-NEXT: fld ft1, %lo(.LCPI7_0)(a0) -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vse64.v v25, (a0) -; RV64-NEXT: fsd ft1, 8(sp) -; RV64-NEXT: fsd ft1, 0(sp) -; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0) +; RV64-NEXT: lui a1, %hi(.LCPI7_1) +; RV64-NEXT: fld ft0, %lo(.LCPI7_1)(a1) +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vle64.v v28, (a0) +; RV64-NEXT: vfmv.v.f v26, ft0 +; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t +; RV64-NEXT: vmv2r.v v8, v26 ; RV64-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s @@ -377,68 +197,40 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_shuffle_vx_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV32-NEXT: vslidedown.vi v26, v8, 3 -; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.v.f v25, ft0 -; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV32-NEXT: vfmv.f.s ft0, v8 +; RV32-NEXT: addi a0, zero, 3 +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32-NEXT: vmv.s.x v25, a0 +; RV32-NEXT: vmv.v.i v28, 0 ; RV32-NEXT: lui a0, %hi(.LCPI8_0) -; RV32-NEXT: fld ft1, %lo(.LCPI8_0)(a0) -; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV32-NEXT: vfmv.s.f v25, ft0 -; RV32-NEXT: vse64.v v25, (sp) -; RV32-NEXT: fsd ft1, 24(sp) -; RV32-NEXT: fsd ft1, 16(sp) +; RV32-NEXT: fld ft0, %lo(.LCPI8_0)(a0) +; RV32-NEXT: vsetivli a0, 2, e16,m1,tu,mu +; RV32-NEXT: vslideup.vi v28, v25, 1 ; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: vfmv.v.f v26, ft0 +; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV32-NEXT: vrgatherei16.vv v26, v8, v28, v0.t +; RV32-NEXT: vmv2r.v v8, v26 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vx_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 -; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v26, v8, 3 -; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.v.f v25, ft0 -; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: lui a0, %hi(.LCPI8_0) -; RV64-NEXT: fld ft1, %lo(.LCPI8_0)(a0) -; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu -; RV64-NEXT: vfmv.s.f v25, ft0 -; RV64-NEXT: vse64.v v25, (sp) -; RV64-NEXT: fsd ft1, 24(sp) -; RV64-NEXT: fsd ft1, 16(sp) +; RV64-NEXT: addi a0, zero, 3 +; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV64-NEXT: vmv.s.x v26, a0 +; RV64-NEXT: vmv.v.i v28, 0 +; RV64-NEXT: vsetivli a1, 2, e64,m2,tu,mu +; RV64-NEXT: vslideup.vi v28, v26, 1 +; RV64-NEXT: lui a1, %hi(.LCPI8_0) +; RV64-NEXT: fld ft0, %lo(.LCPI8_0)(a1) +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: vfmv.v.f v26, ft0 +; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu +; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t +; RV64-NEXT: vmv2r.v v8, v26 ; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -57,23 +57,12 @@ define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: sh a0, 12(sp) -; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v25, v8, 1 -; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: sh a0, 14(sp) -; CHECK-NEXT: vslidedown.vi v25, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sh a1, 10(sp) -; CHECK-NEXT: sh a0, 8(sp) -; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v26, (a0) +; CHECK-NEXT: vrgather.vv v25, v8, v26 +; CHECK-NEXT: vmv1r.v v8, v25 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> ret <4 x i16> %s @@ -82,23 +71,12 @@ define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_uv_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: sh a0, 12(sp) -; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v25, v8, 1 -; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: sh a0, 14(sp) -; CHECK-NEXT: vslidedown.vi v25, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sh a1, 10(sp) -; CHECK-NEXT: sh a0, 8(sp) -; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: lui a0, %hi(.LCPI5_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v26, (a0) +; CHECK-NEXT: vrgather.vv v25, v8, v26 +; CHECK-NEXT: vmv1r.v v8, v25 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> undef, <4 x i16> %x, <4 x i32> ret <4 x i16> %s @@ -107,114 +85,64 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: vrgather_shuffle_vv_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: sh a0, 12(sp) -; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v25, v9, 1 -; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: sh a0, 14(sp) -; CHECK-NEXT: vslidedown.vi v25, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: sh a0, 10(sp) -; CHECK-NEXT: vslidedown.vi v25, v8, 1 -; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: sh a0, 8(sp) -; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, zero, 1 +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v25, a0 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu +; CHECK-NEXT: vslideup.vi v26, v25, 3 +; CHECK-NEXT: addi a0, zero, 8 +; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI6_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v27, (a0) +; CHECK-NEXT: vrgather.vv v25, v8, v27 +; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu +; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t +; CHECK-NEXT: vmv1r.v v8, v25 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> ret <4 x i16> %s } define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { -; RV32-LABEL: vrgather_shuffle_xv_v4i16: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 80 -; RV32-NEXT: addi a0, a0, 5 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; RV32-NEXT: vslidedown.vi v25, v8, 1 -; RV32-NEXT: vmv.x.s a0, v25 -; RV32-NEXT: sh a0, 14(sp) -; RV32-NEXT: vslidedown.vi v25, v8, 2 -; RV32-NEXT: vmv.x.s a0, v25 -; RV32-NEXT: sh a0, 12(sp) -; RV32-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: vrgather_shuffle_xv_v4i16: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: lui a0, 80 -; RV64-NEXT: addiw a0, a0, 5 -; RV64-NEXT: sw a0, 8(sp) -; RV64-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; RV64-NEXT: vslidedown.vi v25, v8, 1 -; RV64-NEXT: vmv.x.s a0, v25 -; RV64-NEXT: sh a0, 14(sp) -; RV64-NEXT: vslidedown.vi v25, v8, 2 -; RV64-NEXT: vmv.x.s a0, v25 -; RV64-NEXT: sh a0, 12(sp) -; RV64-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; RV64-NEXT: addi a0, sp, 8 -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: vrgather_shuffle_xv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, zero, 12 +; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v26, (a0) +; CHECK-NEXT: vmv.v.i v25, 5 +; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu +; CHECK-NEXT: vrgather.vv v25, v8, v26, v0.t +; CHECK-NEXT: vmv1r.v v8, v25 +; CHECK-NEXT: ret %s = shufflevector <4 x i16> , <4 x i16> %x, <4 x i32> ret <4 x i16> %s } define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { -; RV32-LABEL: vrgather_shuffle_vx_v4i16: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 80 -; RV32-NEXT: addi a0, a0, 5 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: sh a0, 8(sp) -; RV32-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; RV32-NEXT: vslidedown.vi v25, v8, 3 -; RV32-NEXT: vmv.x.s a0, v25 -; RV32-NEXT: sh a0, 10(sp) -; RV32-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: vrgather_shuffle_vx_v4i16: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: lui a0, 80 -; RV64-NEXT: addiw a0, a0, 5 -; RV64-NEXT: sw a0, 12(sp) -; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: sh a0, 8(sp) -; RV64-NEXT: vsetivli a0, 1, e16,m1,ta,mu -; RV64-NEXT: vslidedown.vi v25, v8, 3 -; RV64-NEXT: vmv.x.s a0, v25 -; RV64-NEXT: sh a0, 10(sp) -; RV64-NEXT: vsetivli a0, 4, e16,m1,ta,mu -; RV64-NEXT: addi a0, sp, 8 -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: vrgather_shuffle_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, zero, 3 +; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v25, a0 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vsetivli a1, 2, e16,m1,tu,mu +; CHECK-NEXT: vslideup.vi v26, v25, 1 +; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu +; CHECK-NEXT: vmv.v.i v25, 5 +; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu +; CHECK-NEXT: vrgather.vv v25, v8, v26, v0.t +; CHECK-NEXT: vmv1r.v v8, v25 +; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s } @@ -222,96 +150,23 @@ define <8 x i64> @vrgather_permute_shuffle_vu_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_permute_shuffle_vu_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: sw a0, 48(sp) -; RV32-NEXT: sw a0, 16(sp) -; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu -; RV32-NEXT: vslidedown.vi v28, v8, 3 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 60(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 2 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 56(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 1 -; RV32-NEXT: vmv.x.s a2, v28 -; RV32-NEXT: sw a2, 52(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 13 -; RV32-NEXT: vmv.x.s a3, v28 -; RV32-NEXT: sw a3, 44(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 12 -; RV32-NEXT: vmv.x.s a3, v28 -; RV32-NEXT: sw a3, 40(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 15 -; RV32-NEXT: vmv.x.s a3, v28 -; RV32-NEXT: sw a3, 36(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 14 -; RV32-NEXT: vmv.x.s a3, v28 -; RV32-NEXT: sw a3, 32(sp) -; RV32-NEXT: sw a0, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a2, 20(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 5 -; RV32-NEXT: vmv.x.s a2, v28 -; RV32-NEXT: sw a2, 12(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 4 -; RV32-NEXT: vmv.x.s a2, v28 -; RV32-NEXT: sw a2, 8(sp) -; RV32-NEXT: sw a0, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: lui a0, %hi(.LCPI9_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI9_0) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v25 +; RV32-NEXT: vmv4r.v v8, v28 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_permute_shuffle_vu_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: sd a0, 48(sp) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu -; RV64-NEXT: vslidedown.vi v28, v8, 1 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 56(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 6 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 40(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 7 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 32(sp) -; RV64-NEXT: sd a0, 24(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 2 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: sd a0, 0(sp) -; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: lui a0, %hi(.LCPI9_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI9_0) +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vle64.v v12, (a0) +; RV64-NEXT: vrgather.vv v28, v8, v12 +; RV64-NEXT: vmv4r.v v8, v28 ; RV64-NEXT: ret %s = shufflevector <8 x i64> %x, <8 x i64> undef, <8 x i32> ret <8 x i64> %s @@ -320,96 +175,23 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_permute_shuffle_uv_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: sw a0, 48(sp) -; RV32-NEXT: sw a0, 40(sp) -; RV32-NEXT: sw a0, 16(sp) -; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu -; RV32-NEXT: vslidedown.vi v28, v8, 7 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 60(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 6 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 56(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 1 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 52(sp) -; RV32-NEXT: sw a0, 44(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 15 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 14 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 3 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 2 -; RV32-NEXT: vmv.x.s a2, v28 -; RV32-NEXT: sw a2, 24(sp) -; RV32-NEXT: sw a0, 20(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 5 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 4 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: lui a0, %hi(.LCPI10_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI10_0) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v25 +; RV32-NEXT: vmv4r.v v8, v28 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_permute_shuffle_uv_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: sd a0, 48(sp) -; RV64-NEXT: sd a0, 40(sp) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu -; RV64-NEXT: vslidedown.vi v28, v8, 3 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 56(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 7 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 32(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 1 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 24(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 2 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: sd a0, 0(sp) -; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: lui a0, %hi(.LCPI10_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI10_0) +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vle64.v v12, (a0) +; RV64-NEXT: vrgather.vv v28, v8, v12 +; RV64-NEXT: vmv4r.v v8, v28 ; RV64-NEXT: ret %s = shufflevector <8 x i64> undef, <8 x i64> %x, <8 x i32> ret <8 x i64> %s @@ -418,102 +200,55 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; RV32-LABEL: vrgather_shuffle_vv_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu -; RV32-NEXT: vslidedown.vi v28, v12, 11 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 60(sp) -; RV32-NEXT: vslidedown.vi v28, v12, 10 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 56(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 7 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 52(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 6 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 48(sp) -; RV32-NEXT: vslidedown.vi v28, v12, 5 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 44(sp) -; RV32-NEXT: vslidedown.vi v28, v12, 4 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 3 -; RV32-NEXT: vmv.x.s a2, v28 -; RV32-NEXT: sw a2, 36(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 2 -; RV32-NEXT: vmv.x.s a3, v28 -; RV32-NEXT: sw a3, 32(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 11 -; RV32-NEXT: vmv.x.s a4, v28 -; RV32-NEXT: sw a4, 28(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 10 -; RV32-NEXT: vmv.x.s a4, v28 -; RV32-NEXT: sw a4, 24(sp) -; RV32-NEXT: sw a0, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 5 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 4 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a2, 4(sp) -; RV32-NEXT: sw a3, 0(sp) -; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, zero, 5 +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vmv.s.x v25, a0 +; RV32-NEXT: addi a0, zero, 36 +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vsetivli a0, 8, e16,m1,ta,mu +; RV32-NEXT: vmv.v.i v26, 0 +; RV32-NEXT: vmerge.vim v26, v26, 2, v0 +; RV32-NEXT: vsetivli a0, 8, e16,m1,tu,mu +; RV32-NEXT: vslideup.vi v26, v25, 7 +; RV32-NEXT: addi a0, zero, 164 +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI11_0) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v25 +; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV32-NEXT: vrgatherei16.vv v28, v12, v26, v0.t +; RV32-NEXT: vmv4r.v v8, v28 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vv_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu -; RV64-NEXT: vslidedown.vi v28, v12, 5 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 56(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 3 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 48(sp) -; RV64-NEXT: vslidedown.vi v28, v12, 2 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 40(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 1 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 32(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 5 -; RV64-NEXT: vmv.x.s a2, v28 -; RV64-NEXT: sd a2, 24(sp) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 2 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 8(sp) -; RV64-NEXT: sd a1, 0(sp) +; RV64-NEXT: addi a0, zero, 5 +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vmv.s.x v28, a0 +; RV64-NEXT: addi a0, zero, 36 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vmerge.vim v16, v16, 2, v0 +; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV64-NEXT: vslideup.vi v16, v28, 7 +; RV64-NEXT: addi a0, zero, 164 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: lui a0, %hi(.LCPI11_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI11_0) +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vle64.v v20, (a0) +; RV64-NEXT: vrgather.vv v28, v8, v20 +; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV64-NEXT: vrgather.vv v28, v12, v16, v0.t +; RV64-NEXT: vmv4r.v v8, v28 ; RV64-NEXT: ret %s = shufflevector <8 x i64> %x, <8 x i64> %y, <8 x i32> ret <8 x i64> %s @@ -522,86 +257,52 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_shuffle_xv_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: addi a0, zero, -1 -; RV32-NEXT: sw a0, 60(sp) -; RV32-NEXT: sw a0, 56(sp) -; RV32-NEXT: sw a0, 28(sp) -; RV32-NEXT: sw a0, 24(sp) -; RV32-NEXT: sw a0, 20(sp) -; RV32-NEXT: sw a0, 16(sp) -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: sw a0, 32(sp) -; RV32-NEXT: sw a0, 0(sp) -; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu -; RV32-NEXT: vslidedown.vi v28, v8, 13 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 52(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 12 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 48(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 9 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 44(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 8 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 40(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 1 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 36(sp) -; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: addi a0, zero, 6 +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vmv.s.x v25, a0 +; RV32-NEXT: addi a0, zero, 4 +; RV32-NEXT: vmv.s.x v26, a0 +; RV32-NEXT: vmv.v.i v27, 0 +; RV32-NEXT: vsetivli a0, 6, e16,m1,tu,mu +; RV32-NEXT: vslideup.vi v27, v26, 5 +; RV32-NEXT: vsetivli a0, 7, e16,m1,tu,mu +; RV32-NEXT: vslideup.vi v27, v25, 6 +; RV32-NEXT: addi a0, zero, 113 +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) ; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: vmv.v.i v12, -1 +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vrgatherei16.vv v28, v12, v25 +; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v27, v0.t +; RV32-NEXT: vmv4r.v v8, v28 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_xv_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: addi a0, zero, -1 -; RV64-NEXT: sd a0, 56(sp) -; RV64-NEXT: sd a0, 24(sp) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: sd a0, 8(sp) -; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: sd a0, 32(sp) -; RV64-NEXT: sd a0, 0(sp) -; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu -; RV64-NEXT: vslidedown.vi v28, v8, 6 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 48(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 4 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 40(sp) +; RV64-NEXT: addi a0, zero, 6 +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vmv.s.x v28, a0 +; RV64-NEXT: addi a0, zero, 4 +; RV64-NEXT: vmv.s.x v12, a0 +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli a0, 6, e64,m4,tu,mu +; RV64-NEXT: vslideup.vi v16, v12, 5 +; RV64-NEXT: vsetivli a0, 7, e64,m4,tu,mu +; RV64-NEXT: vslideup.vi v16, v28, 6 +; RV64-NEXT: addi a0, zero, 113 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: vmv.v.i v28, -1 +; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV64-NEXT: vrgather.vv v28, v8, v16, v0.t +; RV64-NEXT: vmv4r.v v8, v28 ; RV64-NEXT: ret %s = shufflevector <8 x i64> , <8 x i64> %x, <8 x i32> ret <8 x i64> %s @@ -610,98 +311,39 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_shuffle_vx_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: sw zero, 60(sp) -; RV32-NEXT: addi a0, zero, 5 -; RV32-NEXT: sw a0, 56(sp) -; RV32-NEXT: sw zero, 28(sp) -; RV32-NEXT: sw a0, 24(sp) -; RV32-NEXT: sw zero, 20(sp) -; RV32-NEXT: sw a0, 16(sp) -; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: sw a0, 0(sp) -; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu -; RV32-NEXT: vslidedown.vi v28, v8, 15 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 52(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 14 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 48(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 3 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 44(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 2 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 40(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 9 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 36(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 8 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 32(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 7 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 6 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vslidedown.vi v28, v8, 1 -; RV32-NEXT: vmv.x.s a0, v28 -; RV32-NEXT: sw a0, 4(sp) -; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, zero, 140 +; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v25 +; RV32-NEXT: lui a0, %hi(.LCPI13_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu +; RV32-NEXT: vmv.v.i v8, 5 +; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV32-NEXT: vrgatherei16.vv v28, v8, v25, v0.t +; RV32-NEXT: vmv4r.v v8, v28 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: addi a0, zero, 5 -; RV64-NEXT: sd a0, 56(sp) -; RV64-NEXT: sd a0, 24(sp) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: sd a0, 0(sp) -; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu -; RV64-NEXT: vslidedown.vi v28, v8, 7 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 48(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 1 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 40(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 4 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 32(sp) -; RV64-NEXT: vslidedown.vi v28, v8, 3 -; RV64-NEXT: vmv.x.s a0, v28 -; RV64-NEXT: sd a0, 8(sp) -; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: addi a0, zero, 115 +; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; RV64-NEXT: vle64.v v12, (a0) +; RV64-NEXT: vmv.v.i v28, 5 +; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu +; RV64-NEXT: vrgather.vv v28, v8, v12, v0.t +; RV64-NEXT: vmv4r.v v8, v28 ; RV64-NEXT: ret %s = shufflevector <8 x i64> %x, <8 x i64> , <8 x i32> ret <8 x i64> %s