diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -280,6 +280,8 @@ VWSUB_W_VL, VWSUBU_W_VL, + VNSRL_VL, + // Vector compare producing a mask. Fourth operand is input mask. Fifth // operand is VL. SETCC_VL, @@ -386,6 +388,8 @@ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; bool isIntDivCheap(EVT VT, AttributeList Attr) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1398,6 +1398,39 @@ return Imm.isZero(); } +// TODO: This is very conservative. +bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // Only support extracting a fixed vector from a fixed vector for now. + if (ResVT.isScalableVector() || SrcVT.isScalableVector()) + return false; + + unsigned ResElts = ResVT.getVectorNumElements(); + unsigned SrcElts = SrcVT.getVectorNumElements(); + + // Conservatively only handle extracting half of a vector. + // TODO: Relax this. + if ((ResElts * 2) != SrcElts) + return false; + + // The smallest type we can slide is i8. + // TODO: We can extract index 0 from a mask vector without a slide. + if (ResVT.getVectorElementType() == MVT::i1) + return false; + + // A slide can support an arbitrary index, but we only treat vslidedown.vi as + // cheap. + if (Index >= 32) + return false; + + // TODO: We can do arbitrary slidedowns, but for now only support extracting + // the lower or upper half of a vector until we have more test coverage. + return Index == 0 || Index == ResElts; +} + bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { return (VT == MVT::f16 && Subtarget.hasStdExtZfh()) || (VT == MVT::f32 && Subtarget.hasStdExtF()) || @@ -2629,6 +2662,86 @@ return Rotation; } +// Lower the following shuffles to vnsrl. +// t34: v8i8 = extract_subvector t11, Constant:i64<0> +// t33: v8i8 = extract_subvector t11, Constant:i64<8> +// a) t35: v8i8 = vector_shuffle<0,2,4,6,8,10,12,14> t34, t33 +// b) t35: v8i8 = vector_shuffle<1,3,5,7,9,11,13,15> t34, t33 +static SDValue lowerVECTOR_SHUFFLEAsVNSRL(const SDLoc &DL, MVT VT, + MVT ContainerVT, SDValue V1, + SDValue V2, SDValue TrueMask, + SDValue VL, ArrayRef<int> Mask, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + // Need to be able to widen the vector. + if (VT.getScalarSizeInBits() >= Subtarget.getELEN()) + return SDValue(); + + // Both inputs must be extracts. + if (V1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + V2.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + // Both extracts must come from the same source. + SDValue Src = V1.getOperand(0); + if (Src != V2.getOperand(0)) + return SDValue(); + + // Src needs to have twice the number of elements. + if (Src.getValueType().getVectorNumElements() != (Mask.size() * 2)) + return SDValue(); + + // The extracts must extract the two halves of the source. + if (V1.getConstantOperandVal(1) != 0 || + V2.getConstantOperandVal(1) != Mask.size()) + return SDValue(); + + // First index must be the first even or odd element from V1.
+ if (Mask[0] != 0 && Mask[0] != 1) + return SDValue(); + + // The others must increase by 2 each time. + // TODO: Support undef elements? + for (unsigned i = 1; i != Mask.size(); ++i) + if (Mask[i] != Mask[i - 1] + 2) + return SDValue(); + + // Convert the source using a container type with twice the elements. Since + // source VT is legal and twice this VT, we know VT isn't LMUL=8 so it is + // safe to double. + MVT DoubleContainerVT = + MVT::getVectorVT(ContainerVT.getVectorElementType(), + ContainerVT.getVectorElementCount() * 2); + Src = convertToScalableVector(DoubleContainerVT, Src, DAG, Subtarget); + + // Convert the vector to a wider integer type with the original element + // count. This also converts FP to int. + unsigned EltBits = ContainerVT.getScalarSizeInBits(); + MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2); + MVT WideIntContainerVT = + MVT::getVectorVT(WideIntEltVT, ContainerVT.getVectorElementCount()); + Src = DAG.getBitcast(WideIntContainerVT, Src); + + // Convert to the integer version of the container type. + MVT IntEltVT = MVT::getIntegerVT(EltBits); + MVT IntContainerVT = + MVT::getVectorVT(IntEltVT, ContainerVT.getVectorElementCount()); + + // If we want even elements, then the shift amount is 0. Otherwise, shift by + // the original element size. + unsigned Shift = Mask[0] == 0 ? 0 : EltBits; + SDValue SplatShift = DAG.getNode( + RISCVISD::VMV_V_X_VL, DL, IntContainerVT, DAG.getUNDEF(ContainerVT), + DAG.getConstant(Shift, DL, Subtarget.getXLenVT()), VL); + SDValue Res = + DAG.getNode(RISCVISD::VNSRL_VL, DL, IntContainerVT, Src, SplatShift, + DAG.getUNDEF(IntContainerVT), TrueMask, VL); + // Cast back to FP if needed. + Res = DAG.getBitcast(ContainerVT, Res); + + return convertFromScalableVector(VT, Res, DAG, Subtarget); +} + static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue V1 = Op.getOperand(0); @@ -2760,6 +2873,10 @@ return convertFromScalableVector(VT, Res, DAG, Subtarget); } + if (SDValue V = lowerVECTOR_SHUFFLEAsVNSRL( + DL, VT, ContainerVT, V1, V2, TrueMask, VL, Mask, Subtarget, DAG)) + return V; + // Detect an interleave shuffle and lower to // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1)) bool SwapSources; @@ -12259,6 +12376,7 @@ NODE_NAME_CASE(VWADDU_W_VL) NODE_NAME_CASE(VWSUB_W_VL) NODE_NAME_CASE(VWSUBU_W_VL) + NODE_NAME_CASE(VNSRL_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VP_MERGE_VL) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -256,6 +256,16 @@ def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def SDT_RISCVVNBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameNumEltsAs<0, 4>, + SDTCVecEltisVT<4, i1>, + SDTCisVT<5, XLenVT>]>; +def riscv_vnsrl_vl : SDNode<"RISCVISD::VNSRL_VL", SDT_RISCVVNBinOp_VL>; + def SDT_RISCVVWBinOpW_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameNumEltsAs<1, 2>, @@ -446,6 +456,24 @@ } } +multiclass VPatBinaryNVL_WV_WX_WI { + foreach VtiToWti = AllWidenableIntVectors in { + defvar vti = VtiToWti.Vti; + defvar wti = VtiToWti.Wti; + defm : VPatBinaryVL_V; + defm : 
VPatBinaryVL_XI; + defm : VPatBinaryVL_XI(SplatPat#_#uimm5), + uimm5>; + } +} + multiclass VPatBinaryVL_VF; defm : VPatNarrowShiftSplatExt_WX; +defm : VPatBinaryNVL_WV_WX_WI; + foreach vtiTowti = AllWidenableIntVectors in { defvar vti = vtiTowti.Vti; defvar wti = vtiTowti.Wti; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -53,22 +53,16 @@ ; ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization: ; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: addi sp, sp, -16 -; LMULMAX2-NEXT: .cfi_def_cfa_offset 16 -; LMULMAX2-NEXT: addi a0, sp, 8 -; LMULMAX2-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; LMULMAX2-NEXT: vse32.v v10, (a0) -; LMULMAX2-NEXT: mv a0, sp -; LMULMAX2-NEXT: vse32.v v8, (a0) -; LMULMAX2-NEXT: vslidedown.vi v10, v10, 7 -; LMULMAX2-NEXT: addi a1, sp, 12 -; LMULMAX2-NEXT: vse32.v v10, (a1) -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 7 -; LMULMAX2-NEXT: addi a1, sp, 4 -; LMULMAX2-NEXT: vse32.v v8, (a1) -; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-NEXT: vle32.v v8, (a0) -; LMULMAX2-NEXT: addi sp, sp, 16 +; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-NEXT: vid.v v12 +; LMULMAX2-NEXT: li a0, 7 +; LMULMAX2-NEXT: vmul.vx v14, v12, a0 +; LMULMAX2-NEXT: vrgather.vv v12, v8, v14 +; LMULMAX2-NEXT: li a0, 12 +; LMULMAX2-NEXT: vmv.s.x v0, a0 +; LMULMAX2-NEXT: vadd.vi v8, v14, -14 +; LMULMAX2-NEXT: vrgather.vv v12, v10, v8, v0.t +; LMULMAX2-NEXT: vmv1r.v v8, v12 ; LMULMAX2-NEXT: ret %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15> ret <4 x float> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -0,0 +1,367 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+zvl256b \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,V +; RUN: llc < %s -mtriple=riscv64 -mattr=+f,+zve32f,+zfh,+experimental-zvfh,+zvl256b \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,ZVE32F + +define void @vnsrl_0_i8(ptr %in, ptr %out) { +; CHECK-LABEL: vnsrl_0_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: ret +entry: + %0 = load <16 x i8>, ptr %in, align 1 + %shuffle.i5 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @vnsrl_8_i8(ptr %in, ptr %out) { +; CHECK-LABEL: vnsrl_8_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; CHECK-NEXT: vnsrl.wi v8, v8, 8 +; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: ret +entry: + %0 = load <16 x i8>, ptr %in, align 1 + %shuffle.i5 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @vnsrl_0_i16(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_i16: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V-NEXT: vle16.v v8, (a0) +;
V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vse16.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i16: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x i16>, ptr %in, align 2 + %shuffle.i5 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x i16> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_16_i16(ptr %in, ptr %out) { +; V-LABEL: vnsrl_16_i16: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V-NEXT: vle16.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 16 +; V-NEXT: vse16.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_16_i16: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 +; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x i16>, ptr %in, align 2 + %shuffle.i5 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + store <4 x i16> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_0_half(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_half: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V-NEXT: vle16.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vse16.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_half: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x half>, ptr %in, align 2 + %shuffle.i5 = shufflevector <8 x half> %0, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x half> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_16_half(ptr %in, ptr %out) { +; V-LABEL: vnsrl_16_half: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V-NEXT: vle16.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 16 +; V-NEXT: vse16.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_16_half: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 +; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x half>, ptr %in, align 2 + %shuffle.i5 = shufflevector <8 x half> %0, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + store <4 x half> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_0_i32(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_i32: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vse32.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i32: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: li a0, 2 +; ZVE32F-NEXT: vmv.s.x v0, a0 +; ZVE32F-NEXT: vrgather.vi v10, v8, 0 +; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t +;
ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i32>, ptr %in, align 4 + %shuffle.i5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + store <2 x i32> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_32_i32(ptr %in, ptr %out) { +; V-LABEL: vnsrl_32_i32: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: li a0, 32 +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vnsrl.wx v8, v8, a0 +; V-NEXT: vse32.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_32_i32: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: li a0, 2 +; ZVE32F-NEXT: vmv.s.x v0, a0 +; ZVE32F-NEXT: vrgather.vi v10, v8, 1 +; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t +; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i32>, ptr %in, align 4 + %shuffle.i5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + store <2 x i32> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_0_float(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_float: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vse32.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_float: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: li a0, 2 +; ZVE32F-NEXT: vmv.s.x v0, a0 +; ZVE32F-NEXT: vrgather.vi v10, v8, 0 +; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t +; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x float>, ptr %in, align 4 + %shuffle.i5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2> + store <2 x float> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_32_float(ptr %in, ptr %out) { +; V-LABEL: vnsrl_32_float: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: li a0, 32 +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vnsrl.wx v8, v8, a0 +; V-NEXT: vse32.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_32_float: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: li a0, 2 +; ZVE32F-NEXT: vmv.s.x v0, a0 +; ZVE32F-NEXT: vrgather.vi v10, v8, 1 +; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t +; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x float>, ptr %in, align 4 + %shuffle.i5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3> + store <2 x float> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_0_i64(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_i64: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: li a0, 2 +; V-NEXT: vmv.s.x v0, a0 +; V-NEXT: vrgather.vi v10, v8, 0 +; V-NEXT: vrgather.vi v10, v9, 0, v0.t +; V-NEXT: vse64.v v10, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i64: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a2, 16(a0) +; ZVE32F-NEXT: ld a0,
0(a0) +; ZVE32F-NEXT: sd a2, 8(a1) +; ZVE32F-NEXT: sd a0, 0(a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i64>, ptr %in, align 8 + %shuffle.i5 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + store <2 x i64> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_64_i64(ptr %in, ptr %out) { +; V-LABEL: vnsrl_64_i64: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: li a0, 2 +; V-NEXT: vmv.s.x v0, a0 +; V-NEXT: vrgather.vi v10, v8, 1 +; V-NEXT: vrgather.vi v10, v9, 1, v0.t +; V-NEXT: vse64.v v10, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_64_i64: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a2, 24(a0) +; ZVE32F-NEXT: ld a0, 8(a0) +; ZVE32F-NEXT: sd a2, 8(a1) +; ZVE32F-NEXT: sd a0, 0(a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i64>, ptr %in, align 8 + %shuffle.i5 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + store <2 x i64> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_0_double(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_double: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: li a0, 2 +; V-NEXT: vmv.s.x v0, a0 +; V-NEXT: vrgather.vi v10, v8, 0 +; V-NEXT: vrgather.vi v10, v9, 0, v0.t +; V-NEXT: vse64.v v10, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_double: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a2, 16(a0) +; ZVE32F-NEXT: ld a0, 0(a0) +; ZVE32F-NEXT: sd a2, 8(a1) +; ZVE32F-NEXT: sd a0, 0(a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x double>, ptr %in, align 8 + %shuffle.i5 = shufflevector <4 x double> %0, <4 x double> poison, <2 x i32> <i32 0, i32 2> + store <2 x double> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_64_double(ptr %in, ptr %out) { +; V-LABEL: vnsrl_64_double: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: li a0, 2 +; V-NEXT: vmv.s.x v0, a0 +; V-NEXT: vrgather.vi v10, v8, 1 +; V-NEXT: vrgather.vi v10, v9, 1, v0.t +; V-NEXT: vse64.v v10, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_64_double: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a2, 24(a0) +; ZVE32F-NEXT: ld a0, 8(a0) +; ZVE32F-NEXT: sd a2, 8(a1) +; ZVE32F-NEXT: sd a0, 0(a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x double>, ptr %in, align 8 + %shuffle.i5 = shufflevector <4 x double> %0, <4 x double> poison, <2 x i32> <i32 1, i32 3> + store <2 x double> %shuffle.i5, ptr %out, align 8 + ret void +}
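
Note on the pattern being matched: the shuffles handled by lowerVECTOR_SHUFFLEAsVNSRL are even/odd deinterleaves. Viewing each pair of adjacent SEW-bit elements as one 2*SEW-bit element, a narrowing shift right by 0 keeps the low (even) elements and a shift by SEW keeps the high (odd) elements, which is what the vnsrl.wi 0 / vnsrl.wi 8 pairs in the e8 tests above show. Below is a minimal C sketch of source code that, once vectorized, produces such shuffles; the function and variable names are illustrative only and are not part of this patch.

#include <stdint.h>

// Split an interleaved byte stream into its even and odd lanes. When
// vectorized, the even[i] selects become the <0,2,4,...> shuffle and the
// odd[i] selects the <1,3,5,...> shuffle matched by the new lowering.
void deinterleave_i8(const uint8_t *in, uint8_t *even, uint8_t *odd, int n) {
  for (int i = 0; i < n; ++i) {
    even[i] = in[2 * i];     // low byte of each 16-bit pair  -> vnsrl.wi vd, vs2, 0
    odd[i]  = in[2 * i + 1]; // high byte of each 16-bit pair -> vnsrl.wi vd, vs2, 8
  }
}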