diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2901,13 +2901,20 @@
 }
 
-static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
-                                const RISCVSubtarget &Subtarget) {
+/// Is this shuffle interleaving contiguous elements from one vector into the
+/// even elements and contiguous elements from another vector into the odd
+/// elements. \p EvenSrc will contain the index of the element that should be
+/// in the first even element. \p OddSrc will contain the index of the element
+/// that should be in the first odd element. These can be the first element in
+/// a source or the element half way through the source.
+static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, int &EvenSrc,
+                                int &OddSrc, const RISCVSubtarget &Subtarget) {
   // We need to be able to widen elements to the next larger integer type.
   if (VT.getScalarSizeInBits() >= Subtarget.getELEN())
     return false;
 
   int Size = Mask.size();
+  int HalfSize = Size / 2;
   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
 
   int Srcs[] = {-1, -1};
@@ -2919,8 +2926,8 @@
     // Is this an even or odd element.
     int Pol = i % 2;
 
-    // Ensure we consistently use the same source for this element polarity.
-    int Src = Mask[i] / Size;
+    // Ensure we consistently use the same half source for this polarity.
+    int Src = alignDown(Mask[i], HalfSize);
     if (Srcs[Pol] < 0)
       Srcs[Pol] = Src;
     if (Srcs[Pol] != Src)
@@ -2928,17 +2935,24 @@
     // Make sure the element within the source is appropriate for this element
     // in the destination.
-    int Elt = Mask[i] % Size;
+    int Elt = Mask[i] % HalfSize;
     if (Elt != i / 2)
       return false;
   }
 
-  // We need to find a source for each polarity and they can't be the same.
-  if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
+  // One source should be low half of first vector.
+  if (Srcs[0] != 0 && Srcs[1] != 0)
+    return false;
+
+  // Other source should be the upper half of the first source or the lower
+  // half of the second source.
+  // FIXME: This is only a heuristic to avoid regressions.
+  if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize &&
+      Srcs[1] != Size)
     return false;
 
-  // Swap the sources if the second source was in the even polarity.
-  SwapSources = Srcs[0] > Srcs[1];
+  EvenSrc = Srcs[0];
+  OddSrc = Srcs[1];
 
   return true;
 }
@@ -3338,18 +3352,22 @@
 
   // Detect an interleave shuffle and lower to
   // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
-  bool SwapSources;
-  if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) {
-    // Swap sources if needed.
-    if (SwapSources)
-      std::swap(V1, V2);
-
-    // Extract the lower half of the vectors.
+  int EvenSrc, OddSrc;
+  if (isInterleaveShuffle(Mask, VT, EvenSrc, OddSrc, Subtarget)) {
+    // Extract the halves of the vectors.
     MVT HalfVT = VT.getHalfNumVectorElementsVT();
-    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
-                     DAG.getConstant(0, DL, XLenVT));
-    V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2,
-                     DAG.getConstant(0, DL, XLenVT));
+
+    int Size = Mask.size();
+    SDValue EvenV, OddV;
+    assert(EvenSrc >= 0 && "Undef source?");
+    EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
+    EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
+                        DAG.getConstant(EvenSrc % Size, DL, XLenVT));
+
+    assert(OddSrc >= 0 && "Undef source?");
+    OddV = (OddSrc / Size) == 0 ? V1 : V2;
+    OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
+                       DAG.getConstant(OddSrc % Size, DL, XLenVT));
 
     // Double the element width and halve the number of elements in an int type.
     unsigned EltBits = VT.getScalarSizeInBits();
@@ -3365,36 +3383,37 @@
     // larger type.
     MVT HalfContainerVT = MVT::getVectorVT(
         VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
-    V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget);
-    V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget);
+    EvenV = convertToScalableVector(HalfContainerVT, EvenV, DAG, Subtarget);
+    OddV = convertToScalableVector(HalfContainerVT, OddV, DAG, Subtarget);
 
     // Cast sources to integer.
     MVT IntEltVT = MVT::getIntegerVT(EltBits);
     MVT IntHalfVT =
         MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
-    V1 = DAG.getBitcast(IntHalfVT, V1);
-    V2 = DAG.getBitcast(IntHalfVT, V2);
+    EvenV = DAG.getBitcast(IntHalfVT, EvenV);
+    OddV = DAG.getBitcast(IntHalfVT, OddV);
 
-    // Freeze V2 since we use it twice and we need to be sure that the add and
+    // Freeze OddV since we use it twice and we need to be sure that the add and
     // multiply see the same value.
-    V2 = DAG.getFreeze(V2);
+    OddV = DAG.getFreeze(OddV);
 
     // Recreate TrueMask using the widened type's element count.
    TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG);
 
-    // Widen V1 and V2 with 0s and add one copy of V2 to V1.
+    // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV.
     SDValue Add =
-        DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1, V2,
+        DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, EvenV, OddV,
                     DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
-    // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
+    // Create 2^eltbits - 1 copies of OddV by multiplying by the largest
+    // integer.
     SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
                                      DAG.getUNDEF(IntHalfVT),
                                      DAG.getAllOnesConstant(DL, XLenVT), VL);
     SDValue WidenMul =
-        DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, V2, Multiplier,
-                    DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
+        DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, OddV,
+                    Multiplier, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
     // Add the new copies to our previous addition giving us 2^eltbits copies of
-    // V2. This is equivalent to shifting V2 left by eltbits. This should
+    // OddV. This is equivalent to shifting OddV left by eltbits. This should
     // combine with the vwmulu.vv above to form vwmaccu.vv.
     Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
                       DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
@@ -3555,10 +3574,9 @@
   MVT SVT = VT.getSimpleVT();
 
-  bool SwapSources;
-  int LoSrc, HiSrc;
-  return (isElementRotate(LoSrc, HiSrc, M) > 0) ||
-         isInterleaveShuffle(M, SVT, SwapSources, Subtarget);
+  int Dummy1, Dummy2;
+  return (isElementRotate(Dummy1, Dummy2, M) > 0) ||
+         isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget);
 }
 
 // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -370,3 +370,152 @@
   %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32>
   ret <64 x float> %a
 }
+
+define <4 x half> @unary_interleave_v4f16(<4 x half> %x) {
+; V128-LABEL: unary_interleave_v4f16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4f16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32>
+  ret <4 x half> %a
+}
+
+define <4 x float> @unary_interleave_v4f32(<4 x float> %x) {
+; V128-LABEL: unary_interleave_v4f32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4f32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  ret <4 x float> %a
+}
+
+; FIXME: Is there better codegen we can do here?
+define <4 x double> @unary_interleave_v4f64(<4 x double> %x) {
+; RV32-V128-LABEL: unary_interleave_v4f64:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-V128-NEXT:    vle16.v v12, (a0)
+; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV32-V128-NEXT:    vmv.v.v v8, v10
+; RV32-V128-NEXT:    ret
+;
+; RV64-V128-LABEL: unary_interleave_v4f64:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT:    vle64.v v12, (a0)
+; RV64-V128-NEXT:    vrgather.vv v10, v8, v12
+; RV64-V128-NEXT:    vmv.v.v v8, v10
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4f64:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV32-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV32-V512-NEXT:    vle16.v v10, (a0)
+; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv.v.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4f64:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV64-V512-NEXT:    vle64.v v10, (a0)
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv.v.v v8, v9
+; RV64-V512-NEXT:    ret
+  %a = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32>
+  ret <4 x double> %a
+}
+
+define <8 x half> @unary_interleave_v8f16(<8 x half> %x) {
+; V128-LABEL: unary_interleave_v8f16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e16, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8f16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x half> %x, <8 x half> poison, <8 x i32>
+  ret <8 x half> %a
+}
+
+define <8 x float> @unary_interleave_v8f32(<8 x float> %x) {
+; V128-LABEL: unary_interleave_v8f32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; V128-NEXT:    vslidedown.vi v12, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; V128-NEXT:    vwaddu.vv v10, v12, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v10, a0, v8
+; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8f32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v10, v8
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v8
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32>
+  ret <8 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -476,3 +476,204 @@
   %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32>
   ret <64 x i32> %a
 }
+
+define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
+; V128-LABEL: unary_interleave_v4i8:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i8:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32>
+  ret <4 x i8> %a
+}
+
+define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
+; V128-LABEL: unary_interleave_v4i16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32>
+  ret <4 x i16> %a
+}
+
+define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
+; V128-LABEL: unary_interleave_v4i32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %a
+}
+
+; FIXME: Is there better codegen we can do here?
+define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
+; RV32-V128-LABEL: unary_interleave_v4i64:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-V128-NEXT:    vle16.v v12, (a0)
+; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV32-V128-NEXT:    vmv.v.v v8, v10
+; RV32-V128-NEXT:    ret
+;
+; RV64-V128-LABEL: unary_interleave_v4i64:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT:    vle64.v v12, (a0)
+; RV64-V128-NEXT:    vrgather.vv v10, v8, v12
+; RV64-V128-NEXT:    vmv.v.v v8, v10
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4i64:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV32-V512-NEXT:    vle16.v v10, (a0)
+; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv.v.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4i64:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV64-V512-NEXT:    vle64.v v10, (a0)
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv.v.v v8, v9
+; RV64-V512-NEXT:    ret
+  %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32>
+  ret <4 x i64> %a
+}
+
+define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
+; V128-LABEL: unary_interleave_v8i8:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i8:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e8, mf8, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32>
+  ret <8 x i8> %a
+}
+
+define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
+; V128-LABEL: unary_interleave_v8i16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e16, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v10, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v8
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v10, v8
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v8
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32>
+  ret <8 x i16> %a
+}
+
+define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
+; V128-LABEL: unary_interleave_v8i32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; V128-NEXT:    vslidedown.vi v12, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; V128-NEXT:    vwaddu.vv v10, v8, v12
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v10, a0, v12
+; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32>
+  ret <8 x i32> %a
+}
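
For reference, the even/odd-half mask check that the new isInterleaveShuffle performs can be restated as a small standalone sketch. This is illustrative only: it uses plain C++ containers instead of LLVM types, and checkInterleaveMask and its signature are hypothetical names, not part of this patch or of LLVM.

#include <cassert>
#include <cstdio>
#include <vector>

// Even result elements must come from one contiguous half-vector and odd
// result elements from another, and one of those halves must be the low half
// of the first source. On success, EvenSrc/OddSrc hold the starting element
// index of each half (0, HalfSize, or Size for the second source's low half).
static bool checkInterleaveMask(const std::vector<int> &Mask, int &EvenSrc,
                                int &OddSrc) {
  int Size = static_cast<int>(Mask.size());
  int HalfSize = Size / 2;
  int Srcs[] = {-1, -1};
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes place no constraint in this sketch.
    int Pol = i % 2;
    // Round down to the start of the half-vector this element comes from.
    int Src = (Mask[i] / HalfSize) * HalfSize;
    if (Srcs[Pol] < 0)
      Srcs[Pol] = Src;
    if (Srcs[Pol] != Src)
      return false;
    // Within its half, the element must sit at position i/2.
    if (Mask[i] % HalfSize != i / 2)
      return false;
  }
  // One source must be the low half of the first vector...
  if (Srcs[0] != 0 && Srcs[1] != 0)
    return false;
  // ...and the other the high half of the first vector or the low half of the
  // second vector (the heuristic from the patch).
  if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize &&
      Srcs[1] != Size)
    return false;
  EvenSrc = Srcs[0];
  OddSrc = Srcs[1];
  return true;
}

int main() {
  int EvenSrc, OddSrc;
  // Unary interleave of <4 x T>: low half {0,1} into even lanes, high half
  // {2,3} into odd lanes, i.e. mask <0, 2, 1, 3>.
  std::vector<int> Mask = {0, 2, 1, 3};
  bool OK = checkInterleaveMask(Mask, EvenSrc, OddSrc);
  assert(OK && EvenSrc == 0 && OddSrc == 2);
  std::printf("EvenSrc=%d OddSrc=%d\n", EvenSrc, OddSrc);
  return 0;
}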
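The vwaddu.vv/vwmaccu.vx lowering rests on the identity that an interleaved even/odd pair, viewed as one element of twice the width, equals zext(Even) + zext(Odd) * 2^eltbits: the widening add contributes one copy of Odd and the widening multiply-accumulate by (2^eltbits - 1) contributes the remaining copies. A scalar sketch of that arithmetic, assuming 8-bit elements purely as an example:

#include <cassert>
#include <cstdint>

int main() {
  // One even/odd element pair, 8-bit elements widened to 16 bits.
  uint8_t Even = 0x12, Odd = 0x34;

  // The interleaved pair viewed as a single 16-bit element (Even in the low
  // byte, Odd in the high byte).
  uint16_t Expected = static_cast<uint16_t>(Even) |
                      static_cast<uint16_t>(static_cast<uint16_t>(Odd) << 8);

  // vwaddu: widen both operands and add one copy of Odd.
  unsigned Add = static_cast<unsigned>(Even) + static_cast<unsigned>(Odd);
  // vwmaccu by (2^8 - 1): add the remaining 255 copies of Odd, for 256 total,
  // which is the same as shifting Odd left by the element width.
  unsigned Result = Add + static_cast<unsigned>(Odd) * 255u;

  assert(Result == Expected);
  return 0;
}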