diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3215,6 +3215,78 @@
                      DAG.getConstant(0, DL, XLenVT));
 }
 
+// Given two input vectors of <[vscale x ]n x ty>, use vwaddu.vv and vwmaccu.vx
+// to create an interleaved vector of <[vscale x] n*2 x ty>.
+// This requires that the size of ty is less than the subtarget's maximum ELEN.
+static SDValue getWideningInterleave(SDValue EvenV, SDValue OddV, SDLoc &DL,
+                                     SelectionDAG &DAG,
+                                     const RISCVSubtarget &Subtarget) {
+  MVT VecVT = EvenV.getSimpleValueType();
+  MVT VecContainerVT = VecVT; // <vscale x n x ty>
+  // Convert fixed vectors to scalable if needed
+  if (VecContainerVT.isFixedLengthVector()) {
+    VecContainerVT = getContainerForFixedLengthVector(DAG, VecVT, Subtarget);
+    EvenV = convertToScalableVector(VecContainerVT, EvenV, DAG, Subtarget);
+    OddV = convertToScalableVector(VecContainerVT, OddV, DAG, Subtarget);
+  }
+
+  assert(VecVT.getScalarSizeInBits() < Subtarget.getELEN());
+
+  // We're working with a vector of the same size as the resulting
+  // interleaved vector, but with half the number of elements and
+  // twice the SEW (hence the restriction on not using the maximum
+  // ELEN)
+  MVT WideVT =
+      MVT::getVectorVT(MVT::getIntegerVT(VecVT.getScalarSizeInBits() * 2),
+                       VecVT.getVectorElementCount());
+  MVT WideContainerVT = WideVT; // <vscale x n x ty*2>
+  if (WideContainerVT.isFixedLengthVector())
+    WideContainerVT = getContainerForFixedLengthVector(DAG, WideVT, Subtarget);
+
+  // Bitcast the input vectors to integers in case they are FP
+  VecContainerVT = VecContainerVT.changeTypeToInteger();
+  EvenV = DAG.getBitcast(VecContainerVT, EvenV);
+  OddV = DAG.getBitcast(VecContainerVT, OddV);
+
+  auto [Mask, VL] = getDefaultVLOps(VecVT, VecContainerVT, DL, DAG, Subtarget);
+  SDValue Passthru = DAG.getUNDEF(WideContainerVT);
+
+  // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV with
+  // vwaddu.vv
+  SDValue Interleaved = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideContainerVT,
+                                    EvenV, OddV, Passthru, Mask, VL);
+
+  // Then multiply OddV by 2^(VecVT.getScalarSizeInBits()) - 1, i.e. all ones
+  SDValue AllOnesVec = DAG.getSplatVector(
+      VecContainerVT, DL, DAG.getAllOnesConstant(DL, Subtarget.getXLenVT()));
+  SDValue OddsMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideContainerVT, OddV,
+                                AllOnesVec, Passthru, Mask, VL);
+
+  // Add the two together so we get
+  //   (OddV * 0xff...ff) + (OddV + EvenV)
+  // = (OddV * 0x100...00) + EvenV
+  // = (OddV << VecVT.getScalarSizeInBits()) + EvenV
+  // Note the ADD_VL and VWMULU_VL should get selected as vwmaccu.vx
+  Interleaved = DAG.getNode(RISCVISD::ADD_VL, DL, WideContainerVT, Interleaved,
+                            OddsMul, Passthru, Mask, VL);
+
+  // Bitcast from <vscale x n x ty*2> to <vscale x 2*n x ty>
+  MVT ResultContainerVT = MVT::getVectorVT(
+      VecVT.getVectorElementType(), // Make sure to use original type
+      VecContainerVT.getVectorElementCount().multiplyCoefficientBy(2));
+  Interleaved = DAG.getBitcast(ResultContainerVT, Interleaved);
+
+  // Convert back to a fixed vector if needed
+  MVT ResultVT =
+      MVT::getVectorVT(VecVT.getVectorElementType(),
+                       VecVT.getVectorElementCount().multiplyCoefficientBy(2));
+  if (ResultVT.isFixedLengthVector())
+    Interleaved =
+        convertFromScalableVector(ResultVT, Interleaved, DAG, Subtarget);
+
+  return Interleaved;
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -3373,62 +3445,7 @@
     OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
                        DAG.getConstant(OddSrc % Size, DL, XLenVT));
-    // Double the element width and halve the number of elements in an int type.
-    unsigned EltBits = VT.getScalarSizeInBits();
-    MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2);
-    MVT WideIntVT =
-        MVT::getVectorVT(WideIntEltVT, VT.getVectorNumElements() / 2);
-    // Convert this to a scalable vector. We need to base this on the
-    // destination size to ensure there's always a type with a smaller LMUL.
-    MVT WideIntContainerVT =
-        getContainerForFixedLengthVector(DAG, WideIntVT, Subtarget);
-
-    // Convert sources to scalable vectors with the same element count as the
-    // larger type.
-    MVT HalfContainerVT = MVT::getVectorVT(
-        VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
-    EvenV = convertToScalableVector(HalfContainerVT, EvenV, DAG, Subtarget);
-    OddV = convertToScalableVector(HalfContainerVT, OddV, DAG, Subtarget);
-
-    // Cast sources to integer.
-    MVT IntEltVT = MVT::getIntegerVT(EltBits);
-    MVT IntHalfVT =
-        MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
-    EvenV = DAG.getBitcast(IntHalfVT, EvenV);
-    OddV = DAG.getBitcast(IntHalfVT, OddV);
-
-    // Freeze OddV since we use it twice and we need to be sure that the add and
-    // multiply see the same value.
-    OddV = DAG.getFreeze(OddV);
-
-    // Recreate TrueMask using the widened type's element count.
-    TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG);
-
-    // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV.
-    SDValue Add =
-        DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, EvenV, OddV,
-                    DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
-    // Create 2^eltbits - 1 copies of OddV by multiplying by the largest
-    // integer.
-    SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
-                                     DAG.getUNDEF(IntHalfVT),
-                                     DAG.getAllOnesConstant(DL, XLenVT), VL);
-    SDValue WidenMul =
-        DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, OddV,
-                    Multiplier, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
-    // Add the new copies to our previous addition giving us 2^eltbits copies of
-    // OddV. This is equivalent to shifting OddV left by eltbits. This should
-    // combine with the vwmulu.vv above to form vwmaccu.vv.
-    Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
-                      DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
-    // Cast back to ContainerVT. We need to re-create a new ContainerVT in case
-    // WideIntContainerVT is a larger fractional LMUL than implied by the fixed
-    // vector VT.
- ContainerVT = - MVT::getVectorVT(VT.getVectorElementType(), - WideIntContainerVT.getVectorElementCount() * 2); - Add = DAG.getBitcast(ContainerVT, Add); - return convertFromScalableVector(VT, Add, DAG, Subtarget); + return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } // Detect shuffles which can be re-expressed as vector selects; these are diff --git a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll --- a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll +++ b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll @@ -7,7 +7,7 @@ define dso_local <16 x i16> @interleave(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: interleave: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v9 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -9,7 +9,7 @@ define <4 x half> @interleave_v2f16(<2 x half> %x, <2 x half> %y) { ; CHECK-LABEL: interleave_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v9 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 @@ -23,7 +23,7 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: interleave_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v9, v8 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v8 @@ -97,7 +97,7 @@ define <8 x half> @interleave_v4f16(<4 x half> %x, <4 x half> %y) { ; V128-LABEL: interleave_v4f16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -106,7 +106,7 @@ ; ; V512-LABEL: interleave_v4f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -119,7 +119,7 @@ define <8 x float> @interleave_v4f32(<4 x float> %x, <4 x float> %y) { ; V128-LABEL: interleave_v4f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -128,7 +128,7 @@ ; ; V512-LABEL: interleave_v4f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -142,7 +142,7 @@ define <16 x half> @interleave_v8f16(<8 x half> %x, <8 x half> %y) { ; V128-LABEL: interleave_v8f16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; V128-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v9, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v8 @@ -151,7 +151,7 @@ ; ; V512-LABEL: interleave_v8f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, 
ma ; V512-NEXT: vwaddu.vv v10, v9, v8 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v8 @@ -164,7 +164,7 @@ define <16 x float> @interleave_v8f32(<8 x float> %x, <8 x float> %y) { ; V128-LABEL: interleave_v8f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m2, ta, ma +; V128-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; V128-NEXT: vwaddu.vv v12, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v12, a0, v10 @@ -173,7 +173,7 @@ ; ; V512-LABEL: interleave_v8f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -186,8 +186,7 @@ define <32 x half> @interleave_v16f16(<16 x half> %x, <16 x half> %y) { ; V128-LABEL: interleave_v16f16: ; V128: # %bb.0: -; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; V128-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; V128-NEXT: vwaddu.vv v12, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v12, a0, v10 @@ -196,8 +195,7 @@ ; ; V512-LABEL: interleave_v16f16: ; V512: # %bb.0: -; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -210,8 +208,7 @@ define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) { ; V128-LABEL: interleave_v16f32: ; V128: # %bb.0: -; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v16, v8, v12 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v16, a0, v12 @@ -220,8 +217,7 @@ ; ; V512-LABEL: interleave_v16f32: ; V512: # %bb.0: -; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; V512-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -234,7 +230,7 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { ; V128-LABEL: interleave_v32f16: ; V128: # %bb.0: -; V128-NEXT: li a0, 64 +; V128-NEXT: li a0, 32 ; V128-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; V128-NEXT: vwaddu.vv v16, v8, v12 ; V128-NEXT: li a0, -1 @@ -244,7 +240,7 @@ ; ; V512-LABEL: interleave_v32f16: ; V512: # %bb.0: -; V512-NEXT: li a0, 64 +; V512-NEXT: li a0, 32 ; V512-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 @@ -293,7 +289,7 @@ ; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t ; RV32-V128-NEXT: vmv.v.v v24, v8 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 @@ -344,7 +340,7 @@ ; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t ; RV64-V128-NEXT: vmv.v.v v24, v8 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 @@ -360,7 +356,7 @@ ; ; V512-LABEL: interleave_v32f32: ; V512: # %bb.0: -; V512-NEXT: li a0, 64 +; V512-NEXT: li a0, 32 ; V512-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; V512-NEXT: vwaddu.vv v12, v8, v10 ; V512-NEXT: li 
a0, -1 @@ -376,7 +372,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -387,7 +383,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 2 -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -402,7 +397,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -413,7 +408,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 2 -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -473,7 +467,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -484,7 +478,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 4 -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -499,7 +492,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; V128-NEXT: vslidedown.vi v12, v8, 4 -; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v12, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v8 @@ -510,7 +503,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 4 -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v9, v10, v8 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -9,7 +9,7 @@ define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: interleave_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v9 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 @@ -22,7 +22,7 @@ define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: interleave_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v9 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 @@ -36,7 +36,7 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: interleave_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v9, v8 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: 
vwmaccu.vx v10, a0, v8 @@ -110,7 +110,7 @@ define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) { ; V128-LABEL: interleave_v4i8: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; V128-NEXT: vwaddu.vv v10, v9, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v8 @@ -119,7 +119,7 @@ ; ; V512-LABEL: interleave_v4i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma ; V512-NEXT: vwaddu.vv v10, v9, v8 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v8 @@ -133,7 +133,7 @@ define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) { ; V128-LABEL: interleave_v4i16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -142,7 +142,7 @@ ; ; V512-LABEL: interleave_v4i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -155,7 +155,7 @@ define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -164,7 +164,7 @@ ; ; V512-LABEL: interleave_v4i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -177,7 +177,7 @@ define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) { ; V128-LABEL: interleave_v8i8: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e8, mf2, ta, ma +; V128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -186,7 +186,7 @@ ; ; V512-LABEL: interleave_v8i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -200,7 +200,7 @@ define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) { ; V128-LABEL: interleave_v8i16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; V128-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v9, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v8 @@ -209,7 +209,7 @@ ; ; V512-LABEL: interleave_v8i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v10, v9, v8 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v8 @@ -222,7 +222,7 @@ define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) { ; V128-LABEL: interleave_v8i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m2, ta, ma +; V128-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; V128-NEXT: vwaddu.vv v12, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v12, a0, v10 @@ -231,7 +231,7 @@ ; ; V512-LABEL: interleave_v8i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -244,8 
+244,7 @@ define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) { ; V128-LABEL: interleave_v16i8: ; V128: # %bb.0: -; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; V128-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v9 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v9 @@ -254,8 +253,7 @@ ; ; V512-LABEL: interleave_v16i8: ; V512: # %bb.0: -; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; V512-NEXT: vsetivli zero, 16, e8, mf4, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -268,8 +266,7 @@ define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) { ; V128-LABEL: interleave_v16i16: ; V128: # %bb.0: -; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; V128-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; V128-NEXT: vwaddu.vv v12, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v12, a0, v10 @@ -278,8 +275,7 @@ ; ; V512-LABEL: interleave_v16i16: ; V512: # %bb.0: -; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -292,8 +288,7 @@ define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) { ; V128-LABEL: interleave_v16i32: ; V128: # %bb.0: -; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v16, v8, v12 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v16, a0, v12 @@ -302,8 +297,7 @@ ; ; V512-LABEL: interleave_v16i32: ; V512: # %bb.0: -; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; V512-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v10, a0, v9 @@ -316,7 +310,7 @@ define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) { ; V128-LABEL: interleave_v32i8: ; V128: # %bb.0: -; V128-NEXT: li a0, 64 +; V128-NEXT: li a0, 32 ; V128-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; V128-NEXT: vwaddu.vv v12, v8, v10 ; V128-NEXT: li a0, -1 @@ -326,7 +320,7 @@ ; ; V512-LABEL: interleave_v32i8: ; V512: # %bb.0: -; V512-NEXT: li a0, 64 +; V512-NEXT: li a0, 32 ; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 @@ -340,7 +334,7 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { ; V128-LABEL: interleave_v32i16: ; V128: # %bb.0: -; V128-NEXT: li a0, 64 +; V128-NEXT: li a0, 32 ; V128-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; V128-NEXT: vwaddu.vv v16, v8, v12 ; V128-NEXT: li a0, -1 @@ -350,7 +344,7 @@ ; ; V512-LABEL: interleave_v32i16: ; V512: # %bb.0: -; V512-NEXT: li a0, 64 +; V512-NEXT: li a0, 32 ; V512-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v9 ; V512-NEXT: li a0, -1 @@ -399,7 +393,7 @@ ; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t ; RV32-V128-NEXT: vmv.v.v v24, v8 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 @@ -450,7 +444,7 @@ ; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t ; RV64-V128-NEXT: vmv.v.v v24, v8 -; RV64-V128-NEXT: vsetvli zero, a1, 
e32, m4, ta, ma +; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 @@ -466,7 +460,7 @@ ; ; V512-LABEL: interleave_v32i32: ; V512: # %bb.0: -; V512-NEXT: li a0, 64 +; V512-NEXT: li a0, 32 ; V512-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; V512-NEXT: vwaddu.vv v12, v8, v10 ; V512-NEXT: li a0, -1 @@ -482,7 +476,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -493,7 +487,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 2 -; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -508,7 +501,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -519,7 +512,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 2 -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -534,7 +526,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -545,7 +537,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 2 -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -605,7 +596,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; V128-NEXT: vwaddu.vv v9, v8, v10 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v10 @@ -616,7 +607,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 4 -; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 @@ -631,7 +621,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vwaddu.vv v9, v10, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v9, a0, v8 @@ -642,7 +632,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 4 -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma ; V512-NEXT: vwaddu.vv v9, v10, v8 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v8 @@ -657,7 +646,7 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; V128-NEXT: vslidedown.vi v12, v8, 4 -; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; V128-NEXT: 
vwaddu.vv v10, v8, v12 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v10, a0, v12 @@ -668,7 +657,6 @@ ; V512: # %bb.0: ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vslidedown.vi v10, v8, 4 -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v9, v8, v10 ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v9, a0, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -327,7 +327,7 @@ ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 0 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vwaddu.vv v8, v9, v10 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v8, a0, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll --- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll @@ -9,11 +9,10 @@ ; RV64-1024-NEXT: vsetvli zero, a3, e16, m2, ta, ma ; RV64-1024-NEXT: vle16.v v8, (a1) ; RV64-1024-NEXT: vle16.v v10, (a2) -; RV64-1024-NEXT: li a1, 256 -; RV64-1024-NEXT: vsetvli zero, a1, e16, m2, ta, ma ; RV64-1024-NEXT: vwaddu.vv v12, v8, v10 -; RV64-1024-NEXT: li a2, -1 -; RV64-1024-NEXT: vwmaccu.vx v12, a2, v10 +; RV64-1024-NEXT: li a1, -1 +; RV64-1024-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-1024-NEXT: li a1, 256 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-1024-NEXT: vse16.v v12, (a0) ; RV64-1024-NEXT: ret @@ -24,11 +23,10 @@ ; RV64-2048-NEXT: vsetvli zero, a3, e16, m1, ta, ma ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v9, (a2) -; RV64-2048-NEXT: li a1, 256 -; RV64-2048-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV64-2048-NEXT: vwaddu.vv v10, v8, v9 -; RV64-2048-NEXT: li a2, -1 -; RV64-2048-NEXT: vwmaccu.vx v10, a2, v9 +; RV64-2048-NEXT: li a1, -1 +; RV64-2048-NEXT: vwmaccu.vx v10, a1, v9 +; RV64-2048-NEXT: li a1, 256 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, ma ; RV64-2048-NEXT: vse16.v v10, (a0) ; RV64-2048-NEXT: ret @@ -49,11 +47,10 @@ ; RV64-1024-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; RV64-1024-NEXT: vle16.v v8, (a1) ; RV64-1024-NEXT: vle16.v v12, (a2) -; RV64-1024-NEXT: li a1, 512 -; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-1024-NEXT: vwaddu.vv v16, v8, v12 -; RV64-1024-NEXT: li a2, -1 -; RV64-1024-NEXT: vwmaccu.vx v16, a2, v12 +; RV64-1024-NEXT: li a1, -1 +; RV64-1024-NEXT: vwmaccu.vx v16, a1, v12 +; RV64-1024-NEXT: li a1, 512 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; RV64-1024-NEXT: vse16.v v16, (a0) ; RV64-1024-NEXT: ret @@ -64,11 +61,10 @@ ; RV64-2048-NEXT: vsetvli zero, a3, e16, m2, ta, ma ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v10, (a2) -; RV64-2048-NEXT: li a1, 512 -; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, ma ; RV64-2048-NEXT: vwaddu.vv v12, v8, v10 -; RV64-2048-NEXT: li a2, -1 -; RV64-2048-NEXT: vwmaccu.vx v12, a2, v10 +; RV64-2048-NEXT: li a1, -1 +; RV64-2048-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-2048-NEXT: li a1, 512 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-2048-NEXT: vse16.v v12, (a0) ; RV64-2048-NEXT: ret