diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3130,7 +3130,10 @@ MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + // TODO: Need to manually bind Mask and VL because they're captured in + // lambdas below. Use structured binding if/when we move to C++20. + auto DefVLOps = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + SDValue Mask = DefVLOps.first, VL = DefVLOps.second; MVT XLenVT = Subtarget.getXLenVT(); unsigned NumElts = Op.getNumOperands(); @@ -3220,6 +3223,22 @@ return convertFromScalableVector(VT, Splat, DAG, Subtarget); } + std::function<SDValue()> CheapestLowering; + // TODO: Substitute this with the cost of a constant pool load. + const unsigned MaxCost = 4; + auto AddLowering = [&CheapestLowering, CurCost = -1u]( + unsigned Cost, + std::function<SDValue()> Lowering) mutable { + if (Cost > MaxCost) + return; + if (!CheapestLowering || Cost < CurCost) { + CheapestLowering = Lowering; + CurCost = Cost; + } + }; + + const unsigned EltBitSize = VT.getScalarSizeInBits(); + // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. @@ -3240,6 +3259,31 @@ } } + unsigned Cost = 1; // Base cost of 1 for vid + if (Addend || Negate) { + Cost++; + // Add the constant materialization cost if it won't fit into vadd.vi + if (!isInt<5>(Addend)) + Cost += RISCVMatInt::getIntMatCost(APInt(64, Addend), EltBitSize, + Subtarget.getFeatureBits()); + } + if (StepOpcode == ISD::MUL && SplatStepVal != 1) { + Cost++; + // There's no vmul.vi so always include the materialization cost. + Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatStepVal), EltBitSize, + Subtarget.getFeatureBits()); + } + if (StepOpcode == ISD::SHL && SplatStepVal != 0) { + Cost++; + // Add the constant materialization cost if it won't fit into vsll.vi. + if (!isUInt<5>(SplatStepVal)) + Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatStepVal), EltBitSize, + Subtarget.getFeatureBits()); + } + // May have to emit a vfwcvt. + if (VT.isFloatingPoint()) + Cost++; + // Only emit VIDs with suitably-small steps/addends. We use imm5 is a // threshold since it's the immediate value many RVV instructions accept. // There is no vmul.vi instruction so ensure multiply constant can fit in @@ -3248,44 +3292,45 @@ (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) && isPowerOf2_32(StepDenominator) && (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) { - MVT VIDVT = - VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; - MVT VIDContainerVT = - getContainerForFixedLengthVector(DAG, VIDVT, Subtarget); - SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL); - // Convert right out of the scalable type so we can use standard ISD - // nodes for the rest of the computation. If we used scalable types with - // these, we'd lose the fixed-length vector info and generate worse - vsetvli code.
- VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget); - if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || - (StepOpcode == ISD::SHL && SplatStepVal != 0)) { - SDValue SplatStep = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); - VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep); - } - if (StepDenominator != 1) { - SDValue SplatStep = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT)); - VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep); - } - if (Addend != 0 || Negate) { - SDValue SplatAddend = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT)); - VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend, - VID); - } - if (VT.isFloatingPoint()) { - // TODO: Use vfwcvt to reduce register pressure. - VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID); - } - return VID; + AddLowering(Cost, [=, &DAG, &Subtarget]() { + MVT VIDVT = + VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; + MVT VIDContainerVT = + getContainerForFixedLengthVector(DAG, VIDVT, Subtarget); + SDValue VID = + DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL); + // Convert right out of the scalable type so we can use standard ISD + // nodes for the rest of the computation. If we used scalable types with + // these, we'd lose the fixed-length vector info and generate worse + // vsetvli code. + VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget); + if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || + (StepOpcode == ISD::SHL && SplatStepVal != 0)) { + SDValue SplatStep = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); + VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep); + } + if (StepDenominator != 1) { + SDValue SplatStep = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT)); + VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep); + } + if (Addend != 0 || Negate) { + SDValue SplatAddend = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT)); + VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, + SplatAddend, VID); + } + if (VT.isFloatingPoint()) { + // TODO: Use vfwcvt to reduce register pressure. + VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID); + } + return VID; + }); } } // For very small build_vectors, use a single scalar insert of a constant. - // TODO: Base this on constant rematerialization cost, not size. - const unsigned EltBitSize = VT.getScalarSizeInBits(); if (VT.getSizeInBits() <= 32 && ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits()); @@ -3313,15 +3358,23 @@ if (Subtarget.is64Bit() && ViaIntVT == MVT::i32) SplatValue = SignExtend64<32>(SplatValue); - SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT, - DAG.getUNDEF(ViaVecVT), - DAG.getConstant(SplatValue, DL, XLenVT), - DAG.getConstant(0, DL, XLenVT)); - if (ViaVecLen != 1) - Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, - MVT::getVectorVT(ViaIntVT, 1), Vec, - DAG.getConstant(0, DL, XLenVT)); - return DAG.getBitcast(VT, Vec); + // Base cost of 1 for vmv.s.x. + unsigned Cost = 1; + // We always have to materialize the constant since there's no vmv.s.i. 
+ Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatValue), EltBitSize, + Subtarget.getFeatureBits()); + + AddLowering(Cost, [=, &DAG]() { + SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT, + DAG.getUNDEF(ViaVecVT), + DAG.getConstant(SplatValue, DL, XLenVT), + DAG.getConstant(0, DL, XLenVT)); + if (ViaVecLen != 1) + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, + MVT::getVectorVT(ViaIntVT, 1), Vec, + DAG.getConstant(0, DL, XLenVT)); + return DAG.getBitcast(VT, Vec); + }); } @@ -3369,19 +3422,30 @@ (!Subtarget.is64Bit() && ViaIntVT == MVT::i64)) && "Unexpected bitcast sequence"); if (ViaIntVT.bitsLE(XLenVT) || isInt<32>(SplatValue)) { - SDValue ViaVL = - DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT); - MVT ViaContainerVT = - getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget); - SDValue Splat = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT, - DAG.getUNDEF(ViaContainerVT), - DAG.getConstant(SplatValue, DL, XLenVT), ViaVL); - Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget); - return DAG.getBitcast(VT, Splat); + // Base cost of 1 for vmv.v.x + unsigned Cost = 1; + if (!isInt<5>(SplatValue)) + Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatValue), EltBitSize, + Subtarget.getFeatureBits()); + + AddLowering(Cost, [=, &DAG, &Subtarget]() { + SDValue ViaVL = + DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT); + MVT ViaContainerVT = + getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget); + SDValue Splat = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT, + DAG.getUNDEF(ViaContainerVT), + DAG.getConstant(SplatValue, DL, XLenVT), ViaVL); + Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget); + return DAG.getBitcast(VT, Splat); + }); } } + if (CheapestLowering) + return CheapestLowering(); + if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget)) return Res; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -31,20 +31,37 @@ ; expanded to 4 EXTRACT_VECTOR_ELTs and a BUILD_VECTOR. This then triggers the ; loop when expanded. 
define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize { -; CHECK-LABEL: hang_when_merging_stores_after_legalization: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: li a0, 7 -; CHECK-NEXT: vmul.vx v14, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v14 -; CHECK-NEXT: vadd.vi v8, v14, -14 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vrgather.vv v12, v10, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v12 -; CHECK-NEXT: ret +; RV32-LABEL: hang_when_merging_stores_after_legalization: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: li a0, 7 +; RV32-NEXT: vmul.vx v14, v12, a0 +; RV32-NEXT: vrgather.vv v12, v8, v14 +; RV32-NEXT: vadd.vi v8, v14, -14 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vv v12, v10, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: hang_when_merging_stores_after_legalization: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vid.v v12 +; RV64-NEXT: li a0, 7 +; RV64-NEXT: vmul.vx v14, v12, a0 +; RV64-NEXT: vrgather.vv v12, v8, v14 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 12 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vv v12, v10, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> ret <4 x float> %z } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -171,15 +171,15 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_shuffle_vx_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vid.v v12 -; RV32-NEXT: li a0, 3 -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: vlse64.v v10, (a1), zero -; RV32-NEXT: vmul.vx v12, v12, a0 +; RV32-NEXT: lui a0, %hi(.LCPI8_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI8_0) +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: lui a0, 48 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: vmv.v.i v0, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -141,10 +141,9 @@ define <4 x i8> @buildvec_vid_step2_add1_v4i8_undef1() { ; CHECK-LABEL: buildvec_vid_step2_add1_v4i8_undef1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: lui a0, 28752 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: 
vmv.s.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -152,10 +151,9 @@ define <4 x i8> @buildvec_vid_step2_add1_v4i8_undef2() { ; CHECK-LABEL: buildvec_vid_step2_add1_v4i8_undef2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: li a0, 1793 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -225,10 +223,9 @@ define <4 x i8> @buildvec_vid_stepn2_add0_v4i8_undef1() { ; CHECK-LABEL: buildvec_vid_stepn2_add0_v4i8_undef1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vrsub.vi v8, v8, 0 +; CHECK-NEXT: lui a0, 1028032 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -254,14 +251,21 @@ } define <4 x i8> @buildvec_vid_stepn3_add3_v4i8() { -; CHECK-LABEL: buildvec_vid_stepn3_add3_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: li a0, -3 -; CHECK-NEXT: vmadd.vx v8, a0, v9 -; CHECK-NEXT: ret +; RV32-LABEL: buildvec_vid_stepn3_add3_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, 1028048 +; RV32-NEXT: addi a0, a0, 3 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_vid_stepn3_add3_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 1028048 +; RV64-NEXT: addiw a0, a0, 3 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: ret ret <4 x i8> } @@ -370,21 +374,12 @@ } define <4 x i8> @buildvec_no_vid_v4i8_3() { -; RV32-LABEL: buildvec_no_vid_v4i8_3: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, 28672 -; RV32-NEXT: addi a0, a0, 255 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_no_vid_v4i8_3: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, 28672 -; RV64-NEXT: addiw a0, a0, 255 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.s.x v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_no_vid_v4i8_3: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 2047 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret ret <4 x i8> } @@ -552,24 +547,14 @@ } define void @buildvec_seq_v16i8_v2i64(ptr %x) { -; RV32-LABEL: buildvec_seq_v16i8_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: lui a1, %hi(.LCPI42_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: vse8.v v8, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_seq_v16i8_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: lui a1, %hi(.LCPI42_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI42_0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v8, (a1), zero -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vse8.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_seq_v16i8_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI42_0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret store <16 x i8> , ptr %x ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -773,18 +773,18 @@ define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) { ; V128-LABEL: unary_interleave_10uu_v4i8: ; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V128-NEXT: vmv.v.i v10, 1 ; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; V128-NEXT: vid.v v9 -; V128-NEXT: vrsub.vi v10, v9, 1 ; V128-NEXT: vrgather.vv v9, v8, v10 ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_10uu_v4i8: ; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vmv.v.i v10, 1 ; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma -; V512-NEXT: vid.v v9 -; V512-NEXT: vrsub.vi v10, v9, 1 ; V512-NEXT: vrgather.vv v9, v8, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -113,10 +113,10 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 48 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vmul.vx v10, v9, a0 ; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vmv.v.i v9, 5 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1436,29 +1436,51 @@ } define void @mulhs_v6i16(ptr %x) { -; CHECK-LABEL: mulhs_v6i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmv.v.i v9, -7 -; CHECK-NEXT: vmerge.vim v9, v9, 7, v0 -; CHECK-NEXT: vdiv.vv v9, v8, v9 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 7 -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: li a1, -14 -; CHECK-NEXT: vmadd.vx v11, a1, v10 -; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vdiv.vv v8, v8, v11 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 4 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: mulhs_v6i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vmv.v.i v0, 6 +; RV32-NEXT: vmv.v.i v9, -7 +; RV32-NEXT: vmerge.vim v9, v9, 7, v0 +; RV32-NEXT: vdiv.vv v9, v8, v9 +; RV32-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 4 +; RV32-NEXT: lui a1, 1048464 +; RV32-NEXT: addi a1, a1, 7 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vslideup.vi v9, v8, 4 +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_v6i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; 
RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vmv.v.i v0, 6 +; RV64-NEXT: vmv.v.i v9, -7 +; RV64-NEXT: vmerge.vim v9, v9, 7, v0 +; RV64-NEXT: vdiv.vv v9, v8, v9 +; RV64-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 4 +; RV64-NEXT: lui a1, 1048464 +; RV64-NEXT: addiw a1, a1, 7 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vmv.s.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vslideup.vi v9, v8, 4 +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret %a = load <6 x i16>, ptr %x %b = sdiv <6 x i16> %a, store <6 x i16> %b, ptr %x @@ -1488,11 +1510,13 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: lui a1, %hi(.LCPI73_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI73_0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v9, (a1), zero -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: lui a1, 419430 +; RV64-NEXT: addiw a1, a1, 1639 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vmv.v.i v0, 5 +; RV64-NEXT: lui a1, 629146 +; RV64-NEXT: addiw a1, a1, -1639 +; RV64-NEXT: vmerge.vxm v9, v9, a1, v0 ; RV64-NEXT: vmulh.vv v8, v8, v9 ; RV64-NEXT: vsra.vi v8, v8, 1 ; RV64-NEXT: vsrl.vi v9, v8, 31 @@ -5555,11 +5579,16 @@ ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI187_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI187_0) -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV64-NEXT: lui a1, 419430 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1639 +; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV64-NEXT: li a1, 85 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1 +; LMULMAX2-RV64-NEXT: lui a1, 629146 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1639 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 ; LMULMAX2-RV64-NEXT: vmulh.vv v8, v8, v10 ; LMULMAX2-RV64-NEXT: vsra.vi v8, v8, 1 ; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 31 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -214,12 +214,10 @@ define void @store_constant_v2i8_align1(ptr %p) { ; CHECK-LABEL: store_constant_v2i8_align1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmadd.vx v9, a1, v8 -; CHECK-NEXT: vse8.v v9, (a0) +; CHECK-NEXT: li a1, 1539 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <2 x i8> , ptr %p, align 1 ret void
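
For reference, some background on the first hunk's TODO: before C++20, a name introduced by a structured binding cannot be captured by a lambda, which is why the patch unpacks the getDefaultVLOps() pair into plain variables. A minimal illustration of the language rule and the workaround (assumed to be compiled as C++17; the names are made up for the example):

#include <utility>

int main() {
  std::pair<int, int> P{1, 2};

  // C++17: capturing a structured-binding name in a lambda is ill-formed,
  // so the following is rejected by a strict C++17 compiler (allowed in C++20):
  //   auto [A0, B0] = P;
  //   auto Use0 = [A0] { return A0; };

  // Workaround mirrored by the patch: bind the members to ordinary variables
  // first, then capture those.
  auto Pair = P;
  int A = Pair.first, B = Pair.second;
  auto Use = [A, B] { return A + B; };
  return Use() == 3 ? 0 : 1;
}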
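The core change in RISCVISelLowering.cpp is that lowerBUILD_VECTOR no longer returns the first strategy that matches: each candidate is registered through AddLowering together with an estimated instruction cost, the deferred std::function body is only run for the cheapest candidate, and anything costing more than MaxCost (a stand-in for a constant-pool load) is rejected up front. A minimal standalone sketch of that pattern, not LLVM code; the string payloads stand in for the deferred SelectionDAG construction:

#include <cstdio>
#include <functional>
#include <string>

int main() {
  using Lowering = std::function<std::string()>;

  Lowering CheapestLowering;  // empty until a candidate is accepted
  const unsigned MaxCost = 4; // placeholder for the constant-pool-load cost

  // Keep only the cheapest candidate whose cost does not exceed MaxCost.
  // CurCost starts at -1u, i.e. "no candidate yet".
  auto AddLowering = [&CheapestLowering, CurCost = -1u](unsigned Cost,
                                                        Lowering L) mutable {
    if (Cost > MaxCost)
      return;
    if (!CheapestLowering || Cost < CurCost) {
      CheapestLowering = std::move(L);
      CurCost = Cost;
    }
  };

  // Candidates are registered as they are discovered; building the result is
  // deferred, so rejected candidates cost nothing to construct.
  AddLowering(3, [] { return std::string("vid.v + vmul.vx + vadd.vi"); });
  AddLowering(2, [] { return std::string("li + vmv.v.x"); });
  AddLowering(6, [] { return std::string("too expensive, never stored"); });

  if (CheapestLowering)
    std::printf("selected: %s\n", CheapestLowering().c_str());
  return 0;
}

The same shape appears in the patch: the lambdas capture everything needed to build the SDValue chain, and CheapestLowering() is invoked once at the end, falling back to lowerBuildVectorViaDominantValues when no candidate was cheap enough.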
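The cost the patch assigns to the vid.v-based sequence can be read as a small pure function. The sketch below mirrors that arithmetic under stated assumptions: matCost is a hypothetical stand-in for RISCVMatInt::getIntMatCost (the number of scalar instructions needed to materialize a constant), and the imm5 helpers approximate isInt<5>/isUInt<5>:

#include <cstdint>
#include <cstdio>

static bool fitsSImm5(int64_t V) { return V >= -16 && V <= 15; } // ~ isInt<5>
static bool fitsUImm5(int64_t V) { return V >= 0 && V <= 31; }   // ~ isUInt<5>

// Stand-in for RISCVMatInt::getIntMatCost: pretend any constant takes one or
// two scalar instructions to materialize. Illustrative only.
static unsigned matCost(int64_t V) { return (V >= -2048 && V < 2048) ? 1 : 2; }

// Cost of lowering a build_vector as vid.v plus an optional scale/shift and
// addend: base cost 1, +1 per extra vector op, plus the scalar materialization
// cost whenever the operand does not fit the instruction's immediate field.
static unsigned vidLoweringCost(bool UseMul, int64_t StepVal, int64_t Addend,
                                bool Negate, bool IsFloat) {
  unsigned Cost = 1; // vid.v
  if (Addend != 0 || Negate) {
    Cost++;                    // vadd.vi / vadd.vx / vrsub
    if (!fitsSImm5(Addend))
      Cost += matCost(Addend); // constant will not fit vadd.vi
  }
  if (UseMul && StepVal != 1) {
    Cost++;                    // vmul.vx
    Cost += matCost(StepVal);  // no vmul.vi, so always materialize
  }
  if (!UseMul && StepVal != 0) {
    Cost++;                    // vsll.vi / vsll.vx
    if (!fitsUImm5(StepVal))
      Cost += matCost(StepVal);
  }
  if (IsFloat)
    Cost++;                    // trailing integer-to-FP convert
  return Cost;
}

int main() {
  // A multiplied index sequence such as 0, 7, 14, ...: vid.v + li + vmul.vx.
  std::printf("mul step 7:        %u\n", vidLoweringCost(true, 7, 0, false, false));
  // A shifted-and-offset sequence: vid.v + vsll.vi 1 + vadd.vi 1.
  std::printf("shl step 1, add 1: %u\n", vidLoweringCost(false, 1, 1, false, false));
  return 0;
}

Comparing these estimates against the scalar-insert and splat candidates is what flips several of the test cases above from vid/vmadd sequences to a single li/lui plus vmv.s.x or vmv.v.x.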
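Several of the updated tests replace a vid.v/vmadd.vx sequence with a single vmv.s.x or vmv.v.x of a scalar built by li/lui/addi. The underlying trick is just a reinterpreting bitcast: the small constant elements are packed, little-endian, into one wider element that a scalar move can insert or splat. A tiny illustration using an arbitrary example vector (the values are not taken from the tests):

#include <cstdint>
#include <cstdio>

// Pack four i8 elements into the single 32-bit scalar a vmv.s.x would consume
// when <4 x i8> is reinterpreted as a vector of i32.
static uint32_t packV4I8(uint8_t E0, uint8_t E1, uint8_t E2, uint8_t E3) {
  return uint32_t(E0) | uint32_t(E1) << 8 | uint32_t(E2) << 16 |
         uint32_t(E3) << 24;
}

int main() {
  // Example: elements 3, 6, 9, 12 viewed as one 32-bit value.
  std::printf("0x%08x\n", (unsigned)packV4I8(3, 6, 9, 12)); // prints 0x0c090603
  return 0;
}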