diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3130,7 +3130,10 @@ MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + // TODO: Need to manually bind Mask and VL because they're captured in + // lambdas below. Use structured bindings if/when we move to C++20. + auto DefVLOps = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + SDValue Mask = DefVLOps.first, VL = DefVLOps.second; MVT XLenVT = Subtarget.getXLenVT(); unsigned NumElts = Op.getNumOperands(); @@ -3220,6 +3223,22 @@ return convertFromScalableVector(VT, Splat, DAG, Subtarget); } + std::function<SDValue()> CheapestLowering; + // TODO: Substitute this with the cost of a constant pool load. + const unsigned MaxCost = 4; + auto AddLowering = [&CheapestLowering, + CurCost = -1u](unsigned Cost, + std::function<SDValue()> Lowering) mutable { + if (Cost > MaxCost) + return; + if (!CheapestLowering || Cost < CurCost) { + CheapestLowering = Lowering; + CurCost = Cost; + } + }; + + const unsigned EltBitSize = VT.getScalarSizeInBits(); + // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. @@ -3240,52 +3259,78 @@ } } + unsigned Cost = 1; // Base cost of 1 for vid + if (Addend || Negate) { + Cost++; + // Add the constant materialization cost if it won't fit into vadd.vi + if (!isInt<5>(Addend)) + Cost += RISCVMatInt::getIntMatCost( + APInt(64, Addend), EltBitSize, Subtarget.getFeatureBits()); + } + if (StepOpcode == ISD::MUL && SplatStepVal != 1) { + Cost++; + // There's no vmul.vi so always include the materialization cost. + Cost += + RISCVMatInt::getIntMatCost(APInt(64, SplatStepVal), + EltBitSize, Subtarget.getFeatureBits()); + } + if (StepOpcode == ISD::SHL && SplatStepVal != 0) { + Cost++; + // Add the constant materialization cost if it won't fit into vsll.vi. + if (!isUInt<5>(SplatStepVal)) + Cost += + RISCVMatInt::getIntMatCost(APInt(64, SplatStepVal), + EltBitSize, Subtarget.getFeatureBits()); + } + // May have to emit a vfwcvt. + if (VT.isFloatingPoint()) + Cost++; + // Only emit VIDs with suitably-small steps/addends. We use imm5 as a // threshold since it's the immediate value many RVV instructions accept. // There is no vmul.vi instruction so ensure multiply constant can fit in // a single addi instruction. - if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) || - (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) && - isPowerOf2_32(StepDenominator) && - (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) { - MVT VIDVT = - VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; - MVT VIDContainerVT = - getContainerForFixedLengthVector(DAG, VIDVT, Subtarget); - SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL); - // Convert right out of the scalable type so we can use standard ISD - // nodes for the rest of the computation. If we used scalable types with - // these, we'd lose the fixed-length vector info and generate worse - // vsetvli code.
- VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget); - if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || - (StepOpcode == ISD::SHL && SplatStepVal != 0)) { - SDValue SplatStep = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); - VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep); - } - if (StepDenominator != 1) { - SDValue SplatStep = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT)); - VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep); - } - if (Addend != 0 || Negate) { - SDValue SplatAddend = DAG.getSplatBuildVector( - VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT)); - VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend, - VID); - } - if (VT.isFloatingPoint()) { - // TODO: Use vfwcvt to reduce register pressure. - VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID); - } - return VID; + if (isPowerOf2_32(StepDenominator) && (SplatStepVal >= 0 || StepDenominator == 1)) { + AddLowering(Cost, [=, &DAG, &Subtarget]() { + MVT VIDVT = + VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; + MVT VIDContainerVT = + getContainerForFixedLengthVector(DAG, VIDVT, Subtarget); + SDValue VID = + DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL); + // Convert right out of the scalable type so we can use standard ISD + // nodes for the rest of the computation. If we used scalable types with + // these, we'd lose the fixed-length vector info and generate worse + // vsetvli code. + VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget); + if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || + (StepOpcode == ISD::SHL && SplatStepVal != 0)) { + SDValue SplatStep = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); + VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep); + } + if (StepDenominator != 1) { + SDValue SplatStep = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT)); + VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep); + } + if (Addend != 0 || Negate) { + SDValue SplatAddend = DAG.getSplatBuildVector( + VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT)); + VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, + SplatAddend, VID); + } + if (VT.isFloatingPoint()) { + // TODO: Use vfwcvt to reduce register pressure. + VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID); + } + return VID; + }); } } // For very small build_vectors, use a single scalar insert of a constant. // TODO: Base this on constant rematerialization cost, not size. - const unsigned EltBitSize = VT.getScalarSizeInBits(); if (VT.getSizeInBits() <= 32 && ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits()); @@ -3305,7 +3350,7 @@ const auto &SeqV = OpIdx.value(); if (!SeqV.isUndef()) SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask) - << (OpIdx.index() * EltBitSize)); + << (OpIdx.index() * EltBitSize)); } // On RV64, sign-extend from 32 to 64 bits where possible in order to @@ -3313,15 +3358,22 @@ if (Subtarget.is64Bit() && ViaIntVT == MVT::i32) SplatValue = SignExtend64<32>(SplatValue); - SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT, - DAG.getUNDEF(ViaVecVT), - DAG.getConstant(SplatValue, DL, XLenVT), - DAG.getConstant(0, DL, XLenVT)); - if (ViaVecLen != 1) - Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, - MVT::getVectorVT(ViaIntVT, 1), Vec, - DAG.getConstant(0, DL, XLenVT)); - return DAG.getBitcast(VT, Vec); + // Base cost of 1 for vmv.s.x.
+ unsigned Cost = 1; + // We always have to materialize the constant since there's no vmv.s.i. + Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatValue), EltBitSize, Subtarget.getFeatureBits()); + + AddLowering(Cost, [=, &DAG]() { + SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT, + DAG.getUNDEF(ViaVecVT), + DAG.getConstant(SplatValue, DL, XLenVT), + DAG.getConstant(0, DL, XLenVT)); + if (ViaVecLen != 1) + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, + MVT::getVectorVT(ViaIntVT, 1), Vec, + DAG.getConstant(0, DL, XLenVT)); + return DAG.getBitcast(VT, Vec); + }); } @@ -3369,19 +3421,29 @@ (!Subtarget.is64Bit() && ViaIntVT == MVT::i64)) && "Unexpected bitcast sequence"); if (ViaIntVT.bitsLE(XLenVT) || isInt<32>(SplatValue)) { - SDValue ViaVL = - DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT); - MVT ViaContainerVT = - getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget); - SDValue Splat = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT, - DAG.getUNDEF(ViaContainerVT), - DAG.getConstant(SplatValue, DL, XLenVT), ViaVL); - Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget); - return DAG.getBitcast(VT, Splat); + // Base cost of 1 for vmv.v.x + unsigned Cost = 1; + if (!isInt<5>(SplatValue)) + Cost += RISCVMatInt::getIntMatCost(APInt(64, SplatValue), EltBitSize, Subtarget.getFeatureBits()); + + AddLowering(Cost, [=, &DAG, &Subtarget]() { + SDValue ViaVL = + DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT); + MVT ViaContainerVT = + getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget); + SDValue Splat = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT, + DAG.getUNDEF(ViaContainerVT), + DAG.getConstant(SplatValue, DL, XLenVT), ViaVL); + Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget); + return DAG.getBitcast(VT, Splat); + }); } } + if (CheapestLowering) + return CheapestLowering(); + if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget)) return Res; diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -104,16 +104,15 @@ ; CHECK-LABEL: fv32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 +; CHECK-NEXT: vslideup.vi v0, v24, 2 ; CHECK-NEXT: ret %mask = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 %index, i64 %tc) ret <32 x i1> %mask @@ -123,28 +122,25 @@ ; CHECK-LABEL: fv64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI9_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI9_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -158,60 +154,53 @@ ; CHECK-LABEL: fv128: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI10_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI10_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 6 -; CHECK-NEXT: lui a0, %hi(.LCPI10_3) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) +; CHECK-NEXT: vslideup.vi v0, v24, 6 +; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 8 -; CHECK-NEXT: lui a0, %hi(.LCPI10_4) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) +; CHECK-NEXT: vslideup.vi v0, v24, 8 +; CHECK-NEXT: li a0, 80 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-NEXT: 
vslideup.vi v0, v16, 10 -; CHECK-NEXT: lui a0, %hi(.LCPI10_5) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) +; CHECK-NEXT: vslideup.vi v0, v24, 10 +; CHECK-NEXT: li a0, 96 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 12 -; CHECK-NEXT: lui a0, %hi(.LCPI10_6) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) +; CHECK-NEXT: vslideup.vi v0, v24, 12 +; CHECK-NEXT: li a0, 112 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -45,21 +45,6 @@ ; LMULMAX1-NEXT: vmv.v.v v11, v12 ; LMULMAX1-NEXT: vmv1r.v v8, v11 ; LMULMAX1-NEXT: ret -; -; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization: -; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-NEXT: vid.v v12 -; LMULMAX2-NEXT: li a0, 7 -; LMULMAX2-NEXT: vmul.vx v14, v12, a0 -; LMULMAX2-NEXT: vrgather.vv v12, v8, v14 -; LMULMAX2-NEXT: vadd.vi v8, v14, -14 -; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-NEXT: vmv.v.i v0, 12 -; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vrgather.vv v12, v10, v8, v0.t -; LMULMAX2-NEXT: vmv1r.v v8, v12 -; LMULMAX2-NEXT: ret %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> ret <4 x float> %z } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -255,38 +255,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli 
zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -296,7 +283,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -306,38 +293,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; 
RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -347,7 +321,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -171,15 +171,15 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; RV32-LABEL: vrgather_shuffle_vx_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vid.v v12 -; RV32-NEXT: li a0, 3 -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: vlse64.v v10, (a1), zero -; RV32-NEXT: vmul.vx v12, v12, a0 +; RV32-NEXT: lui a0, %hi(.LCPI8_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI8_0) +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: lui a0, 48 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: vmv.v.i v0, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -53,10 +53,10 @@ define void @buildvec_vid_plus_nonimm_v16i8(ptr %x) { ; CHECK-LABEL: buildvec_vid_plus_nonimm_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI4_0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a1, 100 +; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <16 x i8> , ptr %x @@ -141,10 +141,9 @@ define <4 x i8> @buildvec_vid_step2_add1_v4i8_undef1() { ; CHECK-LABEL: buildvec_vid_step2_add1_v4i8_undef1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: lui a0, 28752 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -152,10 +151,9 @@ define <4 x i8> @buildvec_vid_step2_add1_v4i8_undef2() { ; CHECK-LABEL: buildvec_vid_step2_add1_v4i8_undef2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: li a0, 1793 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -225,10 +223,9 @@ define <4 x i8> @buildvec_vid_stepn2_add0_v4i8_undef1() { ; CHECK-LABEL: buildvec_vid_stepn2_add0_v4i8_undef1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vrsub.vi v8, v8, 0 +; CHECK-NEXT: lui a0, 1028032 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: ret ret <4 x i8> } @@ -254,14 +251,21 @@ } define <4 x i8> 
@buildvec_vid_stepn3_add3_v4i8() { -; CHECK-LABEL: buildvec_vid_stepn3_add3_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: li a0, -3 -; CHECK-NEXT: vmadd.vx v8, a0, v9 -; CHECK-NEXT: ret +; RV32-LABEL: buildvec_vid_stepn3_add3_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, 1028048 +; RV32-NEXT: addi a0, a0, 3 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_vid_stepn3_add3_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 1028048 +; RV64-NEXT: addiw a0, a0, 3 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: ret ret <4 x i8> } @@ -370,21 +374,12 @@ } define <4 x i8> @buildvec_no_vid_v4i8_3() { -; RV32-LABEL: buildvec_no_vid_v4i8_3: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, 28672 -; RV32-NEXT: addi a0, a0, 255 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_no_vid_v4i8_3: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, 28672 -; RV64-NEXT: addiw a0, a0, 255 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.s.x v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_no_vid_v4i8_3: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 2047 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret ret <4 x i8> } @@ -552,24 +547,14 @@ } define void @buildvec_seq_v16i8_v2i64(ptr %x) { -; RV32-LABEL: buildvec_seq_v16i8_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: lui a1, %hi(.LCPI42_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: vse8.v v8, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_seq_v16i8_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: lui a1, %hi(.LCPI42_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI42_0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v8, (a1), zero -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vse8.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_seq_v16i8_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI42_0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret store <16 x i8> , ptr %x ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -414,38 +414,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill 
-; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -455,7 +442,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -465,38 +452,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, 
m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -506,7 +480,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret @@ -773,18 +747,18 @@ define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) { ; V128-LABEL: unary_interleave_10uu_v4i8: ; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V128-NEXT: vmv.v.i v10, 1 ; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; V128-NEXT: vid.v v9 -; V128-NEXT: vrsub.vi v10, v9, 1 ; V128-NEXT: vrgather.vv v9, v8, v10 ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_10uu_v4i8: ; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vmv.v.i v10, 1 ; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma -; V512-NEXT: vid.v v9 -; V512-NEXT: vrsub.vi v10, v9, 1 ; V512-NEXT: vrgather.vv v9, v8, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -113,10 +113,10 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 48 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vmul.vx v10, v9, a0 ; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vmv.v.i v9, 5 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1436,29 +1436,51 @@ } define void @mulhs_v6i16(ptr %x) { -; CHECK-LABEL: mulhs_v6i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmv.v.i v9, -7 -; CHECK-NEXT: vmerge.vim v9, v9, 7, v0 -; CHECK-NEXT: vdiv.vv v9, v8, v9 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 7 -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: li a1, -14 -; CHECK-NEXT: vmadd.vx v11, a1, v10 -; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vdiv.vv v8, v8, v11 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 4 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: mulhs_v6i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; 
RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vmv.v.i v0, 6 +; RV32-NEXT: vmv.v.i v9, -7 +; RV32-NEXT: vmerge.vim v9, v9, 7, v0 +; RV32-NEXT: vdiv.vv v9, v8, v9 +; RV32-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 4 +; RV32-NEXT: lui a1, 1048464 +; RV32-NEXT: addi a1, a1, 7 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vslideup.vi v9, v8, 4 +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_v6i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vmv.v.i v0, 6 +; RV64-NEXT: vmv.v.i v9, -7 +; RV64-NEXT: vmerge.vim v9, v9, 7, v0 +; RV64-NEXT: vdiv.vv v9, v8, v9 +; RV64-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 4 +; RV64-NEXT: lui a1, 1048464 +; RV64-NEXT: addiw a1, a1, 7 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vmv.s.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vslideup.vi v9, v8, 4 +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret %a = load <6 x i16>, ptr %x %b = sdiv <6 x i16> %a, store <6 x i16> %b, ptr %x @@ -1488,11 +1510,13 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: lui a1, %hi(.LCPI73_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI73_0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v9, (a1), zero -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: lui a1, 419430 +; RV64-NEXT: addiw a1, a1, 1639 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vmv.v.i v0, 5 +; RV64-NEXT: lui a1, 629146 +; RV64-NEXT: addiw a1, a1, -1639 +; RV64-NEXT: vmerge.vxm v9, v9, a1, v0 ; RV64-NEXT: vmulh.vv v8, v8, v9 ; RV64-NEXT: vsra.vi v8, v8, 1 ; RV64-NEXT: vsrl.vi v9, v8, 31 @@ -5555,11 +5579,16 @@ ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI187_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI187_0) -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV64-NEXT: lui a1, 419430 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1639 +; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV64-NEXT: li a1, 85 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1 +; LMULMAX2-RV64-NEXT: lui a1, 629146 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1639 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 ; LMULMAX2-RV64-NEXT: vmulh.vv v8, v8, v10 ; LMULMAX2-RV64-NEXT: vsra.vi v8, v8, 1 ; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 31 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1793,23 +1793,21 @@ ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; 
RV32-NEXT: lui a2, %hi(.LCPI72_0) -; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a2) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: vmsltu.vx v12, v16, a1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vid.v v16 +; RV32-NEXT: vadd.vx v24, v16, a0 +; RV32-NEXT: vmsltu.vx v12, v24, a1 ; RV32-NEXT: vmsltu.vx v13, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v13, v12, 4 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV32-NEXT: vmand.mm v0, v13, v0 ; RV32-NEXT: vmv.v.i v12, 1 ; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV32-NEXT: vslidedown.vx v12, v8, a3 +; RV32-NEXT: vslidedown.vx v12, v8, a0 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vmul.vv v8, v8, v12 @@ -1836,23 +1834,21 @@ ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: lui a2, %hi(.LCPI72_0) -; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV64-NEXT: li a3, 32 -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v16, (a2) ; RV64-NEXT: mv a2, a0 -; RV64-NEXT: vmsltu.vx v12, v16, a1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV64-NEXT: vid.v v16 +; RV64-NEXT: vadd.vx v24, v16, a0 +; RV64-NEXT: vmsltu.vx v12, v24, a1 ; RV64-NEXT: vmsltu.vx v13, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v13, v12, 4 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV64-NEXT: vmand.mm v0, v13, v0 ; RV64-NEXT: vmv.v.i v12, 1 ; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV64-NEXT: vslidedown.vx v12, v8, a3 +; RV64-NEXT: vslidedown.vx v12, v8, a0 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vmul.vv v8, v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -72,11 +72,11 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) { ; CHECK-LABEL: reverse_v32i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vrgather.vv v12, v10, v8 @@ -89,11 +89,11 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) { ; CHECK-LABEL: reverse_v64i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vrgather.vv v16, v12, v8 @@ -166,11 +166,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: 
reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI11_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -181,11 +181,11 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -255,11 +255,11 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -531,11 +531,11 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI33_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -214,12 +214,10 @@ define void @store_constant_v2i8_align1(ptr %p) { ; CHECK-LABEL: store_constant_v2i8_align1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmadd.vx v9, a1, v8 -; CHECK-NEXT: vse8.v v9, (a0) +; CHECK-NEXT: li a1, 1539 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <2 x i8> , ptr %p, align 1 ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,41 +106,39 @@ define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; RV32-LABEL: v16i8_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI7_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV32-NEXT: vle8.v v12, (a0) -; RV32-NEXT: vmv1r.v v14, v9 -; RV32-NEXT: vrgather.vv v10, v8, v12 -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vrsub.vi v8, v8, 15 -; RV32-NEXT: lui a0, 16 -; 
RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vmv1r.v v12, v9 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV32-NEXT: vid.v v14 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v16, v14, a1 +; RV32-NEXT: vrgather.vv v10, v8, v16 +; RV32-NEXT: vrsub.vi v8, v14, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV32-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV32-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i8_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI7_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV64-NEXT: vle8.v v12, (a0) -; RV64-NEXT: vmv1r.v v14, v9 -; RV64-NEXT: vrgather.vv v10, v8, v12 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vrsub.vi v8, v8, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vmv1r.v v12, v9 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vid.v v14 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v16, v14, a1 +; RV64-NEXT: vrgather.vv v10, v8, v16 +; RV64-NEXT: vrsub.vi v8, v14, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV64-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV64-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret %v32i8 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -252,41 +250,39 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-LABEL: v16i16_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI15_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: vmv2r.v v16, v10 ; RV32-NEXT: vmv2r.v v12, v8 -; RV32-NEXT: vrgather.vv v8, v12, v20 -; RV32-NEXT: vid.v v12 -; RV32-NEXT: vrsub.vi v12, v12, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v24, v20, a1 +; RV32-NEXT: vrgather.vv v8, v12, v24 +; RV32-NEXT: vrsub.vi v12, v20, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; RV32-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI15_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV64-NEXT: vle16.v v20, (a0) ; RV64-NEXT: vmv2r.v v16, v10 ; RV64-NEXT: vmv2r.v v12, v8 -; RV64-NEXT: vrgather.vv v8, v12, v20 -; RV64-NEXT: vid.v v12 -; RV64-NEXT: vrsub.vi v12, v12, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV64-NEXT: vid.v v20 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v24, v20, a1 +; RV64-NEXT: vrgather.vv v8, v12, v24 +; RV64-NEXT: vrsub.vi v12, v20, 15 +; RV64-NEXT: lui a1, 
16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; RV64-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> @@ -401,42 +397,88 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; RV32-LABEL: v16i32_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI23_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v0, (a0) -; RV32-NEXT: vmv4r.v v24, v12 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV32-NEXT: vmv4r.v v16, v12 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vmv4r.v v16, v8 -; RV32-NEXT: vrgather.vv v8, v16, v0 -; RV32-NEXT: vid.v v16 -; RV32-NEXT: vrsub.vi v16, v16, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vid.v v0 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v8, v0, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v16, v24 +; RV32-NEXT: vrsub.vi v16, v0, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i32_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI23_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v0, (a0) -; RV64-NEXT: vmv4r.v v24, v12 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: vmv4r.v v16, v12 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v16, v8 -; RV64-NEXT: vrgather.vv v8, v16, v0 -; RV64-NEXT: vid.v v16 -; RV64-NEXT: vrsub.vi v16, v16, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vid.v v0 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v8, v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # 
Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v8, v16, v24 +; RV64-NEXT: vrsub.vi v16, v0, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v32i32 = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %v32i32 @@ -632,11 +674,11 @@ define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -824,11 +866,11 @@ define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret
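Note on the RISCVISelLowering.cpp hunks above: each candidate BUILD_VECTOR lowering is now wrapped in a std::function and registered with AddLowering, which keeps only the cheapest candidate under a fixed budget (MaxCost, a stand-in for the cost of a constant-pool load) and builds it lazily at the end. The following is a minimal standalone sketch of that selection pattern only; the names (Lowering, BestCost, the string payloads) are illustrative and are not the patch's LLVM types.

// Standalone sketch: keep only the cheapest candidate lowering and
// materialize it lazily through a deferred callback.
#include <functional>
#include <iostream>
#include <limits>
#include <string>
#include <utility>

int main() {
  using Lowering = std::function<std::string()>;

  Lowering Cheapest;                // Deferred builder for the best candidate.
  unsigned BestCost = std::numeric_limits<unsigned>::max();
  const unsigned MaxCost = 4;       // Beyond this, fall back to another strategy.

  auto AddLowering = [&](unsigned Cost, Lowering L) {
    if (Cost > MaxCost)
      return;                       // Too expensive; don't even record it.
    if (!Cheapest || Cost < BestCost) {
      Cheapest = std::move(L);
      BestCost = Cost;
    }
  };

  // Candidates are only described here; nothing is built until the end.
  AddLowering(3, [] { return std::string("vid.v + vmul.vx + vadd.vi"); });
  AddLowering(2, [] { return std::string("vid.v + vadd.vx"); });
  AddLowering(6, [] { return std::string("over-budget candidate"); });

  if (Cheapest)
    std::cout << "chose: " << Cheapest() << " (cost " << BestCost << ")\n";
}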
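The per-candidate costs in the patch also add RISCVMatInt::getIntMatCost for any scalar constant that must be materialized (the operands of vadd.vx, vmul.vx, vrsub.vx, vmv.v.x, vmv.s.x in the regenerated tests). As a rough standalone approximation of that idea — not RISCVMatInt itself, and ignoring RV64-specific sequences — a 32-bit constant costs one instruction when it fits a 12-bit li or a bare lui, and two for a lui+addi pair:

// Rough approximation of the scalar materialization cost on RV32;
// the real cost comes from RISCVMatInt::getIntMatCost.
#include <cstdint>
#include <iostream>

unsigned approxIntMatCost(int32_t V) {
  if (V >= -2048 && V < 2048)
    return 1;                  // addi/li with a 12-bit signed immediate
  if ((V & 0xFFF) == 0)
    return 1;                  // lui alone covers it
  return 2;                    // lui + addi
}

int main() {
  // 100 (vadd.vx in the buildvec test), 196608 == 48 << 12 (lui a0, 48 in the
  // shuffle tests), and 0xaaaaaaaa (the lui/addi interleave mask) as examples.
  for (int32_t V : {7, 100, 196608, -1431655766})
    std::cout << V << " -> " << approxIntMatCost(V) << " insn(s)\n";
}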