Index: llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -73,6 +73,12 @@
                        "use for creating a floating-point immediate value"),
               cl::init(2));
 
+static cl::opt<int>
+    VectorImmCost(DEBUG_TYPE "-vecimm-cost", cl::Hidden,
+                  cl::desc("Give the maximum number of instructions that we will "
+                           "use for creating a vector immediate value"),
+                  cl::init(4));
+
 RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                                          const RISCVSubtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -2970,6 +2976,10 @@
 
   assert(SeqAddend && "Must have an addend if we have a step");
 
+  // Would require a udiv in the expansion - never profitable.
+  if (!isPowerOf2_32(*SeqStepDenom))
+    return std::nullopt;
+
   return VIDSequence{*SeqStepNum, *SeqStepDenom, *SeqAddend};
 }
 
@@ -3220,6 +3230,8 @@
     return convertFromScalableVector(VT, Splat, DAG, Subtarget);
   }
 
+  const unsigned EltBitSize = VT.getScalarSizeInBits();
+
   // Try and match index sequences, which we can lower to the vid instruction
   // with optional modifications. An all-undef vector is matched by
   // getSplatValue, above.
@@ -3229,6 +3241,7 @@
     int64_t Addend = SimpleVID->Addend;
 
     assert(StepNumerator != 0 && "Invalid step");
+    assert(isPowerOf2_32(StepDenominator) && "unexpected denominator");
     bool Negate = false;
     int64_t SplatStepVal = StepNumerator;
     unsigned StepOpcode = ISD::MUL;
@@ -3240,44 +3253,84 @@
       }
     }
 
-    // Only emit VIDs with suitably-small steps/addends. We use imm5 is a
-    // threshold since it's the immediate value many RVV instructions accept.
-    // There is no vmul.vi instruction so ensure multiply constant can fit in
-    // a single addi instruction.
-    if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
-         (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
-        isPowerOf2_32(StepDenominator) &&
-        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
+    // Pairs of (Opcode, Immediate).
+    SmallVector<std::pair<unsigned, int64_t>, 8> Seq;
+    Seq.emplace_back(RISCVISD::VID_VL, 0);
+    if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
+        (StepOpcode == ISD::SHL && SplatStepVal != 0))
+      Seq.emplace_back(StepOpcode, SplatStepVal);
+    if (StepDenominator != 1)
+      Seq.emplace_back(ISD::SRL, Log2_64(StepDenominator));
+    if (Addend != 0 || Negate)
+      Seq.emplace_back(Negate ? ISD::SUB : ISD::ADD, Addend);
+    if (VT.isFloatingPoint())
+      Seq.emplace_back(ISD::SINT_TO_FP, 0);
+
+
+    int Cost = 0;
+    for (const auto &[Opc, Imm] : Seq) {
+      Cost++;
+      switch (Opc) {
+      case RISCVISD::VID_VL:
+      case ISD::SINT_TO_FP:
+        // No immediate operand.
+        break;
+      case ISD::SHL:
+      case ISD::SRL:
+      case ISD::ADD:
+      case ISD::SUB:
+        // The VI form supports 5-bit constants.
+        if (!isInt<5>(Imm))
+          Cost += RISCVMatInt::getIntMatCost(APInt(64, Imm), EltBitSize,
+                                             Subtarget.getFeatureBits());
+        break;
+      case ISD::MUL:
+        Cost += RISCVMatInt::getIntMatCost(APInt(64, Imm), EltBitSize,
+                                           Subtarget.getFeatureBits());
+        break;
+      }
+    }
+
+    if (Cost <= VectorImmCost) {
       MVT VIDVT =
-          VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
+          VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
       MVT VIDContainerVT =
-          getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
-      SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
-      // Convert right out of the scalable type so we can use standard ISD
-      // nodes for the rest of the computation. If we used scalable types with
-      // these, we'd lose the fixed-length vector info and generate worse
-      // vsetvli code.
-      VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
-      if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
-          (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
-        SDValue SplatStep = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT));
-        VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
-      }
-      if (StepDenominator != 1) {
-        SDValue SplatStep = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT));
-        VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
-      }
-      if (Addend != 0 || Negate) {
-        SDValue SplatAddend = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT));
-        VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
-                          VID);
-      }
-      if (VT.isFloatingPoint()) {
-        // TODO: Use vfwcvt to reduce register pressure.
-        VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
+          getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
+
+      SDValue VID;
+      assert(!Seq.empty());
+      for (const auto &[Opc, Imm] : Seq) {
+        switch (Opc) {
+        case RISCVISD::VID_VL:
+          VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
+          // Convert right out of the scalable type so we can use standard ISD
+          // nodes for the rest of the computation. If we used scalable types with
+          // these, we'd lose the fixed-length vector info and generate worse
+          // vsetvli code.
+          VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
+          break;
+        case ISD::SHL:
+        case ISD::SRL:
+        case ISD::MUL: {
+          SDValue Splat = DAG.getSplatBuildVector(
+              VIDVT, DL, DAG.getConstant(Imm, DL, XLenVT));
+          VID = DAG.getNode(Opc, DL, VIDVT, VID, Splat);
+          break;
+        }
+        case ISD::ADD:
+        case ISD::SUB: {
+          // Note the swapped operands here. For ISD::SUB this computes
+          // (splat - vid), i.e. a vrsub, which is how the sequence is negated.
+          SDValue Splat = DAG.getSplatBuildVector(
+              VIDVT, DL, DAG.getConstant(Imm, DL, XLenVT));
+          VID = DAG.getNode(Opc, DL, VIDVT, Splat, VID);
+          break;
+        }
+        case ISD::SINT_TO_FP:
+          // TODO: Use vfwcvt to reduce register pressure.
+          VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
+          break;
+        }
       }
       return VID;
     }
@@ -3285,7 +3338,6 @@
 
   // For very small build_vectors, use a single scalar insert of a constant.
   // TODO: Base this on constant rematerialization cost, not size.
- const unsigned EltBitSize = VT.getScalarSizeInBits(); if (VT.getSizeInBits() <= 32 && ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits()); Index: llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -103,17 +103,16 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 +; CHECK-NEXT: vslideup.vi v0, v24, 2 ; CHECK-NEXT: ret %mask = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 %index, i64 %tc) ret <32 x i1> %mask @@ -122,29 +121,26 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI9_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI9_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -157,61 +153,54 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv128: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma 
-; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI10_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI10_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 6 -; CHECK-NEXT: lui a0, %hi(.LCPI10_3) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) +; CHECK-NEXT: vslideup.vi v0, v24, 6 +; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 8 -; CHECK-NEXT: lui a0, %hi(.LCPI10_4) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) +; CHECK-NEXT: vslideup.vi v0, v24, 8 +; CHECK-NEXT: li a0, 80 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 10 -; CHECK-NEXT: lui a0, %hi(.LCPI10_5) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) +; CHECK-NEXT: vslideup.vi v0, v24, 10 +; CHECK-NEXT: li a0, 96 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 12 -; CHECK-NEXT: lui a0, %hi(.LCPI10_6) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) +; CHECK-NEXT: vslideup.vi v0, v24, 12 +; CHECK-NEXT: li a0, 112 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -255,38 +255,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui 
a0, %hi(.LCPI10_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -296,7 +283,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -306,38 +293,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; 
RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -347,7 +321,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -414,38 +414,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 
16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -455,7 +442,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -465,38 +452,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -506,7 +480,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll =================================================================== --- 
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1793,23 +1793,21 @@ ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: lui a2, %hi(.LCPI72_0) -; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV32-NEXT: vle32.v v16, (a2) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: vid.v v24 -; RV32-NEXT: vmsltu.vx v12, v24, a1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vid.v v16 +; RV32-NEXT: vmsltu.vx v12, v16, a1 +; RV32-NEXT: vadd.vx v16, v16, a0 ; RV32-NEXT: vmsltu.vx v13, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v12, v13, 4 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV32-NEXT: vmand.mm v0, v12, v0 ; RV32-NEXT: vmv.v.i v12, 1 ; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV32-NEXT: vslidedown.vx v12, v8, a3 +; RV32-NEXT: vslidedown.vx v12, v8, a0 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vmul.vv v8, v8, v12 @@ -1836,23 +1834,21 @@ ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: li a3, 32 -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: lui a2, %hi(.LCPI72_0) -; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV64-NEXT: vle32.v v16, (a2) ; RV64-NEXT: mv a2, a0 -; RV64-NEXT: vid.v v24 -; RV64-NEXT: vmsltu.vx v12, v24, a1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vid.v v16 +; RV64-NEXT: vmsltu.vx v12, v16, a1 +; RV64-NEXT: vadd.vx v16, v16, a0 ; RV64-NEXT: vmsltu.vx v13, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v12, v13, 4 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV64-NEXT: vmand.mm v0, v12, v0 ; RV64-NEXT: vmv.v.i v12, 1 ; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV64-NEXT: vslidedown.vx v12, v8, a3 +; RV64-NEXT: vslidedown.vx v12, v8, a0 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vmul.vv v8, v8, v12 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -102,9 +102,9 @@ ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 32 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; NO-ZVBB-NEXT: lui a0, %hi(.LCPI4_0) -; NO-ZVBB-NEXT: addi a0, a0, %lo(.LCPI4_0) -; NO-ZVBB-NEXT: vle8.v v8, (a0) +; NO-ZVBB-NEXT: vid.v v8 +; NO-ZVBB-NEXT: li a0, 31 +; NO-ZVBB-NEXT: vrsub.vx v8, v8, a0 ; NO-ZVBB-NEXT: vmv.v.i v10, 0 ; NO-ZVBB-NEXT: vmerge.vim v10, v10, 1, v0 ; NO-ZVBB-NEXT: vrgather.vv v12, v10, v8 @@ -125,9 +125,9 @@ ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 64 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NO-ZVBB-NEXT: lui a0, %hi(.LCPI5_0) -; NO-ZVBB-NEXT: addi a0, a0, %lo(.LCPI5_0) -; NO-ZVBB-NEXT: vle8.v v8, (a0) +; NO-ZVBB-NEXT: vid.v v8 +; NO-ZVBB-NEXT: li a0, 63 +; NO-ZVBB-NEXT: vrsub.vx v8, v8, a0 ; NO-ZVBB-NEXT: vmv.v.i v12, 0 ; NO-ZVBB-NEXT: vmerge.vim v12, v12, 1, v0 ; NO-ZVBB-NEXT: vrgather.vv v16, v12, v8 @@ -148,9 +148,9 @@ ; CHECK: # %bb.0: ; 
CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vrgather.vv v24, v16, v8 @@ -222,11 +222,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -237,11 +237,11 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI13_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -311,11 +311,11 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -625,11 +625,11 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,41 +106,39 @@ define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; RV32-LABEL: v16i8_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI7_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV32-NEXT: vle8.v v12, (a0) -; RV32-NEXT: vmv1r.v v14, v9 -; RV32-NEXT: vrgather.vv v10, v8, v12 -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vrsub.vi v8, v8, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vmv1r.v v12, v9 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV32-NEXT: vid.v v14 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v16, v14, a1 +; RV32-NEXT: vrgather.vv v10, v8, v16 +; RV32-NEXT: vrsub.vi v8, v14, 15 
+; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV32-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV32-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i8_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI7_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV64-NEXT: vle8.v v12, (a0) -; RV64-NEXT: vmv1r.v v14, v9 -; RV64-NEXT: vrgather.vv v10, v8, v12 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vrsub.vi v8, v8, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vmv1r.v v12, v9 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vid.v v14 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v16, v14, a1 +; RV64-NEXT: vrgather.vv v10, v8, v16 +; RV64-NEXT: vrsub.vi v8, v14, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV64-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV64-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret %v32i8 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -252,41 +250,39 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-LABEL: v16i16_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI15_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vmv2r.v v20, v10 -; RV32-NEXT: vrgather.vv v12, v8, v16 -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vrsub.vi v8, v8, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vmv2r.v v16, v10 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v24, v20, a1 +; RV32-NEXT: vrgather.vv v12, v8, v24 +; RV32-NEXT: vrsub.vi v8, v20, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV32-NEXT: vrgather.vv v12, v20, v8, v0.t +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; RV32-NEXT: vrgather.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI15_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV64-NEXT: vle16.v v16, (a0) -; RV64-NEXT: vmv2r.v v20, v10 -; RV64-NEXT: vrgather.vv v12, v8, v16 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vrsub.vi v8, v8, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vmv2r.v v16, v10 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV64-NEXT: vid.v v20 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v24, v20, a1 +; RV64-NEXT: vrgather.vv v12, v8, v24 +; RV64-NEXT: vrsub.vi v8, v20, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV64-NEXT: vrgather.vv v12, v20, v8, v0.t +; 
RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; RV64-NEXT: vrgather.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> @@ -401,42 +397,68 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; RV32-LABEL: v16i32_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI23_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v0, (a0) -; RV32-NEXT: vmv4r.v v24, v12 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: vmv4r.v v16, v12 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vmv4r.v v16, v8 -; RV32-NEXT: vrgather.vv v8, v16, v0 -; RV32-NEXT: vid.v v16 -; RV32-NEXT: vrsub.vi v16, v16, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vid.v v0 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v24, v0, a1 +; RV32-NEXT: vrgather.vv v8, v16, v24 +; RV32-NEXT: vrsub.vi v16, v0, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i32_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI23_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v0, (a0) -; RV64-NEXT: vmv4r.v v24, v12 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: vmv4r.v v16, v12 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v16, v8 -; RV64-NEXT: vrgather.vv v8, v16, v0 -; RV64-NEXT: vid.v v16 -; RV64-NEXT: vrsub.vi v16, v16, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vid.v v0 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v24, v0, a1 +; RV64-NEXT: vrgather.vv v8, v16, v24 +; RV64-NEXT: vrsub.vi v16, v0, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v32i32 = shufflevector <16 x i32> 
%a, <16 x i32> %b, <32 x i32> ret <32 x i32> %v32i32 @@ -632,11 +654,11 @@ define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -824,11 +846,11 @@ define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret