diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3288,10 +3288,8 @@ // threshold since it's the immediate value many RVV instructions accept. // There is no vmul.vi instruction so ensure multiply constant can fit in // a single addi instruction. - if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) || - (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) && - isPowerOf2_32(StepDenominator) && - (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) { + if (isPowerOf2_32(StepDenominator) && + (SplatStepVal >= 0 || StepDenominator == 1)) { AddLowering(Cost, [=, &DAG, &Subtarget]() { MVT VIDVT = VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -104,16 +104,15 @@ ; CHECK-LABEL: fv32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 +; CHECK-NEXT: vslideup.vi v0, v24, 2 ; CHECK-NEXT: ret %mask = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 %index, i64 %tc) ret <32 x i1> %mask @@ -123,28 +122,25 @@ ; CHECK-LABEL: fv64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI9_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI9_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -158,60 +154,53 @@ ; CHECK-LABEL: fv128: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: vadd.vx v16, v8, a0 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI10_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) +; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI10_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) +; CHECK-NEXT: vslideup.vi v0, v24, 4 +; CHECK-NEXT: li a0, 48 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 6 -; CHECK-NEXT: lui a0, %hi(.LCPI10_3) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) +; CHECK-NEXT: vslideup.vi v0, v24, 6 +; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 8 -; CHECK-NEXT: lui a0, %hi(.LCPI10_4) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) +; CHECK-NEXT: vslideup.vi v0, v24, 8 +; CHECK-NEXT: li a0, 80 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 10 -; CHECK-NEXT: lui a0, %hi(.LCPI10_5) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) +; CHECK-NEXT: vslideup.vi v0, v24, 10 +; CHECK-NEXT: li a0, 96 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v24, v16, a2 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 12 -; CHECK-NEXT: lui a0, %hi(.LCPI10_6) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) +; CHECK-NEXT: vslideup.vi v0, v24, 12 +; CHECK-NEXT: li a0, 112 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -255,38 +255,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -296,7 +283,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -306,38 +293,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -347,7 +321,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -53,10 +53,10 @@ define void @buildvec_vid_plus_nonimm_v16i8(ptr %x) { ; CHECK-LABEL: buildvec_vid_plus_nonimm_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI4_0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a1, 100 +; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <16 x i8> , ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -414,38 +414,25 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: sub sp, sp, a0 -; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV32-V128-NEXT: li a1, 32 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-V128-NEXT: vmv8r.v v0, v8 ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 -; RV32-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV32-V128-NEXT: vle32.v v24, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-V128-NEXT: lui a0, 699051 -; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: li a0, 32 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v8, v8, 1 +; RV32-V128-NEXT: li a1, 16 +; RV32-V128-NEXT: vadd.vx v8, v8, a1 +; RV32-V128-NEXT: vrgather.vv v24, v0, v8 +; RV32-V128-NEXT: lui a1, 699051 +; RV32-V128-NEXT: addi a1, a1, -1366 ; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-V128-NEXT: vmv.v.x v0, a0 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vmv.v.x v0, a1 +; RV32-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -455,7 +442,7 @@ ; RV32-V128-NEXT: vmv8r.v v8, v0 ; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -465,38 +452,25 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: sub sp, sp, a0 -; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_0) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV64-V128-NEXT: li a1, 32 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-V128-NEXT: vmv8r.v v0, v8 ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 -; RV64-V128-NEXT: lui a0, %hi(.LCPI17_1) -; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; RV64-V128-NEXT: vle32.v v24, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-V128-NEXT: lui a0, 699051 -; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: li a0, 32 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v8, v8, 1 +; RV64-V128-NEXT: li a1, 16 +; RV64-V128-NEXT: vadd.vx v8, v8, a1 +; RV64-V128-NEXT: vrgather.vv v24, v0, v8 +; RV64-V128-NEXT: lui a1, 699051 +; RV64-V128-NEXT: addiw a1, a1, -1366 ; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-V128-NEXT: vmv.v.x v0, a0 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vmv.v.x v0, a1 +; RV64-V128-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-V128-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -506,7 +480,7 @@ ; RV64-V128-NEXT: vmv8r.v v8, v0 ; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1793,23 +1793,21 @@ ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: lui a2, %hi(.LCPI72_0) -; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a2) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: vmsltu.vx v12, v16, a1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vid.v v16 +; RV32-NEXT: vadd.vx v24, v16, a0 +; RV32-NEXT: vmsltu.vx v12, v24, a1 ; RV32-NEXT: vmsltu.vx v13, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v13, v12, 4 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV32-NEXT: vmand.mm v0, v13, v0 ; RV32-NEXT: vmv.v.i v12, 1 ; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV32-NEXT: vslidedown.vx v12, v8, a3 +; RV32-NEXT: vslidedown.vx v12, v8, a0 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vmul.vv v8, v8, v12 @@ -1836,23 +1834,21 @@ ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: lui a2, %hi(.LCPI72_0) -; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV64-NEXT: li a3, 32 -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v16, (a2) ; RV64-NEXT: mv a2, a0 -; RV64-NEXT: vmsltu.vx v12, v16, a1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV64-NEXT: vid.v v16 +; RV64-NEXT: vadd.vx v24, v16, a0 +; RV64-NEXT: vmsltu.vx v12, v24, a1 ; RV64-NEXT: vmsltu.vx v13, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v13, v12, 4 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; RV64-NEXT: vmand.mm v0, v13, v0 ; RV64-NEXT: vmv.v.i v12, 1 ; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV64-NEXT: vslidedown.vx v12, v8, a3 +; RV64-NEXT: vslidedown.vx v12, v8, a0 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vmul.vv v8, v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -72,11 +72,11 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) { ; CHECK-LABEL: reverse_v32i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vrgather.vv v12, v10, v8 @@ -89,11 +89,11 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) { ; CHECK-LABEL: reverse_v64i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vrgather.vv v16, v12, v8 @@ -166,11 +166,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI11_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -181,11 +181,11 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -255,11 +255,11 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -531,11 +531,11 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI33_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,41 +106,39 @@ define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; RV32-LABEL: v16i8_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI7_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV32-NEXT: vle8.v v12, (a0) -; RV32-NEXT: vmv1r.v v14, v9 -; RV32-NEXT: vrgather.vv v10, v8, v12 -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vrsub.vi v8, v8, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vmv1r.v v12, v9 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV32-NEXT: vid.v v14 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v16, v14, a1 +; RV32-NEXT: vrgather.vv v10, v8, v16 +; RV32-NEXT: vrsub.vi v8, v14, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV32-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV32-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i8_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI7_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; RV64-NEXT: vle8.v v12, (a0) -; RV64-NEXT: vmv1r.v v14, v9 -; RV64-NEXT: vrgather.vv v10, v8, v12 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vrsub.vi v8, v8, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vmv1r.v v12, v9 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vid.v v14 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v16, v14, a1 +; RV64-NEXT: vrgather.vv v10, v8, v16 +; RV64-NEXT: vrsub.vi v8, v14, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; RV64-NEXT: vrgather.vv v10, v14, v8, v0.t +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; RV64-NEXT: vrgather.vv v10, v12, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret %v32i8 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -252,41 +250,39 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-LABEL: v16i16_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI15_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: vmv2r.v v16, v10 ; RV32-NEXT: vmv2r.v v12, v8 -; RV32-NEXT: vrgather.vv v8, v12, v20 -; RV32-NEXT: vid.v v12 -; RV32-NEXT: vrsub.vi v12, v12, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v24, v20, a1 +; RV32-NEXT: vrgather.vv v8, v12, v24 +; RV32-NEXT: vrsub.vi v12, v20, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; RV32-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI15_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV64-NEXT: vle16.v v20, (a0) ; RV64-NEXT: vmv2r.v v16, v10 ; RV64-NEXT: vmv2r.v v12, v8 -; RV64-NEXT: vrgather.vv v8, v12, v20 -; RV64-NEXT: vid.v v12 -; RV64-NEXT: vrsub.vi v12, v12, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; RV64-NEXT: vid.v v20 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v24, v20, a1 +; RV64-NEXT: vrgather.vv v8, v12, v24 +; RV64-NEXT: vrsub.vi v12, v20, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; RV64-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> @@ -401,42 +397,88 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; RV32-LABEL: v16i32_2: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI23_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v0, (a0) -; RV32-NEXT: vmv4r.v v24, v12 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV32-NEXT: vmv4r.v v16, v12 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vmv4r.v v16, v8 -; RV32-NEXT: vrgather.vv v8, v16, v0 -; RV32-NEXT: vid.v v16 -; RV32-NEXT: vrsub.vi v16, v16, 15 -; RV32-NEXT: lui a0, 16 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vid.v v0 +; RV32-NEXT: li a1, 31 +; RV32-NEXT: vrsub.vx v8, v0, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v16, v24 +; RV32-NEXT: vrsub.vi v16, v0, 15 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a0 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: v16i32_2: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI23_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI23_0) -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v0, (a0) -; RV64-NEXT: vmv4r.v v24, v12 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: vmv4r.v v16, v12 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v16, v8 -; RV64-NEXT: vrgather.vv v8, v16, v0 -; RV64-NEXT: vid.v v16 -; RV64-NEXT: vrsub.vi v16, v16, 15 -; RV64-NEXT: lui a0, 16 -; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vid.v v0 +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vrsub.vx v8, v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v8, v16, v24 +; RV64-NEXT: vrsub.vi v16, v0, 15 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v0, a0 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v32i32 = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %v32i32 @@ -632,11 +674,11 @@ define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -824,11 +866,11 @@ define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 31 +; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret