Index: llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9712,6 +9712,12 @@
     if (!NeedsIdxLegalization)
       break;
 
+    // HACK: Using the DL of the index instead of the memory operation causes
+    // default scheduling to keep the scale with the indexing. This is often
+    // profitable as the SEW for the indexed operation is likely to differ
+    // from the SEW of the scale, and if we're e.g. interleaved by the
+    // vectorizer we'd really like not to mix computation in two data widths.
+    SDLoc IndexDL(Index);
     SDLoc DL(N);
 
     // Any index legalization should first promote to XLenVT, so we don't lose
@@ -9721,7 +9727,7 @@
     if (IndexVT.getVectorElementType().bitsLT(XLenVT)) {
       IndexVT = IndexVT.changeVectorElementType(XLenVT);
       Index = DAG.getNode(IsIndexSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
-                          DL, IndexVT, Index);
+                          IndexDL, IndexVT, Index);
     }
 
     if (IsIndexScaled) {
@@ -9730,9 +9736,9 @@
       // TODO: For VP nodes, should we use VP_SHL here?
       unsigned Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
       assert(isPowerOf2_32(Scale) && "Expecting power-of-two types");
-      SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT);
-      Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale);
-      ScaleOp = DAG.getTargetConstant(1, DL, ScaleOp.getValueType());
+      SDValue SplatScale = DAG.getConstant(Log2_32(Scale), IndexDL, IndexVT);
+      Index = DAG.getNode(ISD::SHL, IndexDL, IndexVT, Index, SplatScale);
+      ScaleOp = DAG.getTargetConstant(1, IndexDL, ScaleOp.getValueType());
     }
 
     ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_SCALED;
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -12657,24 +12657,23 @@
 ;
 ; RV64V-LABEL: mgather_baseidx_v32i8:
 ; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, mu
+; RV64V-NEXT:    vslidedown.vi v12, v8, 16
 ; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; RV64V-NEXT:    vsext.vf8 v16, v8
+; RV64V-NEXT:    vsext.vf8 v16, v12
+; RV64V-NEXT:    vsext.vf8 v24, v8
 ; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-NEXT:    vmv1r.v v12, v10
-; RV64V-NEXT:    vluxei64.v v12, (a0), v16, v0.t
+; RV64V-NEXT:    vmv1r.v v8, v10
+; RV64V-NEXT:    vluxei64.v v8, (a0), v24, v0.t
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, mu
 ; RV64V-NEXT:    vslidedown.vi v10, v10, 16
-; RV64V-NEXT:    vslidedown.vi v8, v8, 16
-; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; RV64V-NEXT:    vsext.vf8 v16, v8
 ; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
 ; RV64V-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v10, (a0), v16, v0.t
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m2, tu, mu
-; RV64V-NEXT:    vslideup.vi v12, v10, 16
-; RV64V-NEXT:    vmv2r.v v8, v12
+; RV64V-NEXT:    vslideup.vi v8, v10, 16
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_baseidx_v32i8:
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -11014,15 +11014,15 @@
 ;
 ; RV64-LABEL: mscatter_baseidx_v32i8:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, mu
+; RV64-NEXT:    vslidedown.vi v12, v10, 16
 ; RV64-NEXT:
vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsext.vf8 v24, v10 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: vslidedown.vi v10, v10, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -285,28 +285,27 @@ ; ; RV64-LABEL: vpgather_baseidx_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vmv1r.v v10, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf8 v24, v12 ; RV64-NEXT: bltu a1, a3, .LBB13_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB13_2: -; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu -; RV64-NEXT: vslidedown.vi v12, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 ; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t ; RV64-NEXT: li a2, 16 ; RV64-NEXT: bltu a1, a2, .LBB13_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB13_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu ; RV64-NEXT: vmv1r.v v0, v10 ; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t @@ -1977,16 +1976,16 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(double* %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32i8_v32f64: ; RV32: # %bb.0: +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v16, v8 ; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB87_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB87_2: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a3, a1, -16 @@ -2005,20 +2004,22 @@ ; ; RV64-LABEL: vpgather_baseidx_v32i8_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vmv1r.v v10, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB87_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB87_2: +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 -; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu -; RV64-NEXT: 
vslidedown.vi v12, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v12 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2026,9 +2027,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB87_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v10 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2046,14 +2044,15 @@ ; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: addi a3, a1, -16 ; RV32-NEXT: vsext.vf8 v16, v12 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsext.vf8 v24, v8 ; RV32-NEXT: bltu a1, a3, .LBB88_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB88_2: -; RV32-NEXT: vsext.vf8 v24, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v12, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu @@ -2065,8 +2064,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB88_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v4, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -2081,17 +2078,17 @@ ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB88_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB88_2: -; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2099,8 +2096,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB88_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v10 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2119,14 +2114,15 @@ ; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: addi a3, a1, -16 ; RV32-NEXT: vzext.vf8 v16, v12 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vzext.vf8 v24, v8 ; RV32-NEXT: bltu a1, a3, .LBB89_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB89_2: -; RV32-NEXT: vzext.vf8 v24, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v12, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu @@ -2138,8 +2134,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB89_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v4, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ 
-2154,17 +2148,17 @@ ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vzext.vf8 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB89_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB89_2: -; RV64-NEXT: vzext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2172,8 +2166,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB89_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v10 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2187,16 +2179,16 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(double* %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32i16_v32f64: ; RV32: # %bb.0: +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vsext.vf2 v16, v8 ; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB90_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB90_2: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a3, a1, -16 @@ -2215,20 +2207,22 @@ ; ; RV64-LABEL: vpgather_baseidx_v32i16_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB90_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB90_2: +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 -; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu -; RV64-NEXT: vslidedown.vi v16, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v24, v16 -; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2236,9 +2230,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB90_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2254,16 +2245,17 @@ ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 ; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vsext.vf4 v16, v24 +; RV32-NEXT: vsext.vf4 v24, v8 ; RV32-NEXT: bltu a1, 
a3, .LBB91_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB91_2: -; RV32-NEXT: vsext.vf4 v24, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v8, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu @@ -2275,8 +2267,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB91_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v4, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -2289,19 +2279,19 @@ ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a2, 0 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: addi a3, a1, -16 -; RV64-NEXT: vsext.vf4 v16, v24 +; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB91_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB91_2: -; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2309,8 +2299,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB91_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2327,16 +2315,17 @@ ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vzext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 ; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vzext.vf4 v16, v24 +; RV32-NEXT: vzext.vf4 v24, v8 ; RV32-NEXT: bltu a1, a3, .LBB92_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB92_2: -; RV32-NEXT: vzext.vf4 v24, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v8, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu @@ -2348,8 +2337,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB92_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v4, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -2362,19 +2349,19 @@ ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a2, 0 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vzext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: addi a3, a1, -16 -; RV64-NEXT: vzext.vf4 v16, v24 +; RV64-NEXT: vzext.vf4 v24, v8 ; RV64-NEXT: bltu a1, a3, .LBB92_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB92_2: -; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi 
v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2382,8 +2369,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB92_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2397,15 +2382,15 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32i32_v32f64: ; RV32: # %bb.0: +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v8, 3 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB93_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB93_2: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV32-NEXT: vsll.vi v16, v8, 3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a3, a1, -16 @@ -2424,20 +2409,22 @@ ; ; RV64-LABEL: vpgather_baseidx_v32i32_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, -16 -; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v0, v16 +; RV64-NEXT: vsll.vi v16, v0, 3 +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf2 v0, v8 ; RV64-NEXT: bltu a1, a3, .LBB93_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB93_2: +; RV64-NEXT: vsll.vi v8, v0, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v1, 2 -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v16, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf2 v24, v16 -; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ -2445,11 +2432,8 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB93_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf2 v24, v8 -; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, <32 x i32> %idxs @@ -2460,68 +2444,65 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v32i32_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: vmv1r.v v24, v0 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsext.vf2 v0, v16 +; RV32-NEXT: vsll.vi v16, v0, 3 ; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vsext.vf2 v16, v24 +; RV32-NEXT: vsext.vf2 v0, v8 ; RV32-NEXT: bltu a1, a3, .LBB94_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB94_2: -; RV32-NEXT: vsext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v8, v0, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v4, v8, 0 +; RV32-NEXT: vnsrl.wi v28, v16, 0 ; RV32-NEXT: 
vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v1, 2 +; RV32-NEXT: vslidedown.vi v0, v24, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: li a2, 16 ; RV32-NEXT: bltu a1, a2, .LBB94_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB94_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vnsrl.wi v28, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vluxei32.v v8, (a0), v28, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: li a2, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v0, v16 +; RV64-NEXT: vsll.vi v16, v0, 3 ; RV64-NEXT: addi a3, a1, -16 -; RV64-NEXT: vsext.vf2 v16, v24 +; RV64-NEXT: vsext.vf2 v0, v8 ; RV64-NEXT: bltu a1, a3, .LBB94_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB94_2: -; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v0, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v1, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 ; RV64-NEXT: bltu a1, a2, .LBB94_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB94_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = sext <32 x i32> %idxs to <32 x i64> @@ -2533,68 +2514,65 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v32i32_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: vmv1r.v v24, v0 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vzext.vf2 v0, v16 +; RV32-NEXT: vsll.vi v16, v0, 3 ; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vzext.vf2 v16, v24 +; RV32-NEXT: vzext.vf2 v0, v8 ; RV32-NEXT: bltu a1, a3, .LBB95_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB95_2: -; RV32-NEXT: vzext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v8, v0, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v4, v8, 0 +; RV32-NEXT: vnsrl.wi v28, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v1, 2 +; RV32-NEXT: vslidedown.vi v0, v24, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: li a2, 16 ; RV32-NEXT: bltu a1, a2, .LBB95_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB95_4: -; RV32-NEXT: 
vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vnsrl.wi v28, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vluxei32.v v8, (a0), v28, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: li a2, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vzext.vf2 v0, v16 +; RV64-NEXT: vsll.vi v16, v0, 3 ; RV64-NEXT: addi a3, a1, -16 -; RV64-NEXT: vzext.vf2 v16, v24 +; RV64-NEXT: vzext.vf2 v0, v8 ; RV64-NEXT: bltu a1, a3, .LBB95_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB95_2: -; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v0, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v1, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 ; RV64-NEXT: bltu a1, a2, .LBB95_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB95_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <32 x i32> %idxs to <32 x i64> @@ -2606,15 +2584,16 @@ define <32 x double> @vpgather_baseidx_v32f64(double* %base, <32 x i64> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: addi a3, a1, -16 ; RV32-NEXT: vmv1r.v v24, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: bltu a1, a3, .LBB96_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB96_2: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v8, v8, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v28, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu @@ -2626,8 +2605,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB96_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v8, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v28, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -2637,17 +2614,18 @@ ; ; RV64-LABEL: vpgather_baseidx_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, -16 ; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: bltu a1, a3, .LBB96_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB96_2: +; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v24, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: li a2, 16 @@ 
-2655,8 +2633,6 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB96_4: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1805,26 +1805,26 @@ ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: mv a1, a2 -; RV32-NEXT: bltu a2, a3, .LBB80_2 -; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB80_2: -; RV32-NEXT: li a3, 0 ; RV32-NEXT: vsll.vi v24, v24, 3 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB80_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a3, 16 +; RV32-NEXT: .LBB80_2: +; RV32-NEXT: li a1, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: addi a3, a2, -16 ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t -; RV32-NEXT: bltu a2, a1, .LBB80_4 +; RV32-NEXT: bltu a2, a3, .LBB80_4 ; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB80_4: ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: ret ; @@ -1833,57 +1833,52 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 10 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV64-NEXT: vle32.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 16 -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill -; RV64-NEXT: mv a3, a2 -; RV64-NEXT: bltu a2, a1, .LBB80_2 -; RV64-NEXT: # %bb.1: +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v24, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v8, v16 +; RV64-NEXT: vsext.vf2 v16, v24 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsll.vi v24, v16, 3 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: bltu a2, a3, .LBB80_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB80_2: -; RV64-NEXT: li a1, 0 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vl8re8.v v0, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsext.vf2 v24, v0 -; RV64-NEXT: vsll.vi v24, v24, 3 -; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu -; RV64-NEXT: addi a3, a2, -16 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: vsll.vi v16, v8, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload +; 
RV64-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t -; RV64-NEXT: bltu a2, a3, .LBB80_4 +; RV64-NEXT: bltu a2, a1, .LBB80_4 ; RV64-NEXT: # %bb.3: -; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB80_4: ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf2 v24, v8 -; RV64-NEXT: vsll.vi v8, v24, 3 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 10 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -1911,33 +1906,32 @@ ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vslidedown.vi v16, v24, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsext.vf2 v8, v16 ; RV32-NEXT: vsext.vf2 v16, v24 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: bltu a2, a3, .LBB81_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB81_2: ; RV32-NEXT: li a3, 0 -; RV32-NEXT: vsext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v16, v8, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vnsrl.wi v8, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV32-NEXT: addi a1, a2, -16 ; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v24, (a0), v8, v0.t ; RV32-NEXT: bltu a2, a1, .LBB81_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: .LBB81_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vnsrl.wi v8, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu @@ -1945,8 +1939,8 @@ ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -1958,65 +1952,52 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 24 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV64-NEXT: vle32.v v24, (a1) ; RV64-NEXT: csrr a1, 
vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v24, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v8, v16 +; RV64-NEXT: vsext.vf2 v16, v24 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsext.vf2 v8, v24 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB81_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB81_2: ; RV64-NEXT: li a3, 0 -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsll.vi v16, v8, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: addi a1, a2, -16 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 3 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: bltu a2, a1, .LBB81_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB81_4: ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -2045,33 +2026,32 @@ ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vslidedown.vi v16, v24, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: li a3, 16 +; RV32-NEXT: vzext.vf2 v8, v16 ; RV32-NEXT: vzext.vf2 v16, v24 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: bltu a2, a3, .LBB82_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB82_2: ; RV32-NEXT: li a3, 0 -; RV32-NEXT: vzext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v16, v8, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vnsrl.wi v8, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV32-NEXT: addi a1, a2, -16 ; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; 
RV32-NEXT: vsoxei32.v v24, (a0), v8, v0.t ; RV32-NEXT: bltu a2, a1, .LBB82_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: .LBB82_4: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vnsrl.wi v8, v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu @@ -2079,8 +2059,8 @@ ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -2092,65 +2072,52 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 24 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV64-NEXT: vle32.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v24, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vzext.vf2 v8, v16 +; RV64-NEXT: vzext.vf2 v16, v24 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vzext.vf2 v8, v24 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB82_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB82_2: ; RV64-NEXT: li a3, 0 -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vzext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsll.vi v16, v8, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: addi a1, a2, -16 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 3 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: bltu a2, a1, .LBB82_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB82_4: ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb 
-; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret Index: llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2210,16 +2210,15 @@ ; RV64-LABEL: mgather_baseidx_nxv16i8: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v10, (a0), v24, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v9 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v11, (a0), v16, v0.t ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret @@ -2234,24 +2233,44 @@ ; RV32-LABEL: mgather_baseidx_nxv32i8: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu -; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsext.vf4 v16, v10 +; RV32-NEXT: vsext.vf4 v24, v8 ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32-NEXT: vluxei32.v v12, (a0), v24, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, mu ; RV32-NEXT: vslidedown.vx v0, v0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu -; RV32-NEXT: vsext.vf4 v16, v10 -; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, mu ; RV32-NEXT: vluxei32.v v14, (a0), v16, v0.t ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: mgather_baseidx_nxv32i8: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: vmv1r.v v16, v0 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v11 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsext.vf8 v24, v10 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsext.vf8 v24, v9 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t @@ -2259,24 +2278,35 @@ ; RV64-NEXT: srli a2, a1, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v0, a2 -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v9 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vsetvli a3, zero, e8, m1, ta, mu +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu ; RV64-NEXT: vslidedown.vx v0, v16, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v10 -; 
RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v0, a2 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v11 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs %v = call @llvm.masked.gather.nxv32i8.nxv32p0i8( %ptrs, i32 2, %m, %passthru) Index: llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1895,13 +1895,13 @@ ; RV64-NEXT: vsext.vf8 v24, v2 ; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vsext.vf8 v8, v3 +; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v8, v3 -; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, %idxs @@ -1935,13 +1935,13 @@ ; RV64-NEXT: vsext.vf4 v24, v4 ; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vsext.vf4 v8, v6 +; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v8, v6 -; RV64-NEXT: vsll.vi v8, v8, 3 ; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, %idxs Index: llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -258,6 +258,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a3, 0 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v24, v10 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: srli a5, a2, 2 ; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, mu @@ -269,83 +271,92 @@ ; RV32-NEXT: mv a3, a4 ; RV32-NEXT: .LBB12_2: ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, mu -; RV32-NEXT: vsext.vf4 v24, v10 +; RV32-NEXT: vsext.vf4 v16, v8 ; RV32-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV32-NEXT: vluxei32.v v18, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v10, (a0), v24, v0.t ; RV32-NEXT: bltu a1, a2, .LBB12_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB12_4: -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, mu -; RV32-NEXT: vsext.vf4 v24, v8 ; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; RV32-NEXT: vmv1r.v v0, v12 -; 
RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t -; RV32-NEXT: vmv4r.v v8, v16 +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_nxv32i8: ; RV64: # %bb.0: -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a5, a3, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a5, a2, 1 ; RV64-NEXT: sub a6, a1, a5 ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a4, 0 -; RV64-NEXT: li a2, 0 +; RV64-NEXT: li a3, 0 ; RV64-NEXT: bltu a1, a6, .LBB12_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a2, a6 +; RV64-NEXT: mv a3, a6 ; RV64-NEXT: .LBB12_2: -; RV64-NEXT: sub a6, a2, a3 +; RV64-NEXT: vsetvli a6, zero, e64, m8, ta, mu +; RV64-NEXT: sub a6, a3, a2 +; RV64-NEXT: vsext.vf8 v24, v11 ; RV64-NEXT: mv a7, a4 -; RV64-NEXT: bltu a2, a6, .LBB12_4 +; RV64-NEXT: bltu a3, a6, .LBB12_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a7, a6 ; RV64-NEXT: .LBB12_4: -; RV64-NEXT: srli a6, a3, 2 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a6, vlenb +; RV64-NEXT: slli a6, a6, 3 +; RV64-NEXT: sub sp, sp, a6 +; RV64-NEXT: srli a6, a2, 2 ; RV64-NEXT: vsetvli t0, zero, e8, mf2, ta, mu -; RV64-NEXT: vslidedown.vx v13, v12, a6 -; RV64-NEXT: srli a6, a3, 3 +; RV64-NEXT: vslidedown.vx v20, v12, a6 +; RV64-NEXT: srli a6, a2, 3 ; RV64-NEXT: vsetvli t0, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v13, a6 -; RV64-NEXT: vsetvli t0, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v11 +; RV64-NEXT: vslidedown.vx v0, v20, a6 ; RV64-NEXT: vsetvli zero, a7, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v19, (a0), v24, v0.t ; RV64-NEXT: bltu a1, a5, .LBB12_6 ; RV64-NEXT: # %bb.5: ; RV64-NEXT: mv a1, a5 ; RV64-NEXT: .LBB12_6: -; RV64-NEXT: sub a5, a1, a3 +; RV64-NEXT: vsetvli a5, zero, e64, m8, ta, mu +; RV64-NEXT: sub a5, a1, a2 +; RV64-NEXT: vsext.vf8 v24, v9 ; RV64-NEXT: bltu a1, a5, .LBB12_8 ; RV64-NEXT: # %bb.7: ; RV64-NEXT: mv a4, a5 ; RV64-NEXT: .LBB12_8: +; RV64-NEXT: vsext.vf8 v0, v8 +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vx v0, v12, a6 -; RV64-NEXT: vsetvli a5, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v9 ; RV64-NEXT: vsetvli zero, a4, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v17, (a0), v24, v0.t -; RV64-NEXT: bltu a1, a3, .LBB12_10 +; RV64-NEXT: bltu a1, a2, .LBB12_10 ; RV64-NEXT: # %bb.9: -; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB12_10: ; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsext.vf8 v24, v10 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu ; RV64-NEXT: vmv1r.v v0, v12 -; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t -; RV64-NEXT: bltu a2, a3, .LBB12_12 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: bltu a3, a2, .LBB12_12 ; RV64-NEXT: # %bb.11: -; RV64-NEXT: mv a2, a3 +; RV64-NEXT: mv a3, a2 ; RV64-NEXT: .LBB12_12: -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v24, v10 -; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu -; RV64-NEXT: vmv1r.v v0, v13 +; RV64-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v20 ; RV64-NEXT: vluxei64.v v18, (a0), v24, v0.t ; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs %v = call @llvm.vp.gather.nxv32i8.nxv32p0i8( %ptrs, %m, i32 %evl) @@ 
-2394,6 +2405,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a3, 0 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf2 v16, v8 +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: srli a5, a2, 3 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu @@ -2403,9 +2417,6 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a4 ; RV32-NEXT: .LBB103_2: -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, mu -; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: bltu a1, a2, .LBB103_4 @@ -2421,6 +2432,10 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a3, 0 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: srli a5, a2, 3 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu @@ -2431,17 +2446,13 @@ ; RV64-NEXT: mv a3, a4 ; RV64-NEXT: .LBB103_2: ; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v16, v10 -; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: bltu a1, a2, .LBB103_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB103_4: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t @@ -2455,20 +2466,21 @@ ; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vsext.vf4 v16, v10 +; RV32-NEXT: vsext.vf4 v24, v8 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: bltu a1, a2, .LBB104_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: .LBB104_2: ; RV32-NEXT: li a4, 0 -; RV32-NEXT: vsext.vf4 v24, v10 -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu -; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vnsrl.wi v24, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: srli a3, a2, 3 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a2, a1, a2 @@ -2477,8 +2489,6 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a2 ; RV32-NEXT: .LBB104_4: -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v16, v24, 3 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu ; RV32-NEXT: vnsrl.wi v24, v16, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -2490,7 +2500,9 @@ ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a3, 0 ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: srli a5, a2, 3 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu @@ -2501,16 +2513,13 @@ ; RV64-NEXT: mv a3, a4 ; RV64-NEXT: .LBB104_2: ; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v24, v8 -; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: bltu a1, a2, .LBB104_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a1, a2 ; 
RV64-NEXT: .LBB104_4:
-; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu
-; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT: vmv1r.v v0, v12
 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
@@ -2525,20 +2534,21 @@
 ; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
 ; RV32: # %bb.0:
 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf4 v16, v10
+; RV32-NEXT: vzext.vf4 v24, v8
 ; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vzext.vf4 v16, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
 ; RV32-NEXT: mv a3, a1
 ; RV32-NEXT: bltu a1, a2, .LBB105_2
 ; RV32-NEXT: # %bb.1:
 ; RV32-NEXT: mv a3, a2
 ; RV32-NEXT: .LBB105_2:
 ; RV32-NEXT: li a4, 0
-; RV32-NEXT: vzext.vf4 v24, v10
-; RV32-NEXT: vsll.vi v8, v16, 3
+; RV32-NEXT: vsll.vi v16, v16, 3
 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu
-; RV32-NEXT: vnsrl.wi v16, v8, 0
+; RV32-NEXT: vnsrl.wi v24, v8, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT: srli a3, a2, 3
 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV32-NEXT: sub a2, a1, a2
@@ -2547,8 +2557,6 @@
 ; RV32-NEXT: # %bb.3:
 ; RV32-NEXT: mv a4, a2
 ; RV32-NEXT: .LBB105_4:
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV32-NEXT: vsll.vi v16, v24, 3
 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu
 ; RV32-NEXT: vnsrl.wi v24, v16, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
@@ -2560,7 +2568,9 @@
 ; RV64-NEXT: vmv1r.v v12, v0
 ; RV64-NEXT: li a3, 0
 ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf4 v24, v8
 ; RV64-NEXT: vzext.vf4 v16, v10
+; RV64-NEXT: vsll.vi v16, v16, 3
 ; RV64-NEXT: csrr a2, vlenb
 ; RV64-NEXT: srli a5, a2, 3
 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu
@@ -2571,16 +2581,13 @@
 ; RV64-NEXT: mv a3, a4
 ; RV64-NEXT: .LBB105_2:
 ; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu
-; RV64-NEXT: vzext.vf4 v24, v8
-; RV64-NEXT: vsll.vi v16, v16, 3
+; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu
 ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT: bltu a1, a2, .LBB105_4
 ; RV64-NEXT: # %bb.3:
 ; RV64-NEXT: mv a1, a2
 ; RV64-NEXT: .LBB105_4:
-; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu
-; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT: vmv1r.v v0, v12
 ; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
Index: llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2146,16 +2146,16 @@
 ; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
 ; RV32: # %bb.0:
 ; RV32-NEXT: vl4re16.v v4, (a1)
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV32-NEXT: vsext.vf2 v24, v4
 ; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: vsll.vi v24, v24, 3
 ; RV32-NEXT: mv a3, a2
 ; RV32-NEXT: bltu a2, a1, .LBB96_2
 ; RV32-NEXT: # %bb.1:
 ; RV32-NEXT: mv a3, a1
 ; RV32-NEXT: .LBB96_2:
 ; RV32-NEXT: li a4, 0
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, mu
-; RV32-NEXT: vsext.vf2 v24, v4
-; RV32-NEXT: vsll.vi v24, v24, 3
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu
 ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT: srli a3, a1, 3
@@ -2172,19 +2172,28 @@
 ;
 ; RV64-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
 ; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: sub sp, sp, a3
 ; RV64-NEXT: vl4re16.v v4, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf4 v16, v4
+; RV64-NEXT: vsll.vi v16, v16, 3
 ; RV64-NEXT: csrr a1, vlenb
 ; RV64-NEXT: mv a3, a2
+; RV64-NEXT: vsext.vf4 v24, v6
 ; RV64-NEXT: bltu a2, a1, .LBB96_2
 ; RV64-NEXT: # %bb.1:
 ; RV64-NEXT: mv a3, a1
 ; RV64-NEXT: .LBB96_2:
 ; RV64-NEXT: li a4, 0
-; RV64-NEXT: vsetvli a5, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf4 v24, v4
 ; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu
-; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT: srli a3, a1, 3
 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV64-NEXT: sub a1, a2, a1
@@ -2193,11 +2202,14 @@
 ; RV64-NEXT: # %bb.3:
 ; RV64-NEXT: mv a4, a1
 ; RV64-NEXT: .LBB96_4:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf4 v8, v6
-; RV64-NEXT: vsll.vi v8, v8, 3
 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu
-; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
 ; RV64-NEXT: ret
 %ptrs = getelementptr inbounds double, double* %base, <vscale x 16 x i16> %idxs
 call void @llvm.vp.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double> %val, <vscale x 16 x double*> %ptrs, <vscale x 16 x i1> %m, i32 %evl)
@@ -2221,22 +2233,23 @@
 ; RV32-NEXT: addi a1, sp, 16
 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV32-NEXT: vsext.vf4 v8, v26
+; RV32-NEXT: vsext.vf4 v16, v24
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vsext.vf4 v8, v24
+; RV32-NEXT: vsll.vi v24, v16, 3
 ; RV32-NEXT: mv a3, a2
 ; RV32-NEXT: bltu a2, a1, .LBB97_2
 ; RV32-NEXT: # %bb.1:
 ; RV32-NEXT: mv a3, a1
 ; RV32-NEXT: .LBB97_2:
 ; RV32-NEXT: li a4, 0
-; RV32-NEXT: vsext.vf4 v16, v26
-; RV32-NEXT: vsll.vi v8, v8, 3
+; RV32-NEXT: vsll.vi v16, v8, 3
 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu
-; RV32-NEXT: vnsrl.wi v24, v8, 0
+; RV32-NEXT: vnsrl.wi v8, v24, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsoxei32.v v24, (a0), v8, v0.t
 ; RV32-NEXT: srli a3, a1, 3
 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV32-NEXT: sub a1, a2, a1
@@ -2245,17 +2258,15 @@
 ; RV32-NEXT: # %bb.3:
 ; RV32-NEXT: mv a4, a1
 ; RV32-NEXT: .LBB97_4:
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV32-NEXT: vsll.vi v8, v16, 3
 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu
-; RV32-NEXT: vnsrl.wi v16, v8, 0
+; RV32-NEXT: vnsrl.wi v8, v16, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT: csrr a1, vlenb
 ; RV32-NEXT: slli a1, a1, 3
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: slli a0, a0, 4
 ; RV32-NEXT: add sp, sp, a0
@@ -2267,31 +2278,25 @@
 ; RV64-NEXT: addi sp, sp, -16
 ; RV64-NEXT: .cfi_def_cfa_offset 16
 ; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: slli a3, a3, 3
 ; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: vl4re16.v v24, (a1)
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vl4re16.v v4, (a1)
 ; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf4 v16, v4
+; RV64-NEXT: vsll.vi v16, v16, 3
 ; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: vsext.vf4 v8, v24
 ; RV64-NEXT: mv a3, a2
+; RV64-NEXT: vsext.vf4 v24, v6
 ; RV64-NEXT: bltu a2, a1, .LBB97_2
 ; RV64-NEXT: # %bb.1:
 ; RV64-NEXT: mv a3, a1
 ; RV64-NEXT: .LBB97_2:
 ; RV64-NEXT: li a4, 0
-; RV64-NEXT: vsext.vf4 v16, v26
-; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu
-; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload
-; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT: srli a3, a1, 3
 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV64-NEXT: sub a1, a2, a1
@@ -2300,17 +2305,12 @@
 ; RV64-NEXT: # %bb.3:
 ; RV64-NEXT: mv a4, a1
 ; RV64-NEXT: .LBB97_4:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsll.vi v8, v16, 3
 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: slli a0, a0, 3
 ; RV64-NEXT: add sp, sp, a0
 ; RV64-NEXT: addi sp, sp, 16
 ; RV64-NEXT: ret
@@ -2337,22 +2337,23 @@
 ; RV32-NEXT: addi a1, sp, 16
 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf4 v8, v26
+; RV32-NEXT: vzext.vf4 v16, v24
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vzext.vf4 v8, v24
+; RV32-NEXT: vsll.vi v24, v16, 3
 ; RV32-NEXT: mv a3, a2
 ; RV32-NEXT: bltu a2, a1, .LBB98_2
 ; RV32-NEXT: # %bb.1:
 ; RV32-NEXT: mv a3, a1
 ; RV32-NEXT: .LBB98_2:
 ; RV32-NEXT: li a4, 0
-; RV32-NEXT: vzext.vf4 v16, v26
-; RV32-NEXT: vsll.vi v8, v8, 3
+; RV32-NEXT: vsll.vi v16, v8, 3
 ; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu
-; RV32-NEXT: vnsrl.wi v24, v8, 0
+; RV32-NEXT: vnsrl.wi v8, v24, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsoxei32.v v24, (a0), v8, v0.t
 ; RV32-NEXT: srli a3, a1, 3
 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV32-NEXT: sub a1, a2, a1
@@ -2361,17 +2362,15 @@
 ; RV32-NEXT: # %bb.3:
 ; RV32-NEXT: mv a4, a1
 ; RV32-NEXT: .LBB98_4:
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV32-NEXT: vsll.vi v8, v16, 3
 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu
-; RV32-NEXT: vnsrl.wi v16, v8, 0
+; RV32-NEXT: vnsrl.wi v8, v16, 0
 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT: csrr a1, vlenb
 ; RV32-NEXT: slli a1, a1, 3
 ; RV32-NEXT: add a1, sp, a1
 ; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: slli a0, a0, 4
 ; RV32-NEXT: add sp, sp, a0
@@ -2383,31 +2382,25 @@
 ; RV64-NEXT: addi sp, sp, -16
 ; RV64-NEXT: .cfi_def_cfa_offset 16
 ; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: slli a3, a3, 3
 ; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: vl4re16.v v24, (a1)
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vl4re16.v v4, (a1)
 ; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf4 v16, v4
+; RV64-NEXT: vsll.vi v16, v16, 3
 ; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: vzext.vf4 v8, v24
 ; RV64-NEXT: mv a3, a2
+; RV64-NEXT: vzext.vf4 v24, v6
 ; RV64-NEXT: bltu a2, a1, .LBB98_2
 ; RV64-NEXT: # %bb.1:
 ; RV64-NEXT: mv a3, a1
 ; RV64-NEXT: .LBB98_2:
 ; RV64-NEXT: li a4, 0
-; RV64-NEXT: vzext.vf4 v16, v26
-; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsll.vi v24, v24, 3
 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu
-; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload
-; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT: srli a3, a1, 3
 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu
 ; RV64-NEXT: sub a1, a2, a1
@@ -2416,17 +2409,12 @@
 ; RV64-NEXT: # %bb.3:
 ; RV64-NEXT: mv a4, a1
 ; RV64-NEXT: .LBB98_4:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsll.vi v8, v16, 3
 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: slli a0, a0, 3
 ; RV64-NEXT: add sp, sp, a0
 ; RV64-NEXT: addi sp, sp, 16
 ; RV64-NEXT: ret