diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3014,7 +3014,10 @@
   MVT VT = Op.getSimpleValueType();
   assert(VT.isFixedLengthVector() && "Unexpected vector!");
 
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
   SDLoc DL(Op);
+  auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
 
   MVT XLenVT = Subtarget.getXLenVT();
   unsigned NumElts = Op.getNumOperands();
@@ -3065,6 +3068,24 @@
 
     SDValue Vec = DAG.getSplatBuildVector(VT, DL, DominantValue);
     DenseSet<SDValue> Processed{DominantValue};
+
+    // We can handle an insert into the last element (of a splat) via
+    // v(f)slide1down. This is slightly better than the vslideup insert
+    // lowering as it avoids the need for a vector group temporary. It
+    // is also better than using vmerge.vx as it avoids the need to
+    // materialize the mask in a vector register.
+    if (SDValue LastOp = Op->getOperand(Op->getNumOperands() - 1);
+        !LastOp.isUndef() && ValueCounts[LastOp] == 1 &&
+        LastOp != DominantValue) {
+      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      auto OpCode =
+          VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+      Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+                        LastOp, Mask, VL);
+      Vec = convertFromScalableVector(VT, Vec, DAG, Subtarget);
+      Processed.insert(LastOp);
+    }
+
     MVT SelMaskTy = VT.changeVectorElementType(MVT::i1);
     for (const auto &OpIdx : enumerate(Op->ops())) {
       const SDValue &V = OpIdx.value();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -693,18 +693,18 @@
 ; RV32NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV32NOM: # %bb.0:
 ; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT: vmv.v.i v9, -1
-; RV32NOM-NEXT: vmv.v.i v10, 0
-; RV32NOM-NEXT: vslideup.vi v10, v9, 3
+; RV32NOM-NEXT: vmv.v.i v9, 0
+; RV32NOM-NEXT: li a0, -1
+; RV32NOM-NEXT: vslide1down.vx v9, v9, a0
 ; RV32NOM-NEXT: lui a0, %hi(.LCPI38_0)
 ; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_0)
-; RV32NOM-NEXT: vle32.v v9, (a0)
+; RV32NOM-NEXT: vle32.v v10, (a0)
 ; RV32NOM-NEXT: lui a0, %hi(.LCPI38_1)
 ; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_1)
 ; RV32NOM-NEXT: vle32.v v11, (a0)
-; RV32NOM-NEXT: vand.vv v10, v8, v10
-; RV32NOM-NEXT: vmulh.vv v8, v8, v9
-; RV32NOM-NEXT: vadd.vv v8, v8, v10
+; RV32NOM-NEXT: vand.vv v9, v8, v9
+; RV32NOM-NEXT: vmulh.vv v8, v8, v10
+; RV32NOM-NEXT: vadd.vv v8, v8, v9
 ; RV32NOM-NEXT: vsra.vv v9, v8, v11
 ; RV32NOM-NEXT: vsrl.vi v8, v8, 31
 ; RV32NOM-NEXT: vadd.vv v8, v9, v8
@@ -728,18 +728,18 @@
 ; RV64NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV64NOM: # %bb.0:
 ; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT: vmv.v.i v9, -1
-; RV64NOM-NEXT: vmv.v.i v10, 0
-; RV64NOM-NEXT: vslideup.vi v10, v9, 3
+; RV64NOM-NEXT: vmv.v.i v9, 0
+; RV64NOM-NEXT: li a0, -1
+; RV64NOM-NEXT: vslide1down.vx v9, v9, a0
 ; RV64NOM-NEXT: lui a0, %hi(.LCPI38_0)
 ; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI38_0)
-; RV64NOM-NEXT: vle32.v v9, (a0)
+; RV64NOM-NEXT: vle32.v v10, (a0)
 ; RV64NOM-NEXT: lui a0, %hi(.LCPI38_1)
 ; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI38_1)
 ; RV64NOM-NEXT: vle32.v v11, (a0)
-; RV64NOM-NEXT: vand.vv v10, v8, v10
-; RV64NOM-NEXT: vmulh.vv v8, v8, v9
-; RV64NOM-NEXT: vadd.vv v8, v8, v10
+; RV64NOM-NEXT: vand.vv v9, v8, v9
+; RV64NOM-NEXT: vmulh.vv v8, v8, v10
+; RV64NOM-NEXT: vadd.vv v8, v8, v9
 ; RV64NOM-NEXT: vsra.vv v8, v8, v11
 ; RV64NOM-NEXT: vsrl.vi v9, v8, 31
 ; RV64NOM-NEXT: vadd.vv v8, v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -480,27 +480,49 @@
 }
 
 define void @buildvec_vid_step1o2_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ptr %z4, ptr %z5, ptr %z6) {
-; CHECK-LABEL: buildvec_vid_step1o2_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vsrl.vi v8, v8, 1
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: vse32.v v8, (a1)
-; CHECK-NEXT: vse32.v v8, (a2)
-; CHECK-NEXT: vse32.v v8, (a3)
-; CHECK-NEXT: vse32.v v8, (a4)
-; CHECK-NEXT: vmv.s.x v8, zero
-; CHECK-NEXT: vmv.v.i v9, 1
-; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT: vmv1r.v v10, v9
-; CHECK-NEXT: vslideup.vi v10, v8, 1
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v10, (a5)
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vslideup.vi v8, v9, 3
-; CHECK-NEXT: vse32.v v8, (a6)
-; CHECK-NEXT: ret
+; RV32-LABEL: buildvec_vid_step1o2_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vid.v v8
+; RV32-NEXT: vsrl.vi v8, v8, 1
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vmv.v.i v9, 1
+; RV32-NEXT: vse32.v v8, (a2)
+; RV32-NEXT: vse32.v v8, (a3)
+; RV32-NEXT: vse32.v v8, (a4)
+; RV32-NEXT: vmv.s.x v8, zero
+; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV32-NEXT: vslideup.vi v9, v8, 1
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vse32.v v9, (a5)
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vse32.v v8, (a6)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: buildvec_vid_step1o2_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vid.v v8
+; RV64-NEXT: vsrl.vi v8, v8, 1
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: vmv.v.i v9, 1
+; RV64-NEXT: vse32.v v8, (a1)
+; RV64-NEXT: vse32.v v8, (a2)
+; RV64-NEXT: vse32.v v8, (a3)
+; RV64-NEXT: vse32.v v8, (a4)
+; RV64-NEXT: vmv.s.x v8, zero
+; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64-NEXT: vslideup.vi v9, v8, 1
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vse32.v v9, (a5)
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vse32.v v8, (a6)
+; RV64-NEXT: ret
   store <4 x i32> , ptr %z0
   store <4 x i32> , ptr %z1
   store <4 x i32> , ptr %z2
@@ -528,11 +550,11 @@
 ; CHECK-NEXT: vmv.v.i v8, 3
 ; CHECK-NEXT: vmv.v.i v9, 4
 ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT: vmv1r.v v10, v9
-; CHECK-NEXT: vslideup.vi v10, v8, 1
+; CHECK-NEXT: vslideup.vi v9, v8, 1
 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vse16.v v10, (a5)
-; CHECK-NEXT: vslideup.vi v8, v9, 3
+; CHECK-NEXT: vse16.v v9, (a5)
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
 ; CHECK-NEXT: vse16.v v8, (a6)
 ; CHECK-NEXT: ret
   store <4 x i16> , ptr %z0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -177,39 +177,39 @@
 define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vmv.v.i v16, 2
 ; RV32-NEXT: lui a0, %hi(.LCPI11_0)
 ; RV32-NEXT: addi a0, a0, %lo(.LCPI11_0)
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT: vle16.v v20, (a0)
+; RV32-NEXT: li a0, 5
+; RV32-NEXT: vslide1down.vx v21, v16, a0
+; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma
 ; RV32-NEXT: vrgatherei16.vv v16, v8, v20
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv.v.i v8, 5
-; RV32-NEXT: vmv.v.i v9, 2
-; RV32-NEXT: vslideup.vi v9, v8, 7
 ; RV32-NEXT: li a0, 164
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
 ; RV32-NEXT: vmv.v.x v0, a0
-; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT: vrgatherei16.vv v16, v12, v9, v0.t
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT: vrgatherei16.vv v16, v12, v21, v0.t
 ; RV32-NEXT: vmv.v.v v8, v16
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT: lui a0, %hi(.LCPI11_0)
 ; RV64-NEXT: addi a0, a0, %lo(.LCPI11_0)
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT: vle64.v v20, (a0)
-; RV64-NEXT: vmv4r.v v16, v8
-; RV64-NEXT: vrgather.vv v8, v16, v20
-; RV64-NEXT: li a0, 5
-; RV64-NEXT: vmv.s.x v20, a0
 ; RV64-NEXT: vmv.v.i v16, 2
-; RV64-NEXT: vslideup.vi v16, v20, 7
+; RV64-NEXT: li a0, 5
+; RV64-NEXT: vslide1down.vx v24, v16, a0
+; RV64-NEXT: vrgather.vv v16, v8, v20
 ; RV64-NEXT: li a0, 164
 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
 ; RV64-NEXT: vmv.v.x v0, a0
 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vv v8, v12, v16, v0.t
+; RV64-NEXT: vrgather.vv v16, v12, v24, v0.t
+; RV64-NEXT: vmv.v.v v8, v16
 ; RV64-NEXT: ret
   %s = shufflevector <8 x i64> %x, <8 x i64> %y, <8 x i32> 
   ret <8 x i64> %s
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1285,10 +1285,10 @@
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT: vmulhu.vv v8, v8, v11
 ; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.i v9, 1
-; CHECK-NEXT: vmv.v.i v10, 2
-; CHECK-NEXT: vslideup.vi v10, v9, 3
-; CHECK-NEXT: vsrl.vv v8, v8, v10
+; CHECK-NEXT: vmv.v.i v9, 2
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: vslide1down.vx v9, v9, a1
+; CHECK-NEXT: vsrl.vv v8, v8, v9
 ; CHECK-NEXT: vse32.v v8, (a0)
 ; CHECK-NEXT: ret
   %a = load <4 x i32>, ptr %x
@@ -5216,15 +5216,15 @@
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; LMULMAX1-RV32-NEXT: vmulhu.vv v8, v8, v13
 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: vmv.v.i v11, 1
-; LMULMAX1-RV32-NEXT: vmv.v.i v12, 2
-; LMULMAX1-RV32-NEXT: vslideup.vi v12, v11, 3
-; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 2
+; LMULMAX1-RV32-NEXT: li a2, 1
+; LMULMAX1-RV32-NEXT: vslide1down.vx v11, v11, a2
+; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v11
 ; LMULMAX1-RV32-NEXT: vmulhu.vv v9, v10, v9
 ; LMULMAX1-RV32-NEXT: vsub.vv v10, v10, v9
 ; LMULMAX1-RV32-NEXT: vmulhu.vv v10, v10, v13
 ; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT: vsrl.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT: vsrl.vv v9, v9, v11
 ; LMULMAX1-RV32-NEXT: vse32.v v9, (a0)
 ; LMULMAX1-RV32-NEXT: vse32.v v8, (a1)
 ; LMULMAX1-RV32-NEXT: ret
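
Not part of the patch itself: below is a minimal LLVM IR sketch (function name and values are hypothetical) of the build_vector shape the new code path is aimed at. Every lane holds the dominant value %a except the last lane, which holds a one-off %b, so the lowering can splat %a and then insert %b with a single vslide1down.vx rather than a scalar move plus a vslideup.vi through a temporary register group.

define <4 x i32> @splat_with_distinct_last_elt(i32 %a, i32 %b) {
  ; Hypothetical example: lanes 0-2 are the dominant value %a, lane 3 is the
  ; single odd value %b, i.e. a splat with one insert into the last element.
  %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %a, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %a, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %b, i32 3
  ret <4 x i32> %v3
}

For floating-point element types the same path emits vfslide1down, and the constant case shows up in the test updates above, e.g. the <2, 2, 2, 1> shift-amount vector now built with vmv.v.i plus vslide1down.vx.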