diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3731,6 +3731,20 @@
   MVT XLenVT = Subtarget.getXLenVT();
   MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
+  if (Index == 1 && NumSubElts + Index == (int)NumElts &&
+      isa<BuildVectorSDNode>(InPlace)) {
+    if (SDValue Splat = cast<BuildVectorSDNode>(InPlace)->getSplatValue()) {
+      auto OpCode =
+          VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL;
+      auto Vec = DAG.getNode(OpCode, DL, ContainerVT,
+                             DAG.getUNDEF(ContainerVT),
+                             convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
+                             Splat, TrueMask,
+                             DAG.getConstant(NumSubElts + Index, DL, XLenVT));
+      return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+    }
+  }
+
   // We slide up by the index that the subvector is being inserted at, and set
   // VL to the index + the number of elements being inserted.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC;
@@ -3967,6 +3981,10 @@
                                        Subtarget, DAG);
   }
 
+  if (SDValue V =
+          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // Detect an interleave shuffle and lower to
   // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
   int EvenSrc, OddSrc;
@@ -3989,10 +4007,6 @@
     return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
   }
 
-  if (SDValue V =
-          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
-    return V;
-
   // Detect shuffles which can be re-expressed as vector selects; these are
   // shuffles in which each element in the destination is taken from an element
   // at the corresponding index in either source vectors.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
@@ -171,11 +171,8 @@
 define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: trn1.v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %tmp0
@@ -256,11 +253,8 @@
 define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: trn1.v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
   ret <2 x float> %tmp0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
@@ -8,11 +8,7 @@
 ; CHECK-LABEL: vslide1up_2xi8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vwaddu.vv v9, v10, v8
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v9, a0, v8
+; CHECK-NEXT:    vslide1up.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <2 x i8> poison, i8 %b, i64 0
@@ -33,8 +29,7 @@
 ; RV64-LABEL: vslide1up_4xi8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a0
-; RV64-NEXT:    vslideup.vi v9, v8, 1
+; RV64-NEXT:    vslide1up.vx v9, v8, a0
 ; RV64-NEXT:    vmv1r.v v8, v9
 ; RV64-NEXT:    ret
   %vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -55,8 +50,7 @@
 ; RV64-LABEL: vslide1up_4xi8_swapped:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a0
-; RV64-NEXT:    vslideup.vi v9, v8, 1
+; RV64-NEXT:    vslide1up.vx v9, v8, a0
 ; RV64-NEXT:    vmv1r.v v8, v9
 ; RV64-NEXT:    ret
   %vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -68,22 +62,16 @@
 ; RV32-LABEL: vslide1up_2xi16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v10, a0
-; RV32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32-NEXT:    vwaddu.vv v9, v10, v8
-; RV32-NEXT:    li a0, -1
-; RV32-NEXT:    vwmaccu.vx v9, a0, v8
+; RV32-NEXT:    vmv.s.x v9, a0
+; RV32-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT:    vslideup.vi v9, v8, 1
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vslide1up_2xi16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vwaddu.vv v9, v10, v8
-; RV64-NEXT:    li a0, -1
-; RV64-NEXT:    vwmaccu.vx v9, a0, v8
+; RV64-NEXT:    vslide1up.vx v9, v8, a0
 ; RV64-NEXT:    vmv1r.v v8, v9
 ; RV64-NEXT:    ret
   %vb = insertelement <2 x i16> poison, i16 %b, i64 0
@@ -95,8 +83,7 @@
 ; RV32-LABEL: vslide1up_4xi16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vslideup.vi v9, v8, 1
+; RV32-NEXT:    vslide1up.vx v9, v8, a0
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    ret
 ;
@@ -117,22 +104,16 @@
 ; RV32-LABEL: vslide1up_2xi32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vwaddu.vv v9, v10, v8
-; RV32-NEXT:    li a0, -1
-; RV32-NEXT:    vwmaccu.vx v9, a0, v8
+; RV32-NEXT:    vslide1up.vx v9, v8, a0
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vslide1up_2xi32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64-NEXT:    vwaddu.vv v9, v10, v8
-; RV64-NEXT:    li a0, -1
-; RV64-NEXT:    vwmaccu.vx v9, a0, v8
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vslideup.vi v9, v8, 1
 ; RV64-NEXT:    vmv1r.v v8, v9
 ; RV64-NEXT:    ret
   %vb = insertelement <2 x i32> poison, i32 %b, i64 0
@@ -144,8 +125,7 @@
 ; CHECK-LABEL: vslide1up_4xi32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vslide1up.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <4 x i32> poison, i32 %b, i64 0
@@ -171,8 +151,7 @@
 ; RV64-LABEL: vslide1up_2xi64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a0
-; RV64-NEXT:    vslideup.vi v9, v8, 1
+; RV64-NEXT:    vslide1up.vx v9, v8, a0
 ; RV64-NEXT:    vmv.v.v v8, v9
 ; RV64-NEXT:    ret
   %vb = insertelement <2 x i64> poison, i64 %b, i64 0
@@ -198,8 +177,7 @@
 ; RV64-LABEL: vslide1up_4xi64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    vslideup.vi v10, v8, 1
+; RV64-NEXT:    vslide1up.vx v10, v8, a0
 ; RV64-NEXT:    vmv.v.v v8, v10
 ; RV64-NEXT:    ret
   %vb = insertelement <4 x i64> poison, i64 %b, i64 0
@@ -211,11 +189,7 @@
 ; CHECK-LABEL: vslide1up_2xf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v9, v10, v8
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v9, a0, v8
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <2 x half> poison, half %b, i64 0
@@ -227,8 +201,7 @@
 ; CHECK-LABEL: vslide1up_4xf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <4 x half> poison, half %b, i64 0
@@ -240,11 +213,7 @@
 ; CHECK-LABEL: vslide1up_2xf32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v9, v10, v8
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v9, a0, v8
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <2 x float> poison, float %b, i64 0
@@ -256,8 +225,7 @@
 ; CHECK-LABEL: vslide1up_4xf32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <4 x float> poison, float %b, i64 0
@@ -269,8 +237,7 @@
 ; CHECK-LABEL: vslide1up_2xf64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
   %vb = insertelement <2 x double> poison, double %b, i64 0
@@ -291,6 +258,24 @@
   ret <4 x double> %v1
 }
 
+define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) {
+; CHECK-LABEL: vslide1up_4xi8_with_splat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 14
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vadd.vi v10, v9, -1
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %vb = insertelement <4 x i8> poison, i8 %b, i64 0
+  %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  ret <4 x i8> %v2
+}
+
 define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) {
 ; CHECK-LABEL: vslide1up_v2f64_inverted:
 ; CHECK:       # %bb.0:
@@ -320,7 +305,8 @@
 }
 
-; The length of the shift is less than the suffix
+; The length of the shift is less than the suffix; since we'd have to
+; materialize the splat, using vslide1up doesn't help us.
 define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
 ; CHECK-LABEL: vslide1up_4xi32_neg1:
 ; CHECK:       # %bb.0:
@@ -335,3 +321,15 @@
   %v1 = shufflevector <4 x i32> %v, <4 x i32> %vb2, <4 x i32>
   ret <4 x i32> %v1
 }
+
+; We don't know the scalar to use for the vslide1up
+define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vslide1up_4xi32_neg2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+  ret <4 x i32> %res
+}