diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3315,6 +3315,48 @@
                      DAG.getConstant(0, DL, XLenVT));
 }
 
+// Because vslideup leaves the destination elements at the start intact, we can
+// use it to perform shuffles that insert subvectors:
+//
+// vector_shuffle v8:v8i8, v9:v8i8, <0, 1, 2, 3, 8, 9, 10, 11>
+// ->
+// vsetvli zero, 8, e8, mf2, tu, ma
+// vslideup.vi v8, v9, 4
+//
+// vector_shuffle v8:v8i8, v9:v8i8, <0, 1, 8, 9, 10, 5, 6, 7>
+// ->
+// vsetvli zero, 5, e8, mf2, tu, ma
+// vslideup.vi v8, v9, 2
+static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             ArrayRef<int> Mask,
+                                             const RISCVSubtarget &Subtarget,
+                                             SelectionDAG &DAG) {
+  unsigned NumElts = VT.getVectorNumElements();
+  int NumSubElts, Index;
+  if (!ShuffleVectorInst::isInsertSubvectorMask(Mask, NumElts, NumSubElts,
+                                                Index))
+    return SDValue();
+
+  bool OpsSwapped = Mask[Index] < (int)NumElts;
+  SDValue InPlace = OpsSwapped ? V2 : V1;
+  SDValue ToInsert = OpsSwapped ? V1 : V2;
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
+  // We slide up by the index that the subvector is being inserted at, and set
+  // VL to the index + the number of elements being inserted.
+  SDValue Slideup = getVSlideup(
+      DAG, Subtarget, DL, ContainerVT,
+      convertToScalableVector(ContainerVT, InPlace, DAG, Subtarget),
+      convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
+      DAG.getConstant(Index, DL, XLenVT), TrueMask,
+      DAG.getConstant(NumSubElts + Index, DL, XLenVT),
+      RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC);
+  return convertFromScalableVector(VT, Slideup, DAG, Subtarget);
+}
+
 // Given two input vectors of <[vscale x ]n x ty>, use vwaddu.vv and vwmaccu.vx
 // to create an interleaved vector of <[vscale x] n*2 x ty>.
 // This requires that the size of ty is less than the subtarget's maximum ELEN.
@@ -3551,6 +3593,10 @@
     return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
   }
 
+  if (SDValue V =
+          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // Detect shuffles which can be re-expressed as vector selects; these are
   // shuffles in which each element in the destination is taken from an element
   // at the corresponding index in either source vectors.
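Aside on the mask check (editorial illustration, not part of the patch): the lowering above relies on ShuffleVectorInst::isInsertSubvectorMask to recognise a shuffle mask as "take the low NumSubElts elements of one source and insert them into the other source at a contiguous Index, with every remaining lane an identity pick or undef". The sketch below is a minimal, self-contained C++ approximation of that check for the non-swapped direction only; it compiles without LLVM, and the name isInsertSubvectorMaskSketch is invented here rather than an LLVM API.

// A simplified sketch (not LLVM's implementation) of the insert-subvector
// mask pattern: the low run of elements taken from the second source is
// inserted at a contiguous Index into an otherwise identity pick of the
// first source. Undef lanes are encoded as -1.
#include <cstdio>
#include <vector>

static bool isInsertSubvectorMaskSketch(const std::vector<int> &Mask,
                                        int NumSrcElts, int &NumSubElts,
                                        int &Index) {
  int NumElts = static_cast<int>(Mask.size());
  int Begin = -1, End = -1;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef lane: compatible with either source
    if (M >= NumSrcElts) {
      if (Begin == -1)
        Begin = i;              // first lane taken from the second source
      End = i;                  // last lane taken from the second source
    } else if (M != i) {
      return false;             // not an identity pick from the first source
    }
  }
  if (Begin == -1)
    return false;               // nothing is taken from the second source
  for (int i = Begin; i <= End; ++i) {
    int M = Mask[i];
    // Inside the run, defined lanes must read the low elements of the second
    // source in order: lane i must read source-2 lane (i - Begin).
    if (M >= 0 && M != NumSrcElts + (i - Begin))
      return false;
  }
  NumSubElts = End - Begin + 1;
  Index = Begin;
  return true;
}

int main() {
  // First example from the patch comment: <0, 1, 2, 3, 8, 9, 10, 11>.
  std::vector<int> Mask = {0, 1, 2, 3, 8, 9, 10, 11};
  int NumSubElts, Index;
  if (isInsertSubvectorMaskSketch(Mask, 8, NumSubElts, Index))
    std::printf("insert %d elements at index %d\n", NumSubElts, Index);
}

Run on that example over two 8-element sources, the sketch reports NumSubElts = 4 at Index = 4, which corresponds to the vsetvli VL=8 / vslideup.vi-by-4 sequence the patch emits. The real helper also detects the mirrored case (inserting the first operand into the second), which the patch handles by swapping the operands via OpsSwapped.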
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -41,14 +41,11 @@
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-NEXT:    vrgather.vi v12, v8, 0
 ; LMULMAX1-NEXT:    vrgather.vi v12, v9, 3, v0.t
-; LMULMAX1-NEXT:    li a0, 8
-; LMULMAX1-NEXT:    vmv.s.x v0, a0
-; LMULMAX1-NEXT:    vrgather.vi v9, v10, 0
-; LMULMAX1-NEXT:    li a0, 3
-; LMULMAX1-NEXT:    vmv.s.x v8, a0
-; LMULMAX1-NEXT:    vrgather.vi v9, v11, 3, v0.t
-; LMULMAX1-NEXT:    vmv.v.v v0, v8
-; LMULMAX1-NEXT:    vmerge.vvm v8, v9, v12, v0
+; LMULMAX1-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v11, v10, 2
+; LMULMAX1-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v11, v12, 0
+; LMULMAX1-NEXT:    vmv1r.v v8, v11
 ; LMULMAX1-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -645,19 +645,39 @@
   ret <4 x i16> %5
 }
 
-define <8 x i8> @merge_start_into_end(<8 x i8> %v, <8 x i8> %w) {
-; CHECK-LABEL: merge_start_into_end:
+define <8 x i8> @slideup_into_end(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_into_end:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @slideup_into_end_undef(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_into_end_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 10, i32 11>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @slideup_into_end_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_into_end_undef_at_start:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 240
+; CHECK-NEXT:    li a0, 224
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -4
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
   ret <8 x i8> %res
 }
 
@@ -677,41 +697,32 @@
   ret <8 x i8> %res
 }
 
-define <8 x i8> @merge_end_into_end(<8 x i8> %v, <8 x i8> %w) {
-; CHECK-LABEL: merge_end_into_end:
+define <8 x i8> @slideup_end_into_end(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_end_into_end:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 0
+; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %res
 }
 
-define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
-; CHECK-LABEL: merge_start_into_middle:
+define <8 x i8> @slideup_into_middle(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_into_middle:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 30
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -1
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
 }
 
-define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
-; CHECK-LABEL: merge_start_into_start:
+define <8 x i8> @slideup_start_into_start(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: slideup_start_into_start:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 240
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
@@ -758,8 +769,8 @@
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vadd.vi v11, v10, 2
-; CHECK-NEXT:    lui a0, %hi(.LCPI44_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI44_0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -150,13 +148,11 @@
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i32>, ptr %in, align 4
@@ -209,13 +207,11 @@
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x float>, ptr %in, align 4
@@ -259,13 +255,11 @@
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i64:
@@ -315,13 +309,11 @@
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_double:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -292,15 +292,14 @@
 ; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
   %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>} %retval
@@ -381,15 +380,14 @@
 ; CHECK-LABEL: vector_deinterleave_v2f64_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
   %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>} %retval