diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3089,6 +3089,36 @@
   return true;
 }
 
+/// Returns true if a shuffle can be represented as a
+/// (select (slide v1, v1offset), (slide v2, v2offset), selectmask)
+static bool isSelectShuffle(ArrayRef<int> Mask, int &V1Offset, int &V2Offset) {
+  unsigned NumElts = Mask.size();
+  std::optional<int> V1OffsetOpt, V2OffsetOpt;
+  for (auto [i, MaskIndex] : enumerate(Mask)) {
+    // TODO: We could handle undef mask indices
+    if (MaskIndex < 0)
+      return false;
+    if ((unsigned)MaskIndex < NumElts) {
+      // The element is from v1
+      if (!V1OffsetOpt)
+        V1OffsetOpt = i - MaskIndex;
+      else if ((unsigned)MaskIndex != i - *V1OffsetOpt)
+        return false;
+    } else {
+      // The element is from v2
+      if (!V2OffsetOpt)
+        V2OffsetOpt = (NumElts + i) - MaskIndex;
+      else if ((unsigned)MaskIndex != i + NumElts - *V2OffsetOpt)
+        return false;
+    }
+  }
+  // If we didn't encounter an element from v1 or v2 then we can just report
+  // the offset as 0
+  V1Offset = V1OffsetOpt.value_or(0);
+  V2Offset = V2OffsetOpt.value_or(0);
+  return true;
+}
+
 /// Match shuffles that concatenate two vectors, rotate the concatenation,
 /// and then extract the original number of elements from the rotated result.
 /// This is equivalent to vector.splice or X86's PALIGNR instruction. The
@@ -3553,11 +3583,10 @@
 
   // Detect shuffles which can be re-expressed as vector selects; these are
   // shuffles in which each element in the destination is taken from an element
-  // at the corresponding index in either source vectors.
-  bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
-    int MaskIndex = MaskIdx.value();
-    return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
-  });
+  // at the corresponding index in either source vector, where the source
+  // vectors can be slid up or down to make the indices match.
+  int V1SelectOffset, V2SelectOffset;
+  bool IsSelect = isSelectShuffle(Mask, V1SelectOffset, V2SelectOffset);
 
   assert(!V1.isUndef() && "Unexpected shuffle canonicalization");
 
@@ -3607,8 +3636,28 @@
   MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
 
-  if (IsSelect)
-    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
+  if (IsSelect) {
+    // vslideup/vslidedown either vector if it means we can then perform the
+    // shuffle with a vmerge
+    auto SlideIfNeeded = [&VT, &DL, &DAG, &NumElts](SDValue V,
+                                                    int Offset) {
+      if (Offset == 0)
+        return V;
+      // Create a mask like <1, 2, 3, -1> or <-1, 0, 1, 2>
+      SmallVector<int> SlideMask(NumElts, -1);
+      for (unsigned i = 0; i < NumElts; i++) {
+        int MaskIdx = (int)i + Offset;
+        if (MaskIdx < 0 || (unsigned)MaskIdx >= NumElts)
+          continue;
+        SlideMask[MaskIdx] = i;
+      }
+      return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), SlideMask);
+    };
+
+    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
+                       SlideIfNeeded(V1, V1SelectOffset),
+                       SlideIfNeeded(V2, V2SelectOffset));
+  }
 
   if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
     // On such a large vector we're unable to use i8 as the index type.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -648,14 +648,11 @@
 define <8 x i8> @merge_start_into_end(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 240
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    li a0, 15
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -4
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
   ret <8 x i8> %res
@@ -664,14 +661,11 @@
 define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end_non_contiguous:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 144
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    li a0, 111
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -4
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
   ret <8 x i8> %res
@@ -692,14 +686,11 @@
 define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_middle:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 30
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    li a0, 225
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -1
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslideup.vi v10, v9, 1
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
   ret <8 x i8> %res
@@ -720,14 +711,11 @@
 define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_slidedown:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vadd.vi v12, v11, 1
-; CHECK-NEXT:    li a0, 195
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    li a0, 60
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
   ret <8 x i8> %res
@@ -737,15 +725,12 @@
 define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vadd.vi v12, v11, 2
-; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 234
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    li a0, 21
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -1
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslideup.vi v10, v9, 1
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
   ret <8 x i8> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -150,13 +150,13 @@
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
+; ZVE32F-NEXT:    li a0, 1
 ; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vrgather.vi v10, v9, 0
+; ZVE32F-NEXT:    vmerge.vvm v8, v10, v8, v0
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i32>, ptr %in, align 4
@@ -181,12 +181,11 @@
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
-; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
+; ZVE32F-NEXT:    li a0, 1
 ; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 1
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 1, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i32>, ptr %in, align 4
@@ -209,13 +208,13 @@
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
+; ZVE32F-NEXT:    li a0, 1
 ; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vrgather.vi v10, v9, 0
+; ZVE32F-NEXT:    vmerge.vvm v8, v10, v8, v0
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x float>, ptr %in, align 4
@@ -240,12 +239,11 @@
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
-; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
+; ZVE32F-NEXT:    li a0, 1
 ; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 1
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 1, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x float>, ptr %in, align 4
@@ -259,13 +257,13 @@
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
+; V-NEXT:    li a0, 1
 ; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vrgather.vi v10, v9, 0
+; V-NEXT:    vmerge.vvm v8, v10, v8, v0
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i64:
@@ -288,12 +286,11 @@
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
+; V-NEXT:    li a0, 1
 ; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 1
-; V-NEXT:    vrgather.vi v10, v9, 1, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vslidedown.vi v9, v8, 2
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse64.v v9, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_64_i64:
@@ -315,13 +312,13 @@
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
+; V-NEXT:    li a0, 1
 ; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vrgather.vi v10, v9, 0
+; V-NEXT:    vmerge.vvm v8, v10, v8, v0
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_double:
@@ -344,12 +341,11 @@
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
+; V-NEXT:    li a0, 1
 ; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 1
-; V-NEXT:    vrgather.vi v10, v9, 1, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vslidedown.vi v9, v8, 2
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse64.v v9, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_64_double:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -291,16 +291,17 @@
 define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 2
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
-; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 0
+; CHECK-NEXT:    vmerge.vvm v9, v9, v8, v0
+; CHECK-NEXT:    vrgather.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vmv.v.v v9, v10
 ; CHECK-NEXT:    ret
   %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>} %retval
@@ -380,16 +381,17 @@
 define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v2f64_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 2
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
-; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 0
+; CHECK-NEXT:    vmerge.vvm v9, v9, v8, v0
+; CHECK-NEXT:    vrgather.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vmv.v.v v9, v10
 ; CHECK-NEXT:    ret
   %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>} %retval
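
To illustrate what the new isSelectShuffle helper computes, here is a minimal standalone sketch of the same offset detection (plain C++17, outside of LLVM; the driver, the printed summary, and all names other than isSelectShuffle are illustrative assumptions, not part of the patch). Run on the mask from the merge_start_into_end test above, it reports that v1 needs no slide and w must be slid up by 4, which corresponds to the vslideup.vi/vmerge.vvm pair in the updated CHECK lines.

// Standalone sketch, not LLVM code: mirrors the offset detection of the new
// isSelectShuffle helper on a plain std::vector<int> shuffle mask.
#include <cstdio>
#include <optional>
#include <vector>

// Returns true if every lane of Mask reads from v1 slid by V1Offset or from
// v2 slid by V2Offset. Positive offsets correspond to vslideup, negative ones
// to vslidedown. Undef (-1) lanes are rejected, mirroring the patch's TODO.
static bool isSelectShuffle(const std::vector<int> &Mask, int &V1Offset,
                            int &V2Offset) {
  int NumElts = static_cast<int>(Mask.size());
  std::optional<int> V1Off, V2Off;
  for (int i = 0; i < NumElts; i++) {
    int M = Mask[i];
    if (M < 0)
      return false;
    bool FromV1 = M < NumElts;
    // Offset that would place source element M into destination lane i.
    int ThisOff = FromV1 ? i - M : (NumElts + i) - M;
    std::optional<int> &Off = FromV1 ? V1Off : V2Off;
    if (!Off)
      Off = ThisOff; // First lane from this source fixes its offset.
    else if (*Off != ThisOff)
      return false; // Conflicting offsets: not expressible as slide + select.
  }
  V1Offset = V1Off.value_or(0);
  V2Offset = V2Off.value_or(0);
  return true;
}

int main() {
  // Mask of the merge_start_into_end test: <0, 1, 2, 3, 8, 9, 10, 11>.
  std::vector<int> Mask = {0, 1, 2, 3, 8, 9, 10, 11};
  int V1Off = 0, V2Off = 0;
  if (isSelectShuffle(Mask, V1Off, V2Off))
    // Prints "v1 offset 0, v2 offset 4": keep v1 in place, slide w up by 4,
    // then take the first four lanes from v1 (select mask 0b00001111 == 15).
    std::printf("v1 offset %d, v2 offset %d\n", V1Off, V2Off);
  return 0;
}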