diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3267,9 +3267,9 @@
 
 /// Is this shuffle interleaving contiguous elements from one vector into the
 /// even elements and contiguous elements from another vector into the odd
-/// elements. \p Src1 will contain the element that should be in the first even
-/// element. \p Src2 will contain the element that should be in the first odd
-/// element. These can be the first element in a source or the element half
+/// elements. \p EvenSrc will contain the element that should be in the first
+/// even element. \p OddSrc will contain the element that should be in the first
+/// odd element. These can be the first element in a source or the element half
 /// way through the source.
 static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, int &EvenSrc,
                                 int &OddSrc, const RISCVSubtarget &Subtarget) {
@@ -3278,7 +3278,8 @@
     return false;
 
   int Size = Mask.size();
-  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+  int NumElts = VT.getVectorNumElements();
+  assert(Size == (int)NumElts && "Unexpected mask size");
   SmallVector<unsigned, 2> StartIndexes;
   if (!ShuffleVectorInst::isInterleaveMask(Mask, 2, Size * 2, StartIndexes))
     return false;
@@ -3291,6 +3292,33 @@
   if (EvenSrc != 0 && OddSrc != 0)
     return false;
 
+  // We only interleave the lower halves of the input vectors.
+  int NumEltsPerOp = NumElts / 2;
+
+  int MaskRange;
+  if (EvenSrc < NumElts && OddSrc < NumElts)
+    // If EvenSrc and OddSrc are smaller than the size of the VT, then it's a
+    // unary interleave like:
+    // (vector_shuffle <0,2,1,3> x:v4i8, y:v4i8)
+    // i.e. the first operand is extracted into two subvectors which we then
+    // interleave.
+    // So the mask indices must all select from the first vector operand, e.g.
+    // 0...4
+    MaskRange = NumElts;
+  else
+    // If EvenSrc and OddSrc are selecting from both input operands, then it's a
+    // binary interleave like:
+    // (vector_shuffle <0,4,1,5> x:v4i8, y:v4i8)
+    // So the mask indices must select from somewhere in the concatenation of
+    // both vector operands, e.g. 0...8
+    MaskRange = NumElts * 2;
+
+  // Make sure that the indices lie within 0...MaskRange,
+  // e.g. we can't interleave (vector_shuffle <0,3,1,4> x:v2i8, y:v2i8)
+  if ((EvenSrc + NumEltsPerOp) > MaskRange ||
+      (OddSrc + NumEltsPerOp) > MaskRange)
+    return false;
+
   return true;
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -172,6 +172,58 @@
   ret <8 x i32> %a
 }
 
+; %y should be slid down by 2
+define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
+; V128-LABEL: interleave_v4i32_offset_2:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v9, 2
+; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: interleave_v4i32_offset_2:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v9, 2
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 6, i32 1, i32 7>
+  ret <4 x i32> %a
+}
+
+; %y should be slid down by 1
+define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
+; V128-LABEL: interleave_v4i32_offset_1:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v9, 1
+; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: interleave_v4i32_offset_1:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v9, 1
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
+  ret <4 x i32> %a
+}
+
 define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
 ; V128-LABEL: interleave_v8i8:
 ; V128:       # %bb.0:
@@ -362,8 +414,8 @@
 ; RV32-V128-NEXT:    slli a0, a0, 4
 ; RV32-V128-NEXT:    sub sp, sp, a0
 ; RV32-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-V128-NEXT:    lui a0, %hi(.LCPI15_0)
-; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI15_0)
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; RV32-V128-NEXT:    li a1, 32
 ; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-V128-NEXT:    vle32.v v0, (a0)
@@ -371,8 +423,8 @@
 ; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    addi a0, sp, 16
 ; RV32-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-V128-NEXT:    lui a0, %hi(.LCPI15_1)
-; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI15_1)
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI17_1)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
 ; RV32-V128-NEXT:    vle32.v v24, (a0)
 ; RV32-V128-NEXT:    csrr a0, vlenb
 ; RV32-V128-NEXT:    slli a0, a0, 3
@@ -413,8 +465,8 @@
 ; RV64-V128-NEXT:    slli a0, a0, 4
 ; RV64-V128-NEXT:    sub sp, sp, a0
 ; RV64-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-V128-NEXT:    lui a0, %hi(.LCPI15_0)
-; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI15_0)
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; RV64-V128-NEXT:    li a1, 32
 ; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-V128-NEXT:    vle32.v v0, (a0)
@@ -422,8 +474,8 @@
 ; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    addi a0, sp, 16
 ; RV64-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-V128-NEXT:    lui a0, %hi(.LCPI15_1)
-; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI15_1)
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI17_1)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
 ; RV64-V128-NEXT:    vle32.v v24, (a0)
 ; RV64-V128-NEXT:    csrr a0, vlenb
 ; RV64-V128-NEXT:    slli a0, a0, 3
@@ -494,6 +546,31 @@
   ret <4 x i8> %a
 }
 
+; This shouldn't be interleaved
+define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
+; V128-LABEL: unary_interleave_v4i8_invalid:
+; V128:       # %bb.0:
+; V128-NEXT:    lui a0, %hi(.LCPI19_0)
+; V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; V128-NEXT:    vle8.v v10, (a0)
+; V128-NEXT:    vrgather.vv v9, v8, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i8_invalid:
+; V512:       # %bb.0:
+; V512-NEXT:    lui a0, %hi(.LCPI19_0)
+; V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V512-NEXT:    vle8.v v10, (a0)
+; V512-NEXT:    vrgather.vv v9, v8, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
+  ret <4 x i8> %a
+}
+
 define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
 ; V128-LABEL: unary_interleave_v4i16:
 ; V128:       # %bb.0:
@@ -548,8 +625,8 @@
 define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
 ; RV32-V128-LABEL: unary_interleave_v4i64:
 ; RV32-V128:       # %bb.0:
-; RV32-V128-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI22_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI22_0)
 ; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-V128-NEXT:    vle16.v v12, (a0)
 ; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v12
@@ -558,8 +635,8 @@
 ;
 ; RV64-V128-LABEL: unary_interleave_v4i64:
 ; RV64-V128:       # %bb.0:
-; RV64-V128-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI22_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI22_0)
 ; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-V128-NEXT:    vle64.v v12, (a0)
 ; RV64-V128-NEXT:    vrgather.vv v10, v8, v12
@@ -568,8 +645,8 @@
 ;
 ; RV32-V512-LABEL: unary_interleave_v4i64:
 ; RV32-V512:       # %bb.0:
-; RV32-V512-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V512-NEXT:    lui a0, %hi(.LCPI22_0)
+; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI22_0)
 ; RV32-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; RV32-V512-NEXT:    vle16.v v10, (a0)
 ; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
@@ -578,8 +655,8 @@
 ;
 ; RV64-V512-LABEL: unary_interleave_v4i64:
 ; RV64-V512:       # %bb.0:
-; RV64-V512-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V512-NEXT:    lui a0, %hi(.LCPI22_0)
+; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI22_0)
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; RV64-V512-NEXT:    vle64.v v10, (a0)
 ; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
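
For reviewers, a minimal standalone sketch of the range check added above. It is not part of the patch: isValidInterleave is a hypothetical stand-in that takes the start indices produced by ShuffleVectorInst::isInterleaveMask and classifies the masks exercised by the new tests the same way the added code does.

// sketch.cpp: mirrors the MaskRange logic added to isInterleaveShuffle.
#include <cassert>

// EvenSrc/OddSrc are the start indices of the two contiguous runs feeding the
// even and odd result lanes; NumElts is the element count of one shuffle
// operand. Each run is NumElts / 2 elements long, so it must fit inside the
// first operand (unary case) or inside the two-operand concatenation (binary
// case).
static bool isValidInterleave(int EvenSrc, int OddSrc, int NumElts) {
  // Pre-existing check from the surrounding code: one run must start at the
  // low half of the first vector.
  if (EvenSrc != 0 && OddSrc != 0)
    return false;

  int NumEltsPerOp = NumElts / 2;
  int MaskRange = (EvenSrc < NumElts && OddSrc < NumElts) ? NumElts      // unary
                                                          : NumElts * 2; // binary
  return (EvenSrc + NumEltsPerOp) <= MaskRange &&
         (OddSrc + NumEltsPerOp) <= MaskRange;
}

int main() {
  assert(isValidInterleave(0, 2, 4));  // <0,2,1,3>: unary, accepted
  assert(isValidInterleave(0, 4, 4));  // <0,4,1,5>: binary, accepted
  assert(isValidInterleave(0, 6, 4));  // <0,6,1,7>: interleave_v4i32_offset_2, accepted
  assert(isValidInterleave(0, 5, 4));  // <0,5,1,6>: interleave_v4i32_offset_1, accepted
  assert(!isValidInterleave(0, 3, 4)); // <0,3,1,4>: unary_interleave_v4i8_invalid, rejected
  return 0;
}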