diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1688,6 +1688,21 @@
   }
 }
 
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+template <typename T>
+void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo) {
+  assert(Mask.empty() && "Expected an empty shuffle mask vector");
+  int NumElts = VT.getVectorNumElements();
+  for (int i = 0; i < NumElts; ++i) {
+    int Pos = i / 2;
+    Pos += (Lo ? 0 : NumElts / 2);
+    Mask.push_back(Pos);
+  }
+}
+
 /// Helper function to scale a shuffle or target shuffle mask, replacing each
 /// mask index with the scaled sequential indices for an equivalent narrowed
 /// mask. This is the reverse process to canWidenShuffleElements, but can
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10929,6 +10929,32 @@
   return SDValue();
 }
 
+/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
+/// followed by unpack 256-bit.
+static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
+                                        ArrayRef<int> Mask, SDValue V1,
+                                        SDValue V2, SelectionDAG &DAG) {
+  SmallVector<int, 32> Unpckl, Unpckh;
+  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
+  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
+
+  unsigned UnpackOpcode;
+  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+    UnpackOpcode = X86ISD::UNPCKL;
+  else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+    UnpackOpcode = X86ISD::UNPCKH;
+  else
+    return SDValue();
+
+  // This is a "natural" unpack operation (rather than the 128-bit sectored
+  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
+  // input in order to use the x86 instruction.
+  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
+                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
+  V1 = DAG.getBitcast(VT, V1);
+  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
+}
+
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                 int Delta) {
   int Size = (int)Mask.size();
@@ -16210,9 +16236,14 @@
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return V;
 
-  // If the shuffle patterns aren't repeated but it is a single input, directly
-  // generate a cross-lane VPERMD instruction.
   if (V2.isUndef()) {
+    // Try to produce a fixed cross-128-bit lane permute followed by unpack
+    // because that should be faster than the variable permute alternatives.
+    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
+      return V;
+
+    // If the shuffle patterns aren't repeated but it's a single input, directly
+    // generate a cross-lane VPERMD instruction.
     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
   }
@@ -16294,6 +16325,11 @@
     return V;
 
   if (V2.isUndef()) {
+    // Try to produce a fixed cross-128-bit lane permute followed by unpack
+    // because that should be faster than the variable permute alternatives.
+    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
+      return V;
+
     // There are no generalized cross-lane shuffle operations available on i16
     // element types.
     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16396,6 +16432,11 @@
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+    // Try to produce a fixed cross-128-bit lane permute followed by unpack
+    // because that should be faster than the variable permute alternatives.
+    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
+      return V;
+
     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
       return V;
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -171,12 +171,9 @@
 ;
 ; AVX2-LABEL: splat2_i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
 ; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
 ; AVX2-NEXT:    vzeroupper
@@ -221,12 +218,9 @@
 ;
 ; AVX2-LABEL: splat2_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
 ; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
 ; AVX2-NEXT:    vzeroupper
@@ -269,11 +263,9 @@
 ;
 ; AVX2-LABEL: splat2_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,2,1,3]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
 ; AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
 ; AVX2-NEXT:    vmovups %ymm1, (%rsi)
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1517,11 +1517,29 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_00112233:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_00112233:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
   ret <8 x i32> %shuffle
 }
@@ -1689,10 +1707,10 @@
 ;
 ; AVX2-SLOW-LABEL: shuffle_v8i32_08991abb:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
-; AVX2-SLOW-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1700,7 +1718,7 @@
 ; AVX2-FAST:       # %bb.0:
 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u>
 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3]
 ; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX2-FAST-NEXT:    retq
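
Note: the following standalone C++ sketch is not part of the patch. It emulates the mask math of createSplat2ShuffleMask and the <0,2,1,3> 64-bit-chunk pre-permute from lowerShuffleWithUNPCK256 for a v8i32-like vector, using plain STL containers as stand-ins for the LLVM MVT/SmallVectorImpl APIs, to show why the fixed permute followed by an in-lane 256-bit unpcklo/unpckhi reproduces the unary splat2 masks.

// Standalone sketch (not part of the patch). Assumes a v8i32-style layout:
// 8 elements, 2 per 64-bit chunk, 4 per 128-bit lane.
#include <cassert>
#include <cstdio>
#include <vector>

// Mirror of createSplat2ShuffleMask: Lo -> 0,0,1,1,...  Hi -> N/2,N/2,...
static std::vector<int> splat2Mask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  for (int i = 0; i < NumElts; ++i)
    Mask.push_back(i / 2 + (Lo ? 0 : NumElts / 2));
  return Mask;
}

int main() {
  const int NumElts = 8, EltsPerChunk = 2, EltsPerLane = 4;

  // The source holds its own indices, so a shuffled result *is* its mask.
  std::vector<int> Src(NumElts);
  for (int i = 0; i < NumElts; ++i)
    Src[i] = i;

  // Step 1: vpermq/vpermpd-style rearrangement of 64-bit chunks: <0,2,1,3>.
  const int ChunkPerm[4] = {0, 2, 1, 3};
  std::vector<int> Permuted(NumElts);
  for (int c = 0; c < 4; ++c)
    for (int e = 0; e < EltsPerChunk; ++e)
      Permuted[c * EltsPerChunk + e] = Src[ChunkPerm[c] * EltsPerChunk + e];

  // Step 2: AVX2 unpcklo/unpckhi interleave their two sources within each
  // 128-bit lane; here both sources are the permuted vector (unary pattern).
  auto unpack = [&](bool Lo) {
    std::vector<int> Res(NumElts);
    for (int Lane = 0; Lane < NumElts; Lane += EltsPerLane)
      for (int e = 0; e < EltsPerLane / 2; ++e) {
        int From = Lane + (Lo ? 0 : EltsPerLane / 2) + e;
        Res[Lane + 2 * e + 0] = Permuted[From]; // element taken from "V1"
        Res[Lane + 2 * e + 1] = Permuted[From]; // element taken from "V2" == V1
      }
    return Res;
  };

  assert(unpack(/*Lo=*/true) == splat2Mask(NumElts, /*Lo=*/true));   // 0,0,1,1,2,2,3,3
  assert(unpack(/*Lo=*/false) == splat2Mask(NumElts, /*Lo=*/false)); // 4,4,5,5,6,6,7,7
  std::puts("permute<0,2,1,3> + unpck matches the splat2 masks");
  return 0;
}

As the added comments in the lowering code state, the point of this pattern is that a fixed immediate permute plus an unpack is expected to be cheaper than the variable-index vpermd/vpermps/vpshufb sequences it replaces, which also need a constant-pool load for the index vector.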