Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10771,6 +10771,59 @@
   return SDValue();
 }
 
+// Attempt to lower a shuffle where one lane comes from V1 and the other
+// lane comes from V2, with both lanes applying the same permutation. We
+// can create a new V1 from the lower lane of V1 and the lower lane of
+// V2, and a new V2 from the upper lanes of V1 and V2, and then do a
+// repeated lane shuffle of those two inputs.
+static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL,
+                                              MVT VT, SDValue V1,
+                                              SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              SelectionDAG &DAG) {
+  int Size = Mask.size();
+  SmallVector<int, 8> RepeatMask(Size, -1);
+
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  for (int i = 0; i != Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Make sure the element comes from the source its half of the result uses.
+    if ((M / Size) != (i / LaneSize))
+      return SDValue();
+
+    int LocalM = M % Size;
+    if (RepeatMask[i % LaneSize] < 0)
+      RepeatMask[i % LaneSize] = LocalM;
+    else if (RepeatMask[i % LaneSize] != LocalM)
+      return SDValue();
+  }
+
+  SmallVector<int, 8> PermuteMask(Size, -1);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
+  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
+  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+
+  for (int i = 0; i != Size; ++i) {
+    int M = RepeatMask[i % LaneSize];
+    PermuteMask[i] = M;
+    if (PermuteMask[i] < 0)
+      continue;
+
+    if (PermuteMask[i] >= LaneSize)
+      PermuteMask[i] += Size - LaneSize;
+
+    PermuteMask[i] += (i / LaneSize) * LaneSize;
+  }
+
+  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -13419,6 +13472,15 @@
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                       Mask, DAG);
 
+  // Attempt to lower a shuffle where one lane comes from V1 and the other
+  // lane comes from V2, with both lanes applying the same permutation. We
+  // can create a new V1 from the lower lane of V1 and the lower lane of
+  // V2, and a new V2 from the upper lanes of V1 and V2, and then do a
+  // repeated lane shuffle of those two inputs.
+  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2,
+                                                 Mask, DAG))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
 }
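For reference, the mask arithmetic above can be traced with a standalone C++ sketch. This is an illustration only, not LLVM code: Size = 8 and LaneSize = 4 are hard-coded for v8f32, and the input is the PR21138 mask <1,3,5,7,9,11,13,15>. It reproduces the three masks the function builds: NewV1 takes the low lanes of both sources ({0,1,2,3,8,9,10,11}), NewV2 the high lanes ({4,5,6,7,12,13,14,15}), and the final repeated-lane mask comes out as {1,3,9,11,5,7,13,15}, i.e. the single in-lane vshufps in the new PR21138 AVX1 output.

#include <cstdio>
#include <vector>

// Standalone model of lowerVectorShuffleSplitLowHigh's mask arithmetic
// for v8f32: Size = 8 elements, LaneSize = 4 elements per 128-bit lane.
int main() {
  const int Size = 8, LaneSize = 4;
  std::vector<int> Mask = {1, 3, 5, 7, 9, 11, 13, 15}; // PR21138

  // Collect the per-lane repeated pattern; bail out (the analogue of
  // returning an empty SDValue) if a lane mixes sources or the two
  // lanes disagree on the pattern.
  std::vector<int> RepeatMask(Size, -1);
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue; // undef element
    if (Mask[i] / Size != i / LaneSize)
      return 1; // element's source (V1/V2) doesn't match its half
    int LocalM = Mask[i] % Size;
    if (RepeatMask[i % LaneSize] < 0)
      RepeatMask[i % LaneSize] = LocalM;
    else if (RepeatMask[i % LaneSize] != LocalM)
      return 1; // the two lanes don't repeat the same pattern
  }

  // NewV1 = {0,1,2,3,8,9,10,11}   (low lane of V1 ++ low lane of V2)
  // NewV2 = {4,5,6,7,12,13,14,15} (high lane of V1 ++ high lane of V2)
  // Build the final repeated-lane mask over (NewV1, NewV2).
  for (int i = 0; i != Size; ++i) {
    int M = RepeatMask[i % LaneSize];
    if (M >= LaneSize)
      M += Size - LaneSize; // high-lane elements now live in NewV2
    if (M >= 0)
      M += (i / LaneSize) * LaneSize;
    printf("%d ", M); // prints: 1 3 9 11 5 7 13 15
  }
  printf("\n");
  return 0;
}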
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -860,12 +860,9 @@
 define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
 ; AVX1-LABEL: PR21138:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR21138:
@@ -2419,3 +2416,46 @@
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
   ret <8 x i32> %shuffle
 }
+
+define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX512VL-SLOW:       # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX512VL-FAST:       # %bb.0: # %entry
+; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm2
+; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm3
+; AVX512VL-FAST-NEXT:    vaddps %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = fadd <8 x float> %shuffle, %shuffle1
+  ret <8 x float> %add
+}
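For a quick end-to-end check of the decomposition that add_v8f32_02468ACE_13579BDF exercises, the same rewrite can be simulated on scalar arrays: build NewV1/NewV2 from the low and high halves of the two operands, apply the repeated-lane masks derived above, and compare against shuffling the original operands directly. A minimal sketch, using plain std::array values in place of vector registers:

#include <array>
#include <cassert>

using V8 = std::array<float, 8>;

// Apply a shuffle mask over the 16-element concatenation of A and B.
static V8 shuffle(const V8 &A, const V8 &B, const std::array<int, 8> &M) {
  V8 R{};
  for (int i = 0; i != 8; ++i)
    R[i] = M[i] < 8 ? A[M[i]] : B[M[i] - 8];
  return R;
}

int main() {
  V8 A = {0, 1, 2, 3, 4, 5, 6, 7};
  V8 B = {10, 11, 12, 13, 14, 15, 16, 17};

  // The two shuffles in add_v8f32_02468ACE_13579BDF, lowered directly.
  V8 Even = shuffle(A, B, {0, 2, 4, 6, 8, 10, 12, 14});
  V8 Odd  = shuffle(A, B, {1, 3, 5, 7, 9, 11, 13, 15});

  // Decomposed form: one low-halves vector, one high-halves vector,
  // then a single repeated-lane shuffle per result.
  V8 NewV1 = shuffle(A, B, {0, 1, 2, 3, 8, 9, 10, 11});
  V8 NewV2 = shuffle(A, B, {4, 5, 6, 7, 12, 13, 14, 15});
  assert(shuffle(NewV1, NewV2, {0, 2, 8, 10, 4, 6, 12, 14}) == Even);
  assert(shuffle(NewV1, NewV2, {1, 3, 9, 11, 5, 7, 13, 15}) == Odd);
  return 0;
}

Both asserts hold, mirroring at the value level what the FileCheck lines above pin down at the assembly level.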