Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10771,6 +10771,59 @@
   return SDValue();
 }
 
+// Attempt to lower a shuffle where one lane comes from V1 and the other
+// lane comes from V2, and both lanes perform the same permutation. We can
+// create a new V1 from the lower lane of V1 and the lower lane of V2, and
+// a new V2 from the upper lane of V1 and the upper lane of V2, and then do
+// a repeated lane shuffle of the new operands.
+static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL,
+                                              MVT VT, SDValue V1,
+                                              SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              SelectionDAG &DAG) {
+  int Size = Mask.size();
+  SmallVector<int, 32> RepeatMask(Size, -1);
+
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  for (int i = 0; i != Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Make sure the element comes from the same source as the half.
+    if ((M / Size) != (i / LaneSize))
+      return SDValue();
+
+    int LocalM = M % Size;
+    if (RepeatMask[i % LaneSize] < 0)
+      RepeatMask[i % LaneSize] = LocalM;
+    else if (RepeatMask[i % LaneSize] != LocalM)
+      return SDValue();
+  }
+
+  SmallVector<int, 32> PermuteMask(Size, -1);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
+  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
+  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+
+  for (int i = 0; i != Size; ++i) {
+    int M = RepeatMask[i % LaneSize];
+    PermuteMask[i] = M;
+    if (PermuteMask[i] < 0)
+      continue;
+
+    if (PermuteMask[i] >= LaneSize)
+      PermuteMask[i] += Size - LaneSize;
+
+    PermuteMask[i] += (i / LaneSize) * LaneSize;
+  }
+
+  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -13232,6 +13285,15 @@
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG);
 
+  // Attempt to lower a shuffle where one lane comes from V1 and the other
+  // lane comes from V2, and both lanes perform the same permutation. We can
+  // create a new V1 from the lower lane of V1 and the lower lane of V2, and
+  // a new V2 from the upper lane of V1 and the upper lane of V2, and then do
+  // a repeated lane shuffle of the new operands.
+  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v4f64, V1, V2,
+                                                 Mask, DAG))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
 }
@@ -13419,6 +13481,15 @@
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                       Mask, DAG);
 
+  // Attempt to lower a shuffle where one lane comes from V1 and the other
+  // lane comes from V2, and both lanes perform the same permutation. We can
+  // create a new V1 from the lower lane of V1 and the lower lane of V2, and
+  // a new V2 from the upper lane of V1 and the upper lane of V2, and then do
+  // a repeated lane shuffle of the new operands.
+  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2,
+                                                 Mask, DAG))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
 }
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1671,3 +1671,169 @@
   %1 = shufflevector <4 x i64> %a, <4 x i64> , <4 x i32> 
   ret <4 x i64> %1
 }
+
+define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: add_v4f64_0246_1357:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v4f64_0246_1357:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v4f64_0246_1357:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v4f64_0246_1357:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
+; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm2
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
+; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm3
+; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %add = fadd <4 x double> %shuffle, %shuffle1
+  ret <4 x double> %add
+}
+
+define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: add_v4f64_4602_5713:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v4f64_4602_5713:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v4f64_4602_5713:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v4f64_4602_5713:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
+; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
+; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm3
+; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
+  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %add = fadd <4 x double> %shuffle, %shuffle1
+  ret <4 x double> %add
+}
+
+define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: add_v4i64_0246_1357:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v4i64_0246_1357:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: add_v4i64_0246_1357:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
+; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
+; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
+; AVX512VL-NEXT: vpaddq %ymm3, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %add = add <4 x i64> %shuffle, %shuffle1
+  ret <4 x i64> %add
+}
+
+define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: add_v4i64_4602_5713:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v4i64_4602_5713:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: add_v4i64_4602_5713:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
+; AVX512VL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
+; AVX512VL-NEXT: vpermi2q %ymm0, %ymm1, %ymm3
+; AVX512VL-NEXT: vpaddq %ymm3, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
+  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %add = add <4 x i64> %shuffle, %shuffle1
+  ret <4 x i64> %add
+}
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -860,12 +860,9 @@
 define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
 ; AVX1-LABEL: PR21138:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: PR21138:
@@ -2419,3 +2416,183 @@
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> 
   ret <8 x i32> %shuffle
 }
+
+define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm2
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3
+; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = fadd <8 x float> %shuffle, %shuffle1
+  ret <8 x float> %add
+}
+
+define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: add_v8f32_8ACE0246_9BDF1357:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm3
+; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
+  %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
+  %add = fadd <8 x float> %shuffle, %shuffle1
+  ret <8 x float> %add
+}
+
+define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: add_v8i32_02468ACE_13579BDF:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v8i32_02468ACE_13579BDF:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
+; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add <8 x i32> %shuffle, %shuffle1
+  ret <8 x i32> %add
+}
+
+define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3
+; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
+entry:
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
+  %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
+  %add = add <8 x i32> %shuffle, %shuffle1
+  ret <8 x i32> %add
+}
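
For reference, a worked sketch of what lowerVectorShuffleSplitLowHigh does (illustrative only, not part of the patch). It traces the v4f64 <0,2,4,6>/<1,3,5,7> case from add_v4f64_0246_1357 above; the value names %newv1, %newv2, %even and %odd are invented for illustration. The lowering first builds one operand from the low 128-bit lanes of both inputs and one from the high lanes (the two PermuteMask passes), after which both original shuffles become the same in-lane pattern repeated in each lane (RepeatMask), matching the AVX1 checks (vperm2f128 + vinsertf128 + vunpcklpd/vunpckhpd):

  ; NewV1 = low lane of %a : low lane of %b   (first PermuteMask = <0,1,4,5>)
  %newv1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; NewV2 = high lane of %a : high lane of %b (second PermuteMask = <2,3,6,7>)
  %newv2 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ; RepeatMask is <0,2> for the even shuffle and <1,3> for the odd one, so the
  ; final shuffles repeat the same in-lane pattern in both lanes and lower to
  ; vunpcklpd / vunpckhpd with no cross-lane movement left:
  %even = shufflevector <4 x double> %newv1, <4 x double> %newv2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %odd = shufflevector <4 x double> %newv1, <4 x double> %newv2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>

Here %even yields elements <0,2,4,6> and %odd yields <1,3,5,7> of the original concatenated inputs, so the fadd operands are unchanged while the cross-lane work is shared by the two new operands.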