Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10747,6 +10747,59 @@
   return SDValue();
 }
 
+// Attempt to lower a shuffle where one lane comes from V1 and the other
+// lane comes from V2 and the lanes do the same operation. We can create
+// a new V1 with the lower lane of V1 and the lower lane of V2. And a new
+// V2 with the upper lane of V1 and the upper lane of V2 and then do a
+// repeated lane shuffle.
+static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL,
+                                              MVT VT, SDValue V1,
+                                              SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              SelectionDAG &DAG) {
+  int Size = Mask.size();
+  SmallVector<int, 8> RepeatMask(Size, -1);
+
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  for (int i = 0; i != Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Make sure the element comes from the same source as the half.
+    if ((M / Size) != (i / LaneSize))
+      return SDValue();
+
+    int LocalM = M % Size;
+    if (RepeatMask[i % LaneSize] < 0)
+      RepeatMask[i % LaneSize] = LocalM;
+    else if (RepeatMask[i % LaneSize] != LocalM)
+      return SDValue();
+  }
+
+  SmallVector<int, 8> PermuteMask(Size, -1);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
+  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+  for (int i = 0; i != Size; ++i)
+    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
+  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);
+
+  for (int i = 0; i != Size; ++i) {
+    int M = RepeatMask[i % LaneSize];
+    PermuteMask[i] = M;
+    if (PermuteMask[i] < 0)
+      continue;
+
+    if (PermuteMask[i] >= LaneSize)
+      PermuteMask[i] += Size - LaneSize;
+
+    PermuteMask[i] += (i / LaneSize) * LaneSize;
+  }
+
+  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -13208,6 +13261,15 @@
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG);
 
+  // Attempt to lower a shuffle where one lane comes from V1 and the other
+  // lane comes from V2 and the lanes do the same operation. We can create
+  // a new V1 with the lower lane of V1 and the lower lane of V2. And a new
+  // V2 with the upper lane of V1 and the upper lane of V2 and then do a
+  // repeated lane shuffle.
+  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v4f64, V1, V2,
+                                                 Mask, DAG))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
 }
@@ -13401,6 +13463,15 @@
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                       Mask, DAG);
 
+  // Attempt to lower a shuffle where one lane comes from V1 and the other
+  // lane comes from V2 and the lanes do the same operation. We can create
+  // a new V1 with the lower lane of V1 and the lower lane of V2. And a new
+  // V2 with the upper lane of V1 and the upper lane of V2 and then do a
+  // repeated lane shuffle.
+  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2,
+                                                 Mask, DAG))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
 }
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1675,17 +1675,11 @@
 define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: add_v4f64_0246_1357:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_0246_1357:
@@ -1724,17 +1718,11 @@
 define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: add_v4f64_4602_5713:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX1-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_4602_5713:
@@ -1773,21 +1761,15 @@
 define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: add_v4i64_0246_1357:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4i64_0246_1357:
@@ -1826,21 +1808,15 @@
 define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: add_v4i64_4602_5713:
 ; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4i64_4602_5713:
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -860,12 +860,9 @@
 define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
 ; AVX1-LABEL: PR21138:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR21138:
@@ -2423,17 +2420,11 @@
 define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
@@ -2472,17 +2463,11 @@
 define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: add_v8f32_8ACE0246_9BDF1357:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX1-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357:
@@ -2521,21 +2506,15 @@
 define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: add_v8i32_02468ACE_13579BDF:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8i32_02468ACE_13579BDF:
@@ -2574,21 +2553,15 @@
 define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357:
 ; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357:
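Reviewer note (not part of the patch): the sketch below traces the mask computation of lowerVectorShuffleSplitLowHigh for the v4f64 <0,2,4,6> case exercised by add_v4f64_0246_1357, with Size = 4 and LaneSize = 2. The std::vector types, the main() driver and the printed values are illustrative substitutes for the SDAG plumbing; the loop bodies mirror the ones added above.

// Illustrative only: trace of the mask computation in
// lowerVectorShuffleSplitLowHigh for a v4f64 shuffle with mask <0,2,4,6>.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;     // elements per 256-bit vector (v4f64)
  const int LaneSize = 2; // elements per 128-bit lane (128 / 64)
  std::vector<int> Mask = {0, 2, 4, 6}; // as in add_v4f64_0246_1357

  // Lane N of the result may only read from source N, and both lanes must
  // request the same lane-local elements.
  std::vector<int> RepeatMask(Size, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if ((M / Size) != (i / LaneSize))
      return 1; // element and lane disagree on the source: give up
    int LocalM = M % Size;
    if (RepeatMask[i % LaneSize] < 0)
      RepeatMask[i % LaneSize] = LocalM;
    else if (RepeatMask[i % LaneSize] != LocalM)
      return 1; // the two lanes do different things: give up
  }

  // NewV1 takes the low lanes of V1 and V2, NewV2 takes the high lanes.
  std::vector<int> PermuteMask(Size, -1);
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
  std::printf("NewV1 mask: %d %d %d %d\n", PermuteMask[0], PermuteMask[1],
              PermuteMask[2], PermuteMask[3]); // 0 1 4 5
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
  std::printf("NewV2 mask: %d %d %d %d\n", PermuteMask[0], PermuteMask[1],
              PermuteMask[2], PermuteMask[3]); // 2 3 6 7

  // Repeated-lane shuffle of NewV1/NewV2: lane-local indexes >= LaneSize
  // referred to a source's upper lane, which now lives in NewV2, so shift
  // them into the second operand's index range.
  for (int i = 0; i != Size; ++i) {
    int M = RepeatMask[i % LaneSize];
    PermuteMask[i] = M;
    if (M < 0)
      continue;
    if (PermuteMask[i] >= LaneSize)
      PermuteMask[i] += Size - LaneSize;
    PermuteMask[i] += (i / LaneSize) * LaneSize;
  }
  std::printf("Final mask: %d %d %d %d\n", PermuteMask[0], PermuteMask[1],
              PermuteMask[2], PermuteMask[3]); // 0 4 2 6
  return 0;
}

For this mask the helper builds NewV1 = <0,1,4,5> (the low 128-bit lanes of both inputs, the vinsertf128 in the new AVX1 CHECK lines) and NewV2 = <2,3,6,7> (the high lanes, the vperm2f128), and the final repeated-lane mask <0,4,2,6> is exactly the single vunpcklpd the updated tests check for.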