Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -10576,11 +10576,15 @@ Subtarget, DAG)) return Blend; + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); - // If either input operand is a zero vector, use VPERM2X128 because its mask - // allows us to replace the zero input with an implicit zero. + // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding. + if (Subtarget.hasAVX2() && isSingleInputShuffleMask(Mask) && !IsV1Zero) + return SDValue(); + if (!IsV1Zero && !IsV2Zero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. @@ -11038,8 +11042,9 @@ SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) - return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, - DAG); + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return V; if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. @@ -11133,8 +11138,9 @@ SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) - return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, - DAG); + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -3,10 +3,15 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45670123: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45670123: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_45670123: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -23,30 +28,45 @@ } define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_01230123: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_01230123: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_01230123: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45674567: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45674567: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_45674567: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v32i8_2323: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_2323: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_2323: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -64,7 +84,7 @@ ; AVX2-LABEL: shuffle_v32i8_2323_domain: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -181,10 +201,15 @@ } define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67uu67: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67uu67: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67uu67: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -211,10 +236,15 @@ } define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu674567: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu674567: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu674567: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -231,10 +261,15 @@ } define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_4567uu67: -; ALL: ## BB#0: ## %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_4567uu67: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_4567uu67: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -455,7 +455,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -471,7 +471,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -487,7 +487,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -503,7 +503,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -519,7 +519,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -535,7 +535,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -551,7 +551,7 @@ ; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -818,7 +818,7 @@ ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -834,7 +834,7 @@ ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -850,7 +850,7 @@ ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -866,7 +866,7 @@ ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -882,7 +882,7 @@ ; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -902,7 +902,7 @@ ; AVX2-NEXT: movl $15, %eax ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle Index: test/CodeGen/X86/vector-shuffle-256-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v4.ll +++ test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -849,15 +849,15 @@ ; ; AVX2-LABEL: shuffle_v4i64_0451: ; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0451: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -893,14 +893,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_4015: ; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_4015: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX512VL-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -671,7 +671,7 @@ ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -679,31 +679,49 @@ } define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_32103210: -; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_32103210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_32103210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_76547654: -; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_76547654: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_76547654: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_76543210: -; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_76543210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_76543210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1764,7 +1782,7 @@ ; AVX2-LABEL: shuffle_v8i32_32103210: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1780,7 +1798,7 @@ ; AVX2-LABEL: shuffle_v8i32_76547654: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1796,7 +1814,7 @@ ; AVX2-LABEL: shuffle_v8i32_76543210: ; AVX2: # BB#0: ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2668,7 +2668,7 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32>