Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -12131,16 +12131,23 @@
                           : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size);
 
-  // Flip the vector, and blend the results which should now be in-lane. The
-  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
-  // 5 for the high source. The value 3 selects the high half of source 2 and
-  // the value 2 selects the low half of source 2. We only use source 2 to
-  // allow folding it into a memory operand.
-  unsigned PERMMask = 3 | 2 << 4;
+  // Flip the vector, and blend the results which should now be in-lane.
   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
-  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, PVT, DAG.getUNDEF(VT),
-                                DAG.getBitcast(PVT, V1),
-                                DAG.getConstant(PERMMask, DL, MVT::i8));
+  SDValue Flipped = DAG.getBitcast(PVT, V1);
+  if (Subtarget.hasAVX2()) {
+    // If we have AVX2 we should prefer to use VPERMQ/VPERMPD.
+    unsigned PERMMask = 0x4e;
+    Flipped = DAG.getNode(X86ISD::VPERMI, DL, PVT, Flipped,
+                          DAG.getConstant(PERMMask, DL, MVT::i8));
+  } else {
+    // The VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+    // 5 for the high source. The value 3 selects the high half of source 2 and
+    // the value 2 selects the low half of source 2. We only use source 2 to
+    // allow folding it into a memory operand.
+    unsigned PERMMask = 3 | 2 << 4;
+    Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, PVT, DAG.getUNDEF(VT),
+                          Flipped, DAG.getConstant(PERMMask, DL, MVT::i8));
+  }
   Flipped = DAG.getBitcast(VT, Flipped);
   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
 }
@@ -27667,8 +27674,11 @@
   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
 
   // Handle 128-bit lane shuffles of 256-bit vectors.
+  // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+  // we need to use the zeroing feature.
   // TODO - this should support binary shuffles.
   if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
+      !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
       return SDValue(); // Nothing to do!
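For reference, the two immediates juggled above encode the same half-swap in different ways. The sketch below is not part of the patch; it is a standalone illustration with invented helper names that decodes a VPERMQ/VPERMPD immediate and a VPERM2X128 immediate, showing why 0x4e and 3 | 2 << 4 both describe swapping the 128-bit halves, with the VPERM2X128 form reading both halves from its second source so a load can be folded.

// Standalone illustration (not part of the patch): decode the two shuffle
// immediates used above. The helper names are invented for this example.
#include <cstdio>

// VPERMQ/VPERMPD: two bits per destination 64-bit element select a source
// element, so 0x4e == 0b01'00'11'10 picks qwords [2,3,0,1] -- a swap of the
// two 128-bit halves of the single source operand.
static void decodeVPERMQ(unsigned Imm) {
  std::printf("vpermq $0x%02x selects qwords [", Imm);
  for (int i = 0; i != 4; ++i)
    std::printf("%u%s", (Imm >> (2 * i)) & 0x3, i == 3 ? "]\n" : ",");
}

// VPERM2X128: bits [1:0] choose the 128-bit half written to the low lane and
// bits [5:4] the half written to the high lane (0/1 = low/high half of the
// first source, 2/3 = low/high half of the second source); bits 3 and 7 zero
// the corresponding lane instead.
static void decodeVPERM2X128(unsigned Imm) {
  for (int Lane = 0; Lane != 2; ++Lane) {
    unsigned Sel = (Imm >> (4 * Lane)) & 0xf;
    if (Sel & 0x8)
      std::printf("%s lane <- zero\n", Lane ? "high" : "low");
    else
      std::printf("%s lane <- source %u, %s half\n", Lane ? "high" : "low",
                  ((Sel >> 1) & 1) + 1, (Sel & 1) ? "high" : "low");
  }
}

int main() {
  decodeVPERMQ(0x4e);           // qwords [2,3,0,1]
  decodeVPERM2X128(3 | 2 << 4); // low <- src2 high half, high <- src2 low half
  return 0;
}

Running it prints "vpermq $0x4e selects qwords [2,3,0,1]", "low lane <- source 2, high half" and "high lane <- source 2, low half", matching the comments in the hunk above. The test updates that follow show the resulting codegen change: unary half-swaps now use vpermq/vpermpd on AVX2 targets instead of vperm2i128/vperm2f128.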
Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -3,20 +3,30 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45670123: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45670123: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_45670123: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45670123_mem: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45670123_mem: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_45670123_mem: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa %b = load <8 x float>, <8 x float>* %pb @@ -42,7 +52,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_01230123: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -50,10 +60,15 @@ } define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_01230123_mem: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_01230123_mem: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_01230123_mem: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa %b = load <8 x float>, <8 x float>* %pb @@ -62,20 +77,30 @@ } define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45674567: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45674567: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_45674567: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_45674567_mem: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_45674567_mem: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] +; AVX1-NEXT: retq +; 
+; AVX2-LABEL: shuffle_v8f32_45674567_mem: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,2,3] +; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa %b = load <8 x float>, <8 x float>* %pb @@ -84,10 +109,15 @@ } define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v32i8_2323: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_2323: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_2323: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -107,7 +137,7 @@ ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -288,10 +318,15 @@ } define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_4567uu67: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_4567uu67: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_4567uu67: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -567,15 +567,10 @@ define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) { -; AVX2-LABEL: test_x86_avx2_vperm2i128: -; AVX2: ## BB#0: -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: retl -; -; AVX512-LABEL: test_x86_avx2_vperm2i128: -; AVX512: ## BB#0: -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512-NEXT: retl +; CHECK-LABEL: test_x86_avx2_vperm2i128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; CHECK-NEXT: retl %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1066,7 +1066,7 @@ ; AVX2-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = @@ -1385,7 +1385,7 @@ ; AVX2-NEXT: vmovdqu (%rdx), %ymm1 ; AVX2-NEXT: vmovdqu (%rcx), %ymm2 ; AVX2-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm1[1,0,2,2] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm4 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -159,7 +159,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -188,7 +188,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] @@ -216,7 +216,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq @@ -243,7 +243,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq @@ -489,17 +489,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -511,17 +505,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -533,17 +521,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -555,17 +537,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -577,17 +553,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -599,17 +569,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -621,17 +585,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1835,7 +1793,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1863,7 +1821,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] @@ -1893,7 +1851,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] @@ -1921,7 +1879,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] @@ -1953,7 +1911,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] @@ -1983,7 +1941,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] @@ -2011,9 +1969,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: @@ -2039,7 +1997,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] @@ -2068,7 +2026,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,24,25,24,25,24,25,24,25,16,17,16,17,16,17,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2096,7 +2054,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] @@ -2125,7 +2083,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] @@ -2154,7 +2112,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2183,7 +2141,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1,30,31,26,27,28,29,24,25,22,23,18,19,20,21,16,17] @@ -2211,7 +2169,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1,18,19,16,17,26,27,24,25,26,27,24,25,18,19,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2240,7 +2198,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1,26,27,24,25,18,19,16,17,26,27,24,25,18,19,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2269,7 +2227,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9,26,27,24,25,18,19,16,17,18,19,16,17,26,27,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2298,7 +2256,7 @@ ; ; AVX2-LABEL: 
shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1,16,17,24,25,24,25,16,17,16,17,24,25,24,25,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2327,7 +2285,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9,24,25,16,17,16,17,24,25,24,25,16,17,16,17,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2356,7 +2314,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7,20,21,28,29,24,25,16,17,26,27,18,19,30,31,22,23] @@ -2384,7 +2342,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7,20,21,16,17,28,29,24,25,26,27,18,19,30,31,22,23] @@ -2412,7 +2370,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11,20,21,28,29,24,25,16,17,18,19,22,23,30,31,26,27] @@ -2440,7 +2398,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-NEXT: retq @@ -2467,7 +2425,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,24,25,24,25,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2496,7 +2454,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9,24,25,24,25,16,17,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2525,7 +2483,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2554,7 +2512,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1,16,17,24,25,24,25,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2584,7 +2542,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -2613,7 +2571,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9,16,17,18,19,24,25,24,25,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,255,255,255,255,0,0,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> @@ -2642,7 +2600,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9,24,25,24,25,24,25,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,255,255,255,255,255,255,255,255,0,0,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255> @@ -2671,7 +2629,7 @@ ; ; AVX2-LABEL: 
shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -2792,7 +2750,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7,16,17,18,19,20,21,30,31,24,25,26,27,28,29,22,23] @@ -2819,10 +2777,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: @@ -2847,7 +2805,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-NEXT: retq @@ -2875,7 +2833,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] @@ -2906,10 +2864,10 @@ ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; 
AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: @@ -2935,10 +2893,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq ; @@ -2967,7 +2925,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] @@ -3001,7 +2959,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] @@ -3036,7 +2994,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] @@ -3071,7 +3029,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5,16,17,18,19,18,19,16,17,24,25,26,27,22,23,20,21] @@ -3102,7 +3060,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-NEXT: 
vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] @@ -3132,10 +3090,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: retq ; @@ -3168,7 +3126,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -3354,7 +3312,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23] @@ -3474,7 +3432,7 @@ ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -3519,7 +3477,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -3584,7 +3542,7 @@ ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] @@ -3630,7 +3588,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] @@ -3695,7 +3653,7 @@ ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] @@ -3742,7 +3700,7 @@ ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX2: # BB#0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -4027,16 +3985,16 @@ ; ; AVX2-LABEL: PR24935: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] 
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -4063,7 +4021,7 @@ ; ; AVX2-LABEL: PR34369: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,255,255,u,u,u,u,255,255,255,255,255,255,u,u,255,255,u,u,u,u,255,255> ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1],zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[4,5,30,31,16,17],zero,zero,ymm0[16,17,18,19,20,21,24,25,24,25] Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -304,7 +304,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 @@ -314,7 +314,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0 @@ -340,7 +340,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -348,7 +348,7 @@ ; ; AVX512VL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: movl $1, %eax ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} @@ -371,7 +371,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -379,7 +379,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: movw $1, %ax ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} @@ -402,7 +402,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -410,7 +410,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: movw $1, %ax ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} @@ -433,7 +433,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2OR512VL-NEXT: retq @@ -454,7 +454,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2OR512VL-NEXT: retq @@ -475,7 +475,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; 
AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2OR512VL-NEXT: retq @@ -496,7 +496,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2OR512VL-NEXT: retq @@ -524,7 +524,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -552,7 +552,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -580,7 +580,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -608,7 +608,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -636,7 +636,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -664,7 +664,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -692,7 +692,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -724,7 +724,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-NEXT: movl $15, %eax ; AVX512VL-NEXT: vmovd %eax, %xmm1 @@ -886,17 +886,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -908,17 +902,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -930,17 +918,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -952,17 +934,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -974,17 +950,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -998,21 +968,13 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2: # BB#0: -; AVX2-NEXT: movl $15, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, 
%xmm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movl $15, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: movl $15, %eax +; AVX2OR512VL-NEXT: vmovd %eax, %xmm1 +; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1659,30 +1621,30 @@ ; ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] ; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040 ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] ; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -706,49 +706,43 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_32103210: -; AVX2: # BB#0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_32103210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_32103210: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_76547654: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_76547654: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8f32_76547654: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_76547654: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_76543210: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_76543210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8f32_76543210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_76543210: +; AVX2OR512VL: # BB#0: +; 
AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1799,17 +1793,11 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_32103210: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_32103210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_32103210: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1821,17 +1809,11 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_76547654: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_76547654: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_76547654: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1843,17 +1825,11 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_76543210: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_76543210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_76543210: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -39,14 +39,14 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: ; KNL: ## BB#0: -; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] -; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] -; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = 
<0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> -; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 -; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] -; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> -; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] +; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] +; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> +; KNL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> +; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0> ; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; KNL-NEXT: retq @@ -63,10 +63,10 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: ; KNL: ## BB#0: -; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8,9,10,11],ymm2[12,13],ymm1[14],ymm2[15] ; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u] -; KNL-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3,0,1] +; KNL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15] ; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u] ; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] Index: test/CodeGen/X86/vector-shuffle-512-v64.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v64.ll +++ test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -153,9 +153,9 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512F-NEXT: retq ; @@ -169,9 +169,9 @@ ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; 
AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512DQ-NEXT: retq ; @@ -436,11 +436,11 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u> ; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512F-NEXT: retq @@ -449,12 +449,12 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u> ; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -463,11 +463,11 @@ ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u> ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512DQ-NEXT: retq @@ -487,12 +487,12 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 @@ -503,13 +503,13 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, 
%ymm2 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -519,12 +519,12 @@ ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2241,12 +2241,12 @@ define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) { ; AVX512F-LABEL: shuffle_v8f64_2301uuuu: ; AVX512F: # BB#0: -; AVX512F-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_2301uuuu: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX512F-32-NEXT: retl %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> ret <8 x double> %1 Index: test/CodeGen/X86/vector-shuffle-combining-avx.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX 
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 ; ; Combine tests involving AVX target shuffles @@ -149,15 +149,35 @@ } define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) { -; X32-LABEL: combine_vpermilvar_vperm2f128_8f32: -; X32: # BB#0: -; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; X32-NEXT: retl +; X32-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-AVX1-NEXT: retl ; -; X64-LABEL: combine_vpermilvar_vperm2f128_8f32: -; X64: # BB#0: -; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; X64-NEXT: retq +; X32-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-AVX2-NEXT: retl +; +; X32-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X32-AVX512: # BB#0: +; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X64-AVX1: # BB#0: +; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32: +; X64-AVX512: # BB#0: +; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-AVX512-NEXT: retq %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> ) Index: test/CodeGen/X86/vector-shuffle-combining-avx2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -503,14 +503,14 @@ ; X32-LABEL: combine_pshufb_as_zext128: ; X32: # BB#0: ; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero ; X32-NEXT: retl ; ; X64-LABEL: combine_pshufb_as_zext128: ; X64: # BB#0: ; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero ; X64-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ 
test/CodeGen/X86/vector-shuffle-combining.ll @@ -2549,7 +2549,7 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -132,11 +132,11 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) { ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512F: # BB#0: -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0] -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: Index: test/CodeGen/X86/vector-shuffle-v48.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v48.ll +++ test/CodeGen/X86/vector-shuffle-v48.ll @@ -5,7 +5,7 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovdqu 32(%rdi), %xmm0 ; CHECK-NEXT: vmovdqu (%rdi), %ymm1 -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u> Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -772,14 +772,14 @@ ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],xmm5[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm14 ; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -855,14 +855,14 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm5 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm12 ; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13 ; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 ; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] @@ -1227,7 +1227,7 @@ ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5,21,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26,u] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255> ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 @@ -1252,7 +1252,7 @@ ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5,21,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26,u] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255> ; AVX512-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 @@ -1859,7 +1859,7 @@ ; AVX2-NEXT: vmovdqu 96(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> ; AVX2-NEXT: vpblendvb %ymm1, %ymm14, %ymm12, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; AVX2-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 @@ -1869,7 +1869,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm13, %xmm3, %xmm0 ; AVX2-NEXT: vpor %xmm4, %xmm0, %xmm0 @@ -1878,12 +1878,12 @@ ; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX2-NEXT: vpshufb %xmm11, 
%xmm10, %xmm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-NEXT: vpshufb %xmm13, %xmm7, %xmm1 ; AVX2-NEXT: vpor %xmm4, %xmm1, %xmm1 @@ -1892,14 +1892,14 @@ ; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm12, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX2-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm15, %ymm11 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX2-NEXT: vpshufb %xmm13, %xmm3, %xmm4 @@ -1918,11 +1918,11 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u> ; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm4 ; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm14, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX2-NEXT: vpshufb %xmm5, %xmm10, %xmm6 @@ -1948,33 +1948,33 @@ ; AVX512-LABEL: interleaved_load_vf64_i8_stride3: ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 128(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm14 ; AVX512-NEXT: vpblendvb %ymm10, %ymm1, %ymm14, %ymm3 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX512-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,1,4,7,10,13,16,19,22,25,28,31,18,21,24,27,30,17,20,23,26,29] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm9, %ymm4 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255> ; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,16,19,22,25,28,31,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm8[2,3,0,1] 
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm13[1,4,7,10,13] -; AVX512-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm12[1,4,7,10,13] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX512-NEXT: vextracti64x4 $1, %zmm8, %ymm7 +; AVX512-NEXT: vextracti64x4 $1, %zmm9, %ymm7 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm7[u,u,u,u,u] @@ -1985,25 +1985,25 @@ ; AVX512-NEXT: vmovdqu8 %zmm11, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u> ; AVX512-NEXT: vpblendvb %ymm11, %ymm14, %ymm1, %ymm4 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX512-NEXT: vpblendvb %ymm15, %ymm4, %ymm6, %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,5,8,11,14,17,20,23,26,29,16,19,22,25,28,31,18,21,24,27,30] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm16 -; AVX512-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm6 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm6[2,3,0,1] +; AVX512-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = <0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u> ; AVX512-NEXT: vpblendvb %ymm15, %ymm6, %ymm10, %ymm6 ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero +; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4 @@ -2011,21 +2011,21 @@ ; AVX512-NEXT: vmovdqu8 %zmm16, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> ; AVX512-NEXT: vpblendvb %ymm4, %ymm1, %ymm14, %ymm1 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255> ; AVX512-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,3,6,9,12,15,18,21,24,27,30,17,20,23,26,29,16,19,22,25,28,31] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512-NEXT: vpblendvb %ymm11, %ymm9, %ymm0, %ymm0 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u> ; AVX512-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u],zero,zero,zero,zero,zero,xmm13[0,3,6,9,12,15] -; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,u],zero,zero,zero,zero,zero,xmm12[0,3,6,9,12,15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX512-NEXT: vpblendvb %ymm9, %ymm0, %ymm4, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[1,4,7,10,13,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[2,5,8,11,14],zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4