Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -9762,7 +9762,6 @@
   case MVT::v8f32:
     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                        DAG.getConstant(BlendMask, DL, MVT::i8));
-
   case MVT::v4i64:
   case MVT::v8i32:
     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
@@ -9794,7 +9793,6 @@
            DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                        DAG.getConstant(BlendMask, DL, MVT::i8)));
   }
-
   case MVT::v16i16: {
     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
     SmallVector<int, 8> RepeatedMask;
@@ -9808,7 +9806,16 @@
       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8));
     }
-    LLVM_FALLTHROUGH;
+    // Use PBLENDW for lower/upper lanes and then blend lanes.
+    uint64_t LoMask = BlendMask & 0xFF;
+    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
+    SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                             DAG.getConstant(LoMask, DL, MVT::i8));
+    SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                             DAG.getConstant(HiMask, DL, MVT::i8));
+    return DAG.getVectorShuffle(
+        MVT::v16i16, DL, Lo, Hi,
+        {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
   }
   case MVT::v16i8:
   case MVT::v32i8: {
Index: test/CodeGen/X86/insertelement-ones.ll
===================================================================
--- test/CodeGen/X86/insertelement-ones.ll
+++ test/CodeGen/X86/insertelement-ones.ll
@@ -290,39 +290,25 @@
 ;
 ; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 =
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: movw $64, %ax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000 -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 %3 = insertelement <16 x i16> %2, i16 -1, i32 15 Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1235,8 +1235,9 @@ ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] Index: test/CodeGen/X86/prefer-avx256-mask-shuffle.ll =================================================================== --- test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -156,8 +156,9 @@ ; AVX256VL-NEXT: vpmovdw %ymm3, %xmm3 ; AVX256VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1] -; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX256VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX256VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15] +; AVX256VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] +; 
AVX256VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm2 ; AVX256VL-NEXT: vpslld $31, %ymm2, %ymm2 ; AVX256VL-NEXT: vptestmd %ymm2, %ymm2, %k1 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -228,8 +228,8 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: @@ -237,8 +237,8 @@ ; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: @@ -265,8 +265,8 @@ ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq ; @@ -904,18 +904,11 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000 -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret 
<16 x i16> %shuffle } @@ -929,18 +922,11 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -954,18 +940,12 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -979,18 +959,12 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55 -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2134,8 +2108,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: @@ -2160,8 +2134,8 @@ ; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: retq ; @@ -2190,8 +2164,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; @@ -2222,8 +2196,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: @@ -2231,8 +2205,8 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: @@ -2263,8 +2237,8 @@ ; AVX2-SLOW-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: @@ -2272,8 +2246,8 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: @@ -2299,8 +2273,8 @@ ; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: retq ; @@ -2355,8 +2329,8 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX2-SLOW-NEXT: retq @@ -2364,8 +2338,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27] ; AVX2-FAST-NEXT: retq ; @@ -2394,8 +2368,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,24,25,24,25,24,25,24,25,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: @@ -2420,8 +2394,8 @@ ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: retq ; @@ -2449,8 +2423,8 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX2-SLOW-NEXT: retq @@ -2458,8 +2432,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27] ; AVX2-FAST-NEXT: retq ; @@ -2488,16 +2462,16 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: @@ -2523,8 +2497,8 @@ ; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1,30,31,26,27,28,29,24,25,22,23,18,19,20,21,16,17] ; AVX2-NEXT: retq ; @@ -2553,8 +2527,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1,18,19,16,17,26,27,24,25,26,27,24,25,18,19,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: @@ -2582,8 +2556,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1,26,27,24,25,18,19,16,17,26,27,24,25,18,19,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: @@ -2611,8 +2585,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9,26,27,24,25,18,19,16,17,18,19,16,17,26,27,24,25] -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: @@ -2640,8 +2614,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1,16,17,24,25,24,25,16,17,16,17,24,25,24,25,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: @@ -2669,8 +2643,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9,24,25,16,17,16,17,24,25,24,25,16,17,16,17,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: @@ -2696,8 +2670,8 @@ ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7,20,21,28,29,24,25,16,17,26,27,18,19,30,31,22,23] ; AVX2-NEXT: retq ; @@ -2724,8 +2698,8 @@ ; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7,20,21,16,17,28,29,24,25,26,27,18,19,30,31,22,23] ; AVX2-NEXT: retq ; @@ -2752,8 +2726,8 @@ ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11,20,21,28,29,24,25,16,17,18,19,22,23,30,31,26,27] ; AVX2-NEXT: retq ; @@ -2816,8 +2790,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,24,25,24,25,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: @@ -2845,8 +2819,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9,24,25,24,25,16,17,16,17,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: @@ -2874,8 +2848,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: @@ -2903,8 +2877,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1,16,17,24,25,24,25,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: @@ -2933,16 +2907,16 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} 
ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: @@ -2970,8 +2944,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9,16,17,18,19,24,25,24,25,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,255,255,255,255,0,0,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: @@ -2999,8 +2973,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9,24,25,24,25,24,25,16,17,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,255,255,255,255,255,255,255,255,0,0,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: @@ -3028,8 +3002,8 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: @@ -3133,8 +3107,8 @@ ; AVX2-LABEL: 
shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7,16,17,18,19,20,21,30,31,24,25,26,27,28,29,22,23] ; AVX2-NEXT: retq ; @@ -3159,10 +3133,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: @@ -3223,8 +3197,8 @@ ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; @@ -3253,10 +3227,10 @@ ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: @@ -3315,8 +3289,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] ; AVX2-NEXT: retq ; @@ -3393,8 +3367,8 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] @@ -3404,8 +3378,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -3438,8 +3412,8 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -3448,8 +3422,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
<255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -3479,8 +3453,8 @@ ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: retq ; @@ -3546,17 +3520,17 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: @@ -3740,8 +3714,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23] ; AVX2-NEXT: retq ; @@ -3868,8 +3842,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq ; @@ -3913,8 +3887,8 @@ ; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq ; @@ -3978,8 +3952,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq ; @@ -4024,8 +3998,8 @@ ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq ; @@ -4089,8 +4063,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; 
AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
 ; AVX2-NEXT: retq
 ;
@@ -4136,8 +4110,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX2-NEXT: retq
 ;
@@ -4523,15 +4497,17 @@
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7],ymm1[8,9,10],ymm2[11],ymm1[12,13,14],ymm2[15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7],ymm1[8,9,10,11,12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6],ymm1[7],ymm0[8,9],ymm1[10,11],ymm0[12,13,14],ymm1[15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7,8,9],ymm0[10],ymm1[11,12,13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-SLOW-NEXT: retq
 ;
 ; AVX2-FAST-LABEL: PR24935:
@@ -4539,14 +4515,16 @@
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7],ymm1[8,9,10],ymm2[11],ymm1[12,13,14],ymm2[15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7],ymm1[8,9,10,11,12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6],ymm1[7],ymm0[8,9],ymm1[10,11],ymm0[12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7,8,9],ymm0[10],ymm1[11,12,13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FAST-NEXT: retq
 ;
 ; AVX512VL-LABEL: PR24935:
@@ -4581,8 +4559,8 @@
 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -388,18 +388,16 @@
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: movw $1, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT: retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -425,18 +423,16 @@
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: movw $1, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT: retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -2555,8 +2551,9 @@
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
Index: test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -40,15 +40,17 @@
 ; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
-; KNL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
-; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
-; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0>
-; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
+; KNL-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5,6,7],ymm3[8,9],ymm2[10],ymm3[11,12],ymm2[13,14,15]
+; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7,8,9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
@@ -72,8 +74,8 @@
 ; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
 ; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
-; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,6],ymm3[7],ymm1[8,9,10,11,12,13,14],ymm3[15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
 ; KNL-NEXT: retq
Index: test/CodeGen/X86/vector-shuffle-v48.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v48.ll
+++ test/CodeGen/X86/vector-shuffle-v48.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx2 < %s | FileCheck %s
+
 define <32 x i8> @foo(<48 x i8>* %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
@@ -12,8 +13,8 @@
 ; CHECK-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
-; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT: retq
 %1 = load <48 x i8>, <48 x i8>* %x0, align 1
 %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32>