Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -6663,6 +6663,10 @@ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -9071,6 +9075,10 @@ return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {// Low half. 0, 16, 1, 17, 2, 18, 3, 19, Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -470,28 +470,14 @@ } define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) { -; SSE2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; SSE2: # BB#0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; SSE: # BB#0: +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero +; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -951,17 +951,15 @@ ; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,1,128,3,128,5,128,7,128,9,128,11,128,13,128,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-zext.ll +++ llvm/trunk/test/CodeGen/X86/vector-zext.ll @@ -1092,17 +1092,16 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) { ; SSE2-LABEL: shuf_zext_8i8_to_8i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuf_zext_8i8_to_8i32: