diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19930,9 +19930,10 @@
   if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
     return false;
 
-  // i8 is better to be widen to i16, because there is PBLENDW for vXi16
-  // when the vector bit size is 128 or 256.
-  if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
+  // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
+  // are preferable to blendw/blendvb/masked-mov.
+  if ((VT == MVT::i16 || VT == MVT::i8) &&
+      V1.getSimpleValueType().getSizeInBits() < 512)
     return false;
 
   auto HasMaskOperation = [&](SDValue V) {
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,26 +2889,12 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX512BW-NEXT: retq
+; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2ORLATER: # %bb.0:
+; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2ORLATER-NEXT: retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -66,13 +66,13 @@
 ; X86-AVX512-LABEL: blendw_to_blendd_32:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; X86-AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X86-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X86-AVX512-NEXT: retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_32:
 ; X64-AVX512: # %bb.0:
 ; X64-AVX512-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X64-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X64-AVX512-NEXT: retq
   %x1 = add <16 x i16> %x, %z
   %shuffle = shufflevector <16 x i16> %x1, <16 x i16> %y, <16 x i32>
@@ -119,13 +119,13 @@
 ; X86-AVX512-LABEL: blendw_to_blendd_16:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: vpaddw %xmm2, %xmm0, %xmm0
-; X86-AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X86-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X86-AVX512-NEXT: retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_16:
 ; X64-AVX512: # %bb.0:
 ; X64-AVX512-NEXT: vpaddw %xmm2, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-AVX512-NEXT: retq
   %x1 = add <8 x i16> %x, %z
   %shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32>
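
A minimal standalone sketch of the pattern this patch targets, assuming an AVX512BW+VL target; the function name, shuffle mask, and llc invocation are illustrative and not taken from the tests above. The i16 blend below keeps adjacent i16 pairs together, so with this change it should lower to vpblendd rather than vpblendw, vpblendvb, or a mask-register blend:

; Illustrative only: a 128-bit i16 blend whose mask selects whole 32-bit lanes,
; i.e. dwords 0 and 2 from %a, dwords 1 and 3 from %b.
; Example run: llc -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl < blend.ll
define <8 x i16> @i16_blend_as_dword_blend(<8 x i16> %a, <8 x i16> %b) {
  %blend = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
  ret <8 x i16> %blend
}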