diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13076,6 +13076,33 @@
   return ScaledMask;
 }
 
+// Helper to attempt to widen/narrow blend masks.
+static bool adjustBlendMask(uint64_t OldMask, uint64_t OldWidth,
+                            uint64_t NewWidth, uint64_t *pNewMask = nullptr) {
+  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
+         "Illegal blend mask scale");
+  uint64_t NewMask = 0;
+
+  if ((OldWidth % NewWidth) == 0) {
+    uint64_t Scale = OldWidth / NewWidth;
+    uint64_t SubMask = (1u << Scale) - 1;
+    for (uint64_t i = 0; i != NewWidth; ++i) {
+      uint64_t Sub = (OldMask >> (i * Scale)) & SubMask;
+      if (Sub == SubMask)
+        NewMask |= (1u << i);
+      else if (Sub != 0x0)
+        return false;
+    }
+  } else {
+    NewMask = scaleVectorShuffleBlendMask(OldMask, OldWidth, NewWidth / OldWidth);
+  }
+
+  if (pNewMask)
+    *pNewMask = NewMask;
+  return true;
+}
+
+
 /// Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -13102,6 +13129,35 @@
 
   unsigned NumElts = VT.getVectorNumElements();
 
+  auto Geti16Blend = [&](MVT TmpVT, unsigned NewWidth) {
+    uint64_t NewBlendMask = 0;
+    if (adjustBlendMask(BlendMask, NewWidth * 2, NewWidth, &NewBlendMask)) {
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::BLENDI, DL, TmpVT, DAG.getBitcast(TmpVT, V1),
+                          DAG.getBitcast(TmpVT, V2),
+                          DAG.getTargetConstant(NewBlendMask, DL, MVT::i8)));
+    }
+    return SDValue();
+  };
+
+  // blendw is slower than blendps/blendd on many targets. If we can scale
+  // blendw -> blendps, do so. NB: We go for blendps because anytime blendw is
+  // available for a given width, blendps is also available.
+  // X86ExecutionDomainFix will convert back to blendw (or to blendd) if it's
+  // needed.
+  switch (VT.SimpleTy) {
+  case MVT::v8i16:
+    if (SDValue R = Geti16Blend(MVT::v4f32, 4))
+      return R;
+    break;
+  case MVT::v16i16:
+    if (SDValue R = Geti16Blend(MVT::v8f32, 8))
+      return R;
+    break;
+  default:
+    break;
+  }
+
   switch (VT.SimpleTy) {
   case MVT::v4i64:
   case MVT::v8i32:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,26 +2889,12 @@
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX512BW-NEXT:    retq
+; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2ORLATER:       # %bb.0:
+; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2ORLATER-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2ORLATER-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -66,13 +66,13 @@
 ; X86-AVX512-LABEL: blendw_to_blendd_32:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X86-AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X86-AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_32:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X64-AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X64-AVX512-NEXT:    retq
   %x1 = add <16 x i16> %x, %z
   %shuffle = shufflevector <16 x i16> %x1, <16 x i16> %y, <16 x i32>
@@ -119,13 +119,13 @@
 ; X86-AVX512-LABEL: blendw_to_blendd_16:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X86-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_16:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-AVX512-NEXT:    retq
   %x1 = add <8 x i16> %x, %z
   %shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32>
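
For reference, below is a standalone sketch of the narrowing arithmetic that adjustBlendMask performs; it is not part of the patch, and narrowBlendMask plus the sample immediates are illustrative only. A vpblendw immediate can be rewritten as a vpblendd/blendps immediate exactly when each pair of adjacent word-mask bits covering one dword is all-ones or all-zeros, which is why the 0xF0 word blend in blendw_to_blendd_16 above becomes the 0xC dword blend (xmm0[0,1],xmm1[2,3]).

// Standalone sketch (C++17) mirroring the narrowing branch of adjustBlendMask.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <optional>

// Try to narrow a blend mask from OldWidth elements to NewWidth elements,
// where OldWidth is a multiple of NewWidth. Each group of OldWidth/NewWidth
// bits must be all-ones or all-zeros for the conversion to be possible.
static std::optional<uint64_t> narrowBlendMask(uint64_t OldMask,
                                               uint64_t OldWidth,
                                               uint64_t NewWidth) {
  assert(OldWidth % NewWidth == 0 && "expected an exact narrowing factor");
  uint64_t Scale = OldWidth / NewWidth;
  uint64_t SubMask = (uint64_t(1) << Scale) - 1;
  uint64_t NewMask = 0;
  for (uint64_t I = 0; I != NewWidth; ++I) {
    uint64_t Sub = (OldMask >> (I * Scale)) & SubMask;
    if (Sub == SubMask)
      NewMask |= uint64_t(1) << I; // whole group selects the second source
    else if (Sub != 0)
      return std::nullopt;         // group is split; wider blend can't express it
  }
  return NewMask;
}

int main() {
  // vpblendw xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] uses imm 0xF0; it narrows to
  // the vpblendd/blendps imm 0xC, i.e. xmm0[0,1],xmm1[2,3], as in the tests.
  if (auto M = narrowBlendMask(0xF0, 8, 4))
    std::printf("0xF0 (v8i16) -> 0x%llX (v4i32)\n", (unsigned long long)*M);

  // A mask that splits a dword group, e.g. 0x10 (only word element 4), cannot
  // be expressed as a dword blend, so the conversion is rejected.
  if (!narrowBlendMask(0x10, 8, 4))
    std::printf("0x10 (v8i16) cannot be narrowed\n");
  return 0;
}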