diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36226,12 +36226,58 @@
       IsBlend = false;
       break;
     }
-    if (IsBlend &&
-        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
-        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
-      Shuffle = ISD::OR;
-      SrcVT = DstVT = MaskVT.changeTypeToInteger();
-      return true;
+    if (IsBlend) {
+      if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+          DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+        Shuffle = ISD::OR;
+        SrcVT = DstVT = MaskVT.changeTypeToInteger();
+        return true;
+      }
+      if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
+        // FIXME: handle mismatched sizes?
+        // TODO: investigate if `ISD::OR` handling in
+        // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
+        auto computeKnownBitsElementWise = [&DAG](SDValue V) {
+          unsigned NumElts = V.getValueType().getVectorNumElements();
+          KnownBits Known(NumElts);
+          for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+            APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+            KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
+            if (PeepholeKnown.isZero())
+              Known.Zero.setBit(EltIdx);
+            if (PeepholeKnown.isAllOnes())
+              Known.One.setBit(EltIdx);
+          }
+          return Known;
+        };
+
+        KnownBits V1Known = computeKnownBitsElementWise(V1);
+        KnownBits V2Known = computeKnownBitsElementWise(V2);
+
+        for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
+          int M = Mask[i];
+          if (M == SM_SentinelUndef)
+            continue;
+          if (M == SM_SentinelZero) {
+            IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
+            continue;
+          }
+          if (M == (int)i) {
+            IsBlend &= V2Known.Zero[i] || V1Known.One[i];
+            continue;
+          }
+          if (M == (int)(i + NumMaskElts)) {
+            IsBlend &= V1Known.Zero[i] || V2Known.One[i];
+            continue;
+          }
+          llvm_unreachable("will not get here.");
+        }
+        if (IsBlend) {
+          Shuffle = ISD::OR;
+          SrcVT = DstVT = MaskVT.changeTypeToInteger();
+          return true;
+        }
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,11 +280,8 @@
 ;
 ; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -315,26 +312,22 @@
 ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm2
 ; SSE3-NEXT:    pandn %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT:    por %xmm1, %xmm2
 ; SSE3-NEXT:    por %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -344,7 +337,7 @@
 ; SSSE3-NEXT:    movd %eax, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
 ; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
@@ -372,41 +365,31 @@
 ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm3
 ; SSE3-NEXT:    pandn %xmm3, %xmm2
-; SSE3-NEXT:    por %xmm2, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm3, %xmm4
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT:    por %xmm4, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    por %xmm4, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm0
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT:    por %xmm4, %xmm3
 ; SSE3-NEXT:    por %xmm3, %xmm1
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm4, %xmm1
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
@@ -415,15 +398,13 @@
 ; SSSE3-NEXT:    movd %eax, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
 ; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
+; SSSE3-NEXT:    por %xmm0, %xmm3
 ; SSSE3-NEXT:    por %xmm3, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
-; SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
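
Note (not part of the patch): a minimal standalone sketch of the per-element blend-to-OR check the hunk above introduces, with plain vectors standing in for DAG.computeKnownBits(). The names here (EltKnown, canLowerBlendAsOr) are hypothetical illustrations, not LLVM API; the sentinel values mirror SM_SentinelUndef/SM_SentinelZero.

// Illustrative sketch, assuming per-lane known-zero/known-ones info is
// already available; not the LLVM implementation.
#include <cassert>
#include <cstdio>
#include <vector>

constexpr int SentinelUndef = -1; // stands in for SM_SentinelUndef
constexpr int SentinelZero = -2;  // stands in for SM_SentinelZero

struct EltKnown {
  std::vector<bool> Zero; // lane is known all-zeros
  std::vector<bool> One;  // lane is known all-ones
};

// A blend of V1/V2 can be emitted as (V1 | V2) iff, for every lane, the
// source we do NOT take is known-zero, or the source we DO take is
// known-ones (so OR-ing the other source in cannot change the result).
bool canLowerBlendAsOr(const std::vector<int> &Mask, const EltKnown &V1,
                       const EltKnown &V2) {
  int NumElts = (int)Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SentinelUndef)
      continue;
    if (M == SentinelZero) {
      if (!(V1.Zero[i] && V2.Zero[i])) // both sides must be zero here
        return false;
    } else if (M == i) { // lane i taken from V1
      if (!(V2.Zero[i] || V1.One[i]))
        return false;
    } else { // lane i taken from V2
      assert(M == i + NumElts && "not a blend mask");
      if (!(V1.Zero[i] || V2.One[i]))
        return false;
    }
  }
  return true;
}

int main() {
  // 4-lane blend <0,5,2,7>: lanes 0,2 from V1; lanes 1,3 from V2.
  // V1 is known-zero in lanes 1,3 and V2 in lanes 0,2, so OR is safe.
  std::vector<int> Mask = {0, 5, 2, 7};
  EltKnown V1 = {{false, true, false, true}, {false, false, false, false}};
  EltKnown V2 = {{true, false, true, false}, {false, false, false, false}};
  std::printf("blend as OR: %s\n",
              canLowerBlendAsOr(Mask, V1, V2) ? "yes" : "no"); // prints "yes"
}

This is also why the test diffs above lose their pand/vandps instructions: once every masked-off lane is proven zero (or the kept lane proven all-ones), the AND that cleared the other source is redundant and the blend collapses to a plain OR.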