diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36226,12 +36226,61 @@
       IsBlend = false;
       break;
     }
-    if (IsBlend &&
-        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
-        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
-      Shuffle = ISD::OR;
-      SrcVT = DstVT = MaskVT.changeTypeToInteger();
-      return true;
+    if (IsBlend) {
+      if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+          DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+        Shuffle = ISD::OR;
+        SrcVT = DstVT = MaskVT.changeTypeToInteger();
+        return true;
+      } else if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
+        APInt V1KnownZeroElts = APInt::getZero(NumV1Elts);
+        APInt V1KnownOneElts = APInt::getZero(NumV1Elts);
+        for (unsigned V1EltIdx = 0; V1EltIdx != NumV1Elts; ++V1EltIdx) {
+          APInt Mask = APInt::getZero(NumV1Elts);
+          Mask.setBit(V1EltIdx);
+          KnownBits ZZ = DAG.computeKnownBits(V1, Mask);
+          if (ZZ.isZero())
+            V1KnownZeroElts.setBit(V1EltIdx);
+          if (ZZ.isAllOnes())
+            V1KnownOneElts.setBit(V1EltIdx);
+        }
+
+        APInt V2KnownZeroElts = APInt::getZero(NumV2Elts);
+        APInt V2KnownOneElts = APInt::getZero(NumV2Elts);
+        for (unsigned V2EltIdx = 0; V2EltIdx != NumV2Elts; ++V2EltIdx) {
+          APInt Mask = APInt::getZero(NumV2Elts);
+          Mask.setBit(V2EltIdx);
+          KnownBits ZZ = DAG.computeKnownBits(V2, Mask);
+          if (ZZ.isZero())
+            V2KnownZeroElts.setBit(V2EltIdx);
+          if (ZZ.isAllOnes())
+            V2KnownOneElts.setBit(V2EltIdx);
+        }
+
+        for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
+          int M = Mask[i];
+          if (M == SM_SentinelUndef)
+            continue;
+          if (M == SM_SentinelZero) {
+            IsBlend &= V1KnownZeroElts[i] && V2KnownZeroElts[i];
+            continue;
+          }
+          if (M == (int)i) {
+            IsBlend &= V2KnownZeroElts[i] || V1KnownOneElts[i];
+            continue;
+          }
+          if (M == (int)(i + NumMaskElts)) {
+            IsBlend &= V1KnownZeroElts[i] || V2KnownOneElts[i];
+            continue;
+          }
+          llvm_unreachable("will not get here.");
+        }
+        if (IsBlend) {
+          Shuffle = ISD::OR;
+          SrcVT = DstVT = MaskVT.changeTypeToInteger();
+          return true;
+        }
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,11 +280,8 @@
 ;
 ; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -315,26 +312,22 @@
 ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm2
 ; SSE3-NEXT:    pandn %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT:    por %xmm1, %xmm2
 ; SSE3-NEXT:    por %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -344,7 +337,7 @@
 ; SSSE3-NEXT:    movd %eax, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
 ; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
@@ -372,41 +365,31 @@
 ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm3
 ; SSE3-NEXT:    pandn %xmm3, %xmm2
-; SSE3-NEXT:    por %xmm2, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm3, %xmm4
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT:    por %xmm4, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    por %xmm4, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm0
 ; SSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT:    por %xmm4, %xmm3
 ; SSE3-NEXT:    por %xmm3, %xmm1
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm4, %xmm1
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
@@ -415,15 +398,13 @@
 ; SSSE3-NEXT:    movd %eax, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
 ; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
+; SSSE3-NEXT:    por %xmm0, %xmm3
 ; SSSE3-NEXT:    por %xmm3, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
-; SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
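The new else-if path generalizes the existing known-zero fold: a blend of V1 and V2 can still be lowered as a plain OR when, for every lane, either the source the mask does not select is known to be zero (x | 0 == x) or the source it does select is known to be all-ones (x | ~0 == ~0). That is why the vandps/pand masking instructions disappear from the checks above: the all-ones lanes being inserted no longer need the unselected source cleared first. Below is a minimal standalone sketch of that per-lane test, assuming nothing from LLVM; KnownLane, canLowerBlendAsOr, and the sample data in main() are hypothetical names and values invented for illustration, not LLVM's API.

// Per-lane blend -> OR legality test, modeled on the patch above.
// Standalone sketch: KnownLane and canLowerBlendAsOr are hypothetical.
#include <cassert>
#include <cstdio>
#include <vector>

// Sentinels mirroring LLVM's SM_SentinelUndef / SM_SentinelZero.
constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// What computeKnownBits would tell us about one demanded element.
struct KnownLane {
  bool KnownZero;    // every bit of this lane is known to be 0
  bool KnownAllOnes; // every bit of this lane is known to be 1
};

// Returns true if blend(V1, V2, Mask) may be emitted as OR(V1, V2).
bool canLowerBlendAsOr(const std::vector<KnownLane> &V1,
                       const std::vector<KnownLane> &V2,
                       const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  assert(static_cast<int>(V1.size()) == NumElts &&
         static_cast<int>(V2.size()) == NumElts);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue; // undef lane: any result is acceptable
    if (M == SM_SentinelZero) {
      // OR yields 0 here only if both inputs are known zero.
      if (!(V1[i].KnownZero && V2[i].KnownZero))
        return false;
    } else if (M == i) {
      // Lane taken from V1: V2 must be zero, or V1 already all-ones.
      if (!(V2[i].KnownZero || V1[i].KnownAllOnes))
        return false;
    } else if (M == i + NumElts) {
      // Lane taken from V2: V1 must be zero, or V2 already all-ones.
      if (!(V1[i].KnownZero || V2[i].KnownAllOnes))
        return false;
    } else {
      return false; // not a blend mask at all
    }
  }
  return true;
}

int main() {
  // insert_v16i16-style example (8 lanes for brevity): lanes 0 and 6
  // take an all-ones constant from V2, the rest pass V1 through, and
  // V2 is known zero everywhere else -- so the blend becomes an OR.
  std::vector<KnownLane> V1(8, {false, false});
  std::vector<KnownLane> V2(8, {true, false});
  V2[0] = V2[6] = {false, true};
  std::vector<int> Mask = {8, 1, 2, 3, 4, 5, 14, 7};
  std::printf("blend -> OR? %s\n",
              canLowerBlendAsOr(V1, V2, Mask) ? "yes" : "no");
  return 0;
}

The patch itself derives these per-lane facts by querying DAG.computeKnownBits once per element with a single demanded-elements bit set, which is also why the new path is restricted to the NumV1Elts == NumV2Elts == NumMaskElts case.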