diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37908,6 +37908,50 @@ resolveTargetShuffleInputsAndMask(Ops, Mask); } + // Try to refine our inputs given our knowledge of target shuffle mask. + for (auto I : enumerate(Ops)) { + int OpIdx = I.index(); + SDValue &Op = I.value(); + + // What range of shuffle mask element values results in picking from Op? + int lo = OpIdx * Mask.size(); + int hi = lo + Mask.size(); + + // Which elements of Op do we demand? + SmallVector OpDemandedIdentityMask(Mask.size(), -1); + for (int MaskElt : Mask) { + if (isInRange(MaskElt, lo, hi)) { // Picks from Op? + int OpEltIdx = MaskElt - lo; + OpDemandedIdentityMask[OpEltIdx] = OpEltIdx; + } + } + + unsigned NumOpElts = Op.getValueType().getVectorNumElements(); + + SmallVector ScaledOpDemandedIdentityMask; + bool scaled = scaleShuffleElements(OpDemandedIdentityMask, NumOpElts, + ScaledOpDemandedIdentityMask); + (void)scaled; + assert(scaled && + "We should always succeed in scaling the identity shuffle mask!"); + assert(isSequentialOrUndefInRange(ScaledOpDemandedIdentityMask, 0, + NumOpElts, 0) && + "Should still have an identity mask after scaling!"); + + // Transform (scaled) identity shuffle mask into a demandedelts mask. + APInt DemandedOpElts = APInt::getNullValue(NumOpElts); + for (int ScaledOpDemandedIdentityMaskElt : ScaledOpDemandedIdentityMask) + if (ScaledOpDemandedIdentityMaskElt >= 0) + DemandedOpElts.setBit(ScaledOpDemandedIdentityMaskElt); + + // Can this operand be simplified any further, given its demanded elements? + if (SDValue NewOp = + DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts( + Op, DemandedOpElts, DAG)) + Op = NewOp; + } + // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now? + // We can only combine unary and binary shuffle mask cases. 
if (Ops.size() <= 2) { // Minor canonicalization of the accumulated shuffle mask to make it easier diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -382,9 +382,11 @@ ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq @@ -402,9 +404,11 @@ ; SSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE3-NEXT: por %xmm4, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; SSE3-NEXT: pand %xmm5, %xmm1 ; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE3-NEXT: por %xmm3, %xmm1 +; SSE3-NEXT: pandn %xmm3, %xmm5 +; SSE3-NEXT: por %xmm5, %xmm1 ; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: por %xmm4, %xmm1 ; SSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -2261,12 +2261,13 @@ ; ; AVX1-LABEL: splat_v3i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpinsrd $2, 8(%rdi), 
%xmm0, %xmm1 -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: splat_v3i32: @@ -2288,12 +2289,13 @@ ; ; XOP-LABEL: splat_v3i32: ; XOP: # %bb.0: -; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; XOP-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1 -; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7] -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; XOP-NEXT: movq (%rdi), %rax +; XOP-NEXT: vmovq %rax, %xmm0 +; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; XOP-NEXT: vmovd %eax, %xmm2 +; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] ; XOP-NEXT: retq %1 = load <3 x i32>, <3 x i32>* %ptr, align 1 %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -568,12 +568,13 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] -; 
SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: simplify_select: