diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37897,6 +37897,48 @@
           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
     return DAG.getBitcast(Root.getValueType(), HOp);
 
+  // Try to refine our inputs given our knowledge of the target shuffle mask.
+  for (auto I : enumerate(Ops)) {
+    int OpIdx = I.index();
+    SDValue &Op = I.value();
+
+    // What range of shuffle mask element values results in picking from Op?
+    int lo = OpIdx * Mask.size();
+    int hi = lo + Mask.size();
+
+    // Which elements of Op do we demand, given the mask's granularity?
+    APInt OpDemandedElts(Mask.size(), 0);
+    for (int MaskElt : Mask) {
+      if (isInRange(MaskElt, lo, hi)) { // Picks from Op?
+        int OpEltIdx = MaskElt - lo;
+        OpDemandedElts.setBit(OpEltIdx);
+      }
+    }
+
+    // Is the shuffle result smaller than the root?
+    if (Op.getValueSizeInBits() < RootSizeInBits) {
+      // We padded the mask with undefs, but we now need to undo that.
+      unsigned NumExpectedVectorElts = Mask.size();
+      unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
+      unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
+      assert(!OpDemandedElts.extractBits(
+                 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
+             "Demanding the virtual undef widening padding?");
+      OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
+    }
+
+    // The Op itself may be of a different VT, so we need to scale the mask.
+    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
+    APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+
+    // Can this operand be simplified any further, given its demanded elements?
+    if (SDValue NewOp =
+            DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
+                Op, OpScaledDemandedElts, DAG))
+      Op = NewOp;
+  }
+  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
+
   // Widen any subvector shuffle inputs we've collected.
   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
         return Op.getValueSizeInBits() < RootSizeInBits;
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -312,6 +312,7 @@
 ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    pandn %xmm2, %xmm1
@@ -323,6 +324,7 @@
 ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm2
 ; SSE3-NEXT:    pandn %xmm2, %xmm1
@@ -365,6 +367,7 @@
 ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movl $255, %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    pandn %xmm3, %xmm2
@@ -380,6 +383,7 @@
 ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    movl $255, %eax
 ; SSE3-NEXT:    movd %eax, %xmm3
 ; SSE3-NEXT:    pandn %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2261,12 +2261,13 @@
 ;
 ; AVX1-LABEL: splat_v3i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX1-NEXT:    movq (%rdi), %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: splat_v3i32:
@@ -2288,12 +2289,13 @@
 ;
 ; XOP-LABEL: splat_v3i32:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; XOP-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; XOP-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; XOP-NEXT:    movq (%rdi), %rax
+; XOP-NEXT:    vmovq %rax, %xmm0
+; XOP-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; XOP-NEXT:    vmovd %eax, %xmm2
+; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; XOP-NEXT:    retq
   %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
   %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -568,12 +568,13 @@
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    movd %edi, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
 ; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: simplify_select:
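
As a standalone illustration of the per-operand demanded-elements bookkeeping introduced by the X86ISelLowering.cpp hunk above, the sketch below models the same computation outside of LLVM. It is a minimal sketch under assumptions: plain uint64_t bitmasks stand in for APInt, scaleBitMask() is a hypothetical simplified stand-in for APIntOps::ScaleBitMask, and the padding-truncation step for operands narrower than the root is omitted; none of these names are LLVM API.

#include <cstdint>
#include <cstdio>
#include <vector>

// Rescale a demanded-elements bitmask from OldN to NewN vector elements
// (simplified stand-in for APIntOps::ScaleBitMask): when widening, each old
// bit expands into NewN/OldN bits; when narrowing, groups of bits OR together.
static uint64_t scaleBitMask(uint64_t Mask, unsigned OldN, unsigned NewN) {
  if (OldN == NewN)
    return Mask;
  uint64_t Out = 0;
  if (NewN > OldN) {
    unsigned Scale = NewN / OldN;
    for (unsigned I = 0; I != OldN; ++I)
      if (Mask & (1ull << I))
        Out |= ((1ull << Scale) - 1) << (I * Scale);
  } else {
    unsigned Scale = OldN / NewN;
    for (unsigned I = 0; I != NewN; ++I)
      if (Mask & (((1ull << Scale) - 1) << (I * Scale)))
        Out |= 1ull << I;
  }
  return Out;
}

int main() {
  // A combined 4-element shuffle over two operands: mask values 0..3 pick
  // from Op0, 4..7 pick from Op1 (a negative value would mean undef/zero).
  std::vector<int> Mask = {0, 4, 0, 5};
  unsigned NumOps = 2, NumMaskElts = Mask.size();

  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
    // What range of shuffle mask element values results in picking from
    // this operand?
    int Lo = OpIdx * NumMaskElts, Hi = Lo + NumMaskElts;

    // Collect the demanded elements at the mask's granularity.
    uint64_t Demanded = 0;
    for (int M : Mask)
      if (M >= Lo && M < Hi)
        Demanded |= 1ull << (M - Lo);

    // If the operand's own VT had 8 (half-width) elements instead of 4, the
    // mask would have to be rescaled to that granularity before asking the
    // demanded-elements simplification for a narrower replacement operand.
    uint64_t Scaled = scaleBitMask(Demanded, NumMaskElts, 8);
    std::printf("Op%u demanded: %#llx, scaled to 8 elts: %#llx\n", OpIdx,
                (unsigned long long)Demanded, (unsigned long long)Scaled);
  }
  return 0;
}

For the example mask {0, 4, 0, 5}, Op0's demanded mask comes out as 0x1 and Op1's as 0x3; rescaled to a hypothetical 8-element operand they widen to 0x3 and 0xf. This rescaling is why the patch calls APIntOps::ScaleBitMask before handing each operand to SimplifyMultipleUseDemandedVectorElts: the operand's value type may have a different element count than the combined mask's granularity.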