diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37545,6 +37545,35 @@
 }
 } // namespace llvm
 
+// FIXME: this is extracted from TargetLowering::SimplifyDemandedVectorElts().
+// Move this into a place that will allow reuse and use it there.
+static void ScaleDemandedEltsMask(const APInt &DemandedElts,
+                                  APInt &SrcDemandedElts) {
+  unsigned NumElts = DemandedElts.getBitWidth();
+  unsigned NumSrcElts = SrcDemandedElts.getBitWidth();
+
+  // Bitcast from 'large element' src vector to 'small element' vector, we
+  // must demand a source element if any DemandedElt maps to it.
+  if ((NumElts % NumSrcElts) == 0) {
+    unsigned Scale = NumElts / NumSrcElts;
+    for (unsigned i = 0; i != NumElts; ++i)
+      if (DemandedElts[i])
+        SrcDemandedElts.setBit(i / Scale);
+    return;
+  }
+
+  // Bitcast from 'small element' src vector to 'large element' vector, we
+  // demand all smaller source elements covered by the larger demanded element
+  // of this vector.
+  if ((NumSrcElts % NumElts) == 0) {
+    unsigned Scale = NumSrcElts / NumElts;
+    for (unsigned i = 0; i != NumElts; ++i)
+      if (DemandedElts[i])
+        SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);
+    return;
+  }
+}
+
 /// Fully generic combining of x86 shuffle instructions.
 ///
 /// This should be the last combine run over the x86 shuffle instructions. Once
@@ -37863,6 +37892,38 @@
     resolveTargetShuffleInputsAndMask(Ops, Mask);
   }
 
+  // Try to refine our inputs given our knowledge of the target shuffle mask.
+  // FIXME: do this before widenSubVector() to catch more interesting cases.
+  for (auto I : enumerate(Ops)) {
+    int OpIdx = I.index();
+    SDValue &Op = I.value();
+
+    // What range of shuffle mask element values results in picking from Op?
+    int lo = OpIdx * Mask.size();
+    int hi = lo + Mask.size();
+
+    // Which elements of Op do we demand, given the mask's granularity?
+    APInt OpDemandedElts(Mask.size(), 0);
+    for (int MaskElt : Mask) {
+      if (isInRange(MaskElt, lo, hi)) { // Picks from Op?
+        int OpEltIdx = MaskElt - lo;
+        OpDemandedElts.setBit(OpEltIdx);
+      }
+    }
+
+    // The Op itself may be of a different VT, so we need to scale the mask.
+    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
+    APInt OpScaledDemandedElts(NumOpElts, 0);
+    ScaleDemandedEltsMask(OpDemandedElts, OpScaledDemandedElts);
+
+    // Can this operand be simplified any further, given its demanded elements?
+    if (SDValue NewOp =
+            DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
+                Op, OpScaledDemandedElts, DAG))
+      Op = NewOp;
+  }
+  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
+
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() <= 2) {
     // Minor canonicalization of the accumulated shuffle mask to make it easier
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -382,9 +382,11 @@
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm4, %xmm1
 ; SSE2-NEXT: retq
@@ -402,9 +404,11 @@
 ; SSE3-NEXT: movdqa %xmm3, %xmm4
 ; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE3-NEXT: por %xmm4, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
 ; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: por %xmm3, %xmm1
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
 ; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: por %xmm4, %xmm1
 ; SSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2261,12 +2261,13 @@
 ;
 ; AVX1-LABEL: splat_v3i32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: splat_v3i32:
@@ -2288,12 +2289,13 @@
 ;
 ; XOP-LABEL: splat_v3i32:
 ; XOP: # %bb.0:
-; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; XOP-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm0
+; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; XOP-NEXT: vmovd %eax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; XOP-NEXT: retq
 %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
 %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -568,12 +568,13 @@
 ; SSE2-NEXT: psrad $31, %xmm0
 ; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: simplify_select:
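
Note (not part of the patch): the snippet below is a minimal standalone sketch of the demanded-elements scaling rule that the new ScaleDemandedEltsMask() helper applies before handing the per-operand mask to SimplifyMultipleUseDemandedVectorElts(). It uses plain uint64_t bitmasks instead of llvm::APInt so it compiles on its own; the function name, lane counts, and example masks are illustrative only.

// Standalone illustration of the widening/narrowing rule in
// ScaleDemandedEltsMask(), on plain 64-bit masks instead of APInt.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Scale a demanded-elements mask with NumElts lanes onto a source vector with
// NumSrcElts lanes, where one count evenly divides the other (as across a
// bitcast between vectors of the same total bit width).
static uint64_t scaleDemandedElts(uint64_t DemandedElts, unsigned NumElts,
                                  unsigned NumSrcElts) {
  uint64_t SrcDemandedElts = 0;
  if (NumElts % NumSrcElts == 0) {
    // Narrow elements over wide source elements: a source element is demanded
    // if any of the narrow elements it covers is demanded.
    unsigned Scale = NumElts / NumSrcElts;
    for (unsigned i = 0; i != NumElts; ++i)
      if (DemandedElts & (1ULL << i))
        SrcDemandedElts |= 1ULL << (i / Scale);
  } else {
    assert(NumSrcElts % NumElts == 0 && "element counts must divide evenly");
    // Wide elements over narrow source elements: demanding one wide element
    // demands every narrow source element it covers.
    unsigned Scale = NumSrcElts / NumElts;
    for (unsigned i = 0; i != NumElts; ++i)
      if (DemandedElts & (1ULL << i))
        for (unsigned j = 0; j != Scale; ++j)
          SrcDemandedElts |= 1ULL << (i * Scale + j);
  }
  return SrcDemandedElts;
}

int main() {
  // A shuffle that only reads lanes 0 and 5 of an 8-lane operand fed by a
  // 4-lane (wider-element) source: lanes 0 and 5 map to source elements 0 and 2.
  printf("%#llx\n", (unsigned long long)scaleDemandedElts(0b00100001, 8, 4)); // 0x5
  // The other direction: demanding element 2 of a 4-lane vector built from an
  // 8-lane (narrower-element) source demands source elements 4 and 5.
  printf("%#llx\n", (unsigned long long)scaleDemandedElts(0b0100, 4, 8)); // 0x30
  return 0;
}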