diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2411,18 +2411,21 @@
   // Helper for demanding the specified elements and all the bits of both binary
   // operands.
-  auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
-    SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedElts,
-                                                           TLO.DAG, Depth + 1);
-    SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
-                                                           TLO.DAG, Depth + 1);
-    if (NewOp0 || NewOp1) {
-      SDValue NewOp = TLO.DAG.getNode(
-          Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
-      return TLO.CombineTo(Op, NewOp);
-    }
-    return false;
-  };
+  auto SimplifyDemandedVectorEltsBinOp =
+      [&](SDValue Op0, SDValue Op1, const APInt *Op0Demanded = nullptr,
+          const APInt *Op1Demanded = nullptr) {
+        SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(
+            Op0, Op0Demanded ? *Op0Demanded : DemandedElts, TLO.DAG, Depth + 1);
+        SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(
+            Op1, Op1Demanded ? *Op1Demanded : DemandedElts, TLO.DAG, Depth + 1);
+        if (NewOp0 || NewOp1) {
+          SDValue NewOp =
+              TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
+                              NewOp1 ? NewOp1 : Op1);
+          return TLO.CombineTo(Op, NewOp);
+        }
+        return false;
+      };
 
   switch (Opcode) {
   case ISD::SCALAR_TO_VECTOR: {
@@ -2814,7 +2817,61 @@
 
   // TODO: There are more binop opcodes that could be handled here - MIN,
   // MAX, saturated math, etc.
-  case ISD::OR:
+  case ISD::OR: {
+    // Summarize each demanded element of V as known all-zeros or known
+    // all-ones, with one bit of the result per element.
+    auto computeKnownBitsElementWise =
+        [&TLO](SDValue V, const APInt &DemandedElts, unsigned Depth) {
+          unsigned NumElts = V.getValueType().getVectorNumElements();
+          KnownBits Known(NumElts);
+          for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+            if (!DemandedElts[EltIdx])
+              continue;
+            APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+            KnownBits PeepholeKnown = TLO.DAG.computeKnownBits(V, Mask, Depth);
+            if (PeepholeKnown.isZero())
+              Known.Zero.setBit(EltIdx);
+            if (PeepholeKnown.isAllOnes())
+              Known.One.setBit(EltIdx);
+          }
+          return Known;
+        };
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    KnownBits Op0Known =
+        computeKnownBitsElementWise(Op0, DemandedElts, Depth + 1);
+    KnownBits Op1Known =
+        computeKnownBitsElementWise(Op1, DemandedElts, Depth + 1);
+
+    // An element known all-ones in one operand pins that element of the OR
+    // result, so it need not be demanded from the other operand. Keep
+    // demanding both-ones elements from Op1 so the multi-use combine below
+    // cannot drop such an element from both operands at once.
+    APInt Op0DemandedElts = DemandedElts & ~Op1Known.One;
+    APInt Op1DemandedElts = DemandedElts & ~(Op0Known.One & ~Op1Known.One);
+
+    APInt UndefRHS, ZeroRHS;
+    if (SimplifyDemandedVectorElts(Op1, Op1DemandedElts, UndefRHS, ZeroRHS, TLO,
+                                   Depth + 1))
+      return true;
+    APInt UndefLHS, ZeroLHS;
+    if (SimplifyDemandedVectorElts(Op0, Op0DemandedElts, UndefLHS, ZeroLHS, TLO,
+                                   Depth + 1))
+      return true;
+
+    KnownZero = Op0Known.Zero & Op1Known.Zero;
+    KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    // TODO - use KnownUndef to relax the demandedelts?
+    if (!DemandedElts.isAllOnes())
+      if (SimplifyDemandedVectorEltsBinOp(Op0, Op1, &Op0DemandedElts,
+                                          &Op1DemandedElts))
+        return true;
+    break;
+  }
   case ISD::XOR:
   case ISD::ADD:
   case ISD::SUB:
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -419,7 +419,6 @@
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
 ; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
 ; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -30,12 +30,9 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    psllq $56, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X86-NEXT:    movdqa %xmm2, %xmm1
-; X86-NEXT:    pandn %xmm0, %xmm1
-; X86-NEXT:    por %xmm2, %xmm1
+; X86-NEXT:    movd %eax, %xmm1
+; X86-NEXT:    psllq $56, %xmm1
+; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    psllw $5, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
@@ -65,12 +62,9 @@
 ; X64-LABEL: do_not_crash:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movb %r9b, (%rdi)
-; X64-NEXT:    movd %r9d, %xmm0
-; X64-NEXT:    psllq $56, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X64-NEXT:    movdqa %xmm2, %xmm1
-; X64-NEXT:    pandn %xmm0, %xmm1
-; X64-NEXT:    por %xmm2, %xmm1
+; X64-NEXT:    movd %r9d, %xmm1
+; X64-NEXT:    psllq $56, %xmm1
+; X64-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    pcmpeqd %xmm2, %xmm2
 ; X64-NEXT:    psllw $5, %xmm1
 ; X64-NEXT:    pxor %xmm3, %xmm3
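
Note (editor's illustration, not part of the patch): the standalone C++ sketch below models the demanded-elements relaxation that the new ISD::OR case performs. A 64-bit mask stands in for APInt, a three-state per-element summary stands in for the per-element KnownBits computation, and all names in it (KnownElts, EltKnown, and so on) are invented for the example.

#include <cassert>
#include <cstdint>

// Three-state summary of one vector element, standing in for the
// per-element KnownBits summary the patch computes.
enum class EltKnown { Unknown, AllZeros, AllOnes };

// Per-element knowledge for a 4-element vector operand.
struct KnownElts {
  EltKnown Elt[4];

  // Bitmask of elements known to be all-ones, like Op0Known.One above.
  uint64_t ones() const {
    uint64_t M = 0;
    for (int I = 0; I != 4; ++I)
      if (Elt[I] == EltKnown::AllOnes)
        M |= uint64_t(1) << I;
    return M;
  }
};

int main() {
  uint64_t DemandedElts = 0b1111; // the user wants every element of the OR

  // Element 1 is all-ones in Op0; elements 0 and 3 are all-ones in Op1;
  // element 3 is all-ones in *both* operands.
  KnownElts Op0 = {{EltKnown::Unknown, EltKnown::AllOnes, EltKnown::Unknown,
                    EltKnown::AllOnes}};
  KnownElts Op1 = {{EltKnown::AllOnes, EltKnown::Unknown, EltKnown::Unknown,
                    EltKnown::AllOnes}};

  // Mirrors the patch: an all-ones element in one operand pins that element
  // of the OR result, so the other operand need not be demanded there.
  // Both-ones elements stay demanded from Op1 so that simplifying both
  // operands at the same time cannot drop such an element from both sides.
  uint64_t Op0Demanded = DemandedElts & ~Op1.ones();
  uint64_t Op1Demanded = DemandedElts & ~(Op0.ones() & ~Op1.ones());

  assert(Op0Demanded == 0b0110); // elts 0 and 3 pinned by Op1
  assert(Op1Demanded == 0b1101); // elt 1 pinned by Op0; elt 3 kept
  assert((Op0Demanded | Op1Demanded) == DemandedElts);
  return 0;
}

The final assert is the invariant the asymmetric second mask preserves: every demanded element of the OR remains demanded from at least one operand, so no element can be simplified away on both sides at once.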