Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2916,11 +2916,22 @@
   /// elements, returning true on success. Otherwise, analyze the expression and
   /// return a mask of KnownUndef and KnownZero elements for the expression
   /// (used to simplify the caller). The KnownUndef/Zero elements may only be
-  /// accurate for those bits in the DemandedMask
+  /// accurate for those bits in the DemandedMask.
   virtual bool SimplifyDemandedVectorEltsForTargetNode(
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// Attempt to simplify any target nodes based on the demanded bits,
+  /// returning true on success. Otherwise, analyze the expression and
+  /// return a mask of KnownOne and KnownZero bits for the expression
+  /// (used to simplify the caller). The KnownZero/One bits may only be
+  /// accurate for those bits in the DemandedBits mask.
+  virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                                 const APInt &DemandedBits,
+                                                 KnownBits &Known,
+                                                 TargetLoweringOpt &TLO,
+                                                 unsigned Depth = 0) const;
+
   /// If \p SNaN is false, \returns true if \p Op is known to never be any
   /// NaN. If \p sNaN is true, returns if \p Op is known to never be a signaling
   /// NaN.
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1335,6 +1335,13 @@
     LLVM_FALLTHROUGH;
   }
   default:
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+      if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, Known, TLO,
+                                            Depth))
+        return true;
+      break;
+    }
+
     // Just use computeKnownBits to compute output bits.
     TLO.DAG.computeKnownBits(Op, Known, Depth);
     break;
@@ -1803,6 +1810,23 @@
   return false;
 }
 
+bool TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &DemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use SimplifyDemandedBits if you don't know whether Op"
+         " is a target node!");
+  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+  return false;
+}
+
 bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
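For a target adopting the new hook, a minimal override might look like the sketch below. This is illustrative only: MyTargetLowering and MYISD::MUL_LO16 are hypothetical names, not part of this patch (the concrete example is the X86 override further down). A node that only reads the low bits of its operands narrows the demanded mask, recurses via SimplifyDemandedBits, and otherwise defers to the default implementation:

    bool MyTargetLowering::SimplifyDemandedBitsForTargetNode(
        SDValue Op, const APInt &DemandedBits, KnownBits &Known,
        TargetLoweringOpt &TLO, unsigned Depth) const {
      switch (Op.getOpcode()) {
      case MYISD::MUL_LO16: {
        // Hypothetical node where each result bit depends only on the low 16
        // bits of each operand: narrow the demanded mask before recursing, so
        // whatever feeds the high bits can be simplified away.
        KnownBits KnownOp;
        APInt DemandedSrcBits = DemandedBits & APInt::getLowBitsSet(32, 16);
        if (SimplifyDemandedBits(Op.getOperand(0), DemandedSrcBits, KnownOp,
                                 TLO, Depth + 1))
          return true;
        if (SimplifyDemandedBits(Op.getOperand(1), DemandedSrcBits, KnownOp,
                                 TLO, Depth + 1))
          return true;
        break;
      }
      }
      // The default implementation performs no simplification; it only
      // reports known bits via computeKnownBitsForTargetNode.
      return TargetLowering::SimplifyDemandedBitsForTargetNode(
          Op, DemandedBits, Known, TLO, Depth);
    }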
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -874,6 +874,12 @@
                                               TargetLoweringOpt &TLO,
                                               unsigned Depth) const override;
 
+    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &DemandedBits,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31870,6 +31870,30 @@
   return false;
 }
 
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    // PMULDQ/PMULUDQ only uses the lower 32 bits from each vector element.
+    KnownBits KnownOp;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
+    if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, Known, TLO, Depth);
+}
+
 /// Check if a vector extract from a target-specific shuffle of a load can be
 /// folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -40362,13 +40386,9 @@
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;
 
+  // PMULDQ/PMULUDQ only uses the lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
-
-  // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI))
-    return SDValue(N, 0);
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI))
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
   return SDValue();
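The demanded mask in both the override above and the combine it replaces follows from the (V)PMULDQ/(V)PMULUDQ semantics: each 64-bit result lane is the product of only the low 32 bits of the corresponding source lanes. A scalar model of one lane, in plain C++ for illustration only:

    #include <cstdint>

    // One 64-bit lane of PMULUDQ: unsigned multiply of the zero-extended low
    // halves. Bits 32-63 of A and B never influence the result.
    uint64_t pmuludq_lane(uint64_t A, uint64_t B) {
      return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
    }

    // One 64-bit lane of PMULDQ (SSE4.1): signed multiply of the
    // sign-extended low halves. Again only bits 0-31 of each lane are read.
    int64_t pmuldq_lane(int64_t A, int64_t B) {
      return int64_t(int32_t(A)) * int64_t(int32_t(B));
    }

This is why combinePMULDQ above can now call SimplifyDemandedBits on the PMULDQ/PMULUDQ node itself with an all-ones mask: the recursion reaches SimplifyDemandedBitsForTargetNode, which narrows the demanded bits to the low 32 before visiting each operand, making explicit zeroing of the odd 32-bit elements (the vpxor/vpblendd pairs in the tests below) redundant.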
Index: test/CodeGen/X86/combine-pmuldq.ll
===================================================================
--- test/CodeGen/X86/combine-pmuldq.ll
+++ test/CodeGen/X86/combine-pmuldq.ll
@@ -47,26 +47,10 @@
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: combine_shuffle_zero_pmuludq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    retq
+; AVX-LABEL: combine_shuffle_zero_pmuludq:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -84,22 +68,16 @@
 ;
 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
===================================================================
--- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -143,31 +143,31 @@
 define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_even_div:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,3435973837,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [6,10,12,14]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm5
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -377,30 +377,30 @@
 define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,0,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,1,12,14]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [6,1,12,14]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]