Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -38489,6 +38489,33 @@
   return SDValue();
 }

+// Simplify PMULDQ and PMULUDQ operations.
+static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                        !DCI.isBeforeLegalizeOps());
+  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
+
+  // PMULDQ/PMULUDQ only uses the lower 32 bits of each vector element.
+  KnownBits LHSKnown;
+  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
+    DCI.CommitTargetLoweringOpt(TLO);
+    return SDValue(N, 0);
+  }
+
+  KnownBits RHSKnown;
+  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
+    DCI.CommitTargetLoweringOpt(TLO);
+    return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -38610,6 +38637,8 @@
   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI, Subtarget);
   case X86ISD::PCMPEQ:
   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI);
   }

   return SDValue();
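
Reviewer note, not part of the patch to apply: the combine is sound because PMULDQ/PMULUDQ read only the low doubleword of each quadword lane. The standalone C++ sketch below models one lane's semantics (pmuludq_lane/pmuldq_lane are illustrative names, not LLVM APIs); it shows that the upper 32 bits of either input can never reach the product, which is exactly what the APInt::getLowBitsSet(64, 32) demanded mask asserts.

  #include <cassert>
  #include <cstdint>

  // One 64-bit lane of PMULUDQ: unsigned multiply of the low 32 bits only.
  uint64_t pmuludq_lane(uint64_t a, uint64_t b) {
    return uint64_t(uint32_t(a)) * uint64_t(uint32_t(b));
  }

  // One 64-bit lane of PMULDQ: signed multiply of the low 32 bits only.
  int64_t pmuldq_lane(uint64_t a, uint64_t b) {
    return int64_t(int32_t(uint32_t(a))) * int64_t(int32_t(uint32_t(b)));
  }

  int main() {
    // Garbage in the upper halves of the inputs never reaches the product,
    // which is why combinePMULDQ above can demand only the low 32 bits of
    // LHS and RHS and let SimplifyDemandedBits rewrite whatever feeds the
    // rest.
    assert(pmuludq_lane(0xDEADBEEF00000005ULL, 3) == pmuludq_lane(5, 3));
    assert(pmuldq_lane(0x1234ABCDFFFFFFFBULL, 3) == -15); // low 32 bits = -5
    return 0;
  }
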
Index: test/CodeGen/X86/combine-pmuldq.ll
===================================================================
--- test/CodeGen/X86/combine-pmuldq.ll
+++ test/CodeGen/X86/combine-pmuldq.ll
@@ -9,19 +9,11 @@
 define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_shuffle_sext_pmuldq:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pmovsxdq %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
-; SSE-NEXT:    pmuldq %xmm2, %xmm0
+; SSE-NEXT:    pmuldq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_shuffle_sext_pmuldq:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT:    vpmovsxdq %xmm1, %xmm1
 ; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -36,19 +28,11 @@
 define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_shuffle_zext_pmuludq:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_shuffle_zext_pmuludq:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -472,7 +472,7 @@
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
@@ -503,7 +503,7 @@
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -515,7 +515,7 @@
 ; SKX_LARGE-LABEL: test9:
 ; SKX_LARGE:       # %bb.0: # %entry
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
@@ -558,7 +558,7 @@
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
@@ -589,7 +589,7 @@
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -601,7 +601,7 @@
 ; SKX_LARGE-LABEL: test10:
 ; SKX_LARGE:       # %bb.0: # %entry
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
Index: test/CodeGen/X86/xop-ifma.ll
===================================================================
--- test/CodeGen/X86/xop-ifma.ll
+++ test/CodeGen/X86/xop-ifma.ll
@@ -81,8 +81,8 @@
 ;
 ; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:
 ; XOP-AVX2:       # %bb.0:
-; XOP-AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; XOP-AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; XOP-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; XOP-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; XOP-AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; XOP-AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; XOP-AVX2-NEXT:    retq
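
A related reviewer note on the test deltas above: several CHECK lines flip from vpmovsxdq to vpmovzxdq. That is expected with this combine: once only the low 32 bits of each 64-bit lane are demanded, sign extension and zero extension agree on every demanded bit, so SimplifyDemandedBits may relax the sign extend and the zero-extending form ends up being emitted. A standalone check of that equivalence (illustration only, not part of the patch):

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    for (int64_t v : {0LL, 1LL, -1LL, 2147483647LL, -2147483648LL}) {
      int32_t x = int32_t(v);
      uint64_t sext = uint64_t(int64_t(x));   // what vpmovsxdq produces
      uint64_t zext = uint64_t(uint32_t(x));  // what vpmovzxdq produces
      // The upper halves differ for negative x, but PMULDQ/PMULUDQ never
      // read them: the demanded low 32 bits are identical.
      assert((sext & 0xFFFFFFFFULL) == (zext & 0xFFFFFFFFULL));
    }
    return 0;
  }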