Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1996,6 +1996,35 @@
       return replaceInstUsesWith(*II, V);
     break;
 
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_sse41_pmuldq:
+  case Intrinsic::x86_avx2_pmul_dq:
+  case Intrinsic::x86_avx2_pmulu_dq: {
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth = II->getType()->getVectorNumElements();
+    unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
+    assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
+
+    APInt UndefElts(InnerVWidth, 0);
+    APInt DemandedElts(InnerVWidth, 0);
+    for (unsigned i = 0; i != VWidth; ++i)
+      DemandedElts.setBit(i * 2);
+
+    bool MadeChange = false;
+    if (Value *V = SimplifyDemandedVectorElts(Op0, DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      MadeChange = true;
+    }
+    if (Value *V = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts)) {
+      II->setArgOperand(1, V);
+      MadeChange = true;
+    }
+    if (MadeChange)
+      return II;
+    break;
+  }
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1431,6 +1431,36 @@
       break;
 
+    case Intrinsic::x86_sse2_pmulu_dq:
+    case Intrinsic::x86_sse41_pmuldq:
+    case Intrinsic::x86_avx2_pmul_dq:
+    case Intrinsic::x86_avx2_pmulu_dq: {
+      Value *Op0 = II->getArgOperand(0);
+      Value *Op1 = II->getArgOperand(1);
+      unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
+      assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
+
+      APInt InnerDemandedElts(InnerVWidth, 0);
+      for (unsigned i = 0; i != VWidth; ++i)
+        if (DemandedElts[i])
+          InnerDemandedElts.setBit(i * 2);
+
+      UndefElts2 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
+                                        Depth + 1);
+      if (TmpV) {
+        II->setArgOperand(0, TmpV);
+        MadeChange = true;
+      }
+
+      UndefElts3 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
+                                        Depth + 1);
+      if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+      break;
+    }
+
     // SSE4A instructions leave the upper 64-bits of the 128-bit result
     // in an undefined state.
     case Intrinsic::x86_sse4a_extrq:
Index: test/Transforms/InstCombine/x86-muldq.ll
===================================================================
--- test/Transforms/InstCombine/x86-muldq.ll
+++ test/Transforms/InstCombine/x86-muldq.ll
@@ -7,11 +7,10 @@
 
 define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
@@ -22,10 +21,9 @@
 
 define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT: ret <4 x i64> [[TMP2]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
@@ -35,10 +33,9 @@
 
 define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i64> [[TMP2]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
@@ -48,11 +45,10 @@
 
 define <4 x i64> @test_demanded_elts_pmuluq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuluq_256(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT: ret <4 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> undef, <4 x i32>
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
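
Reviewer note (not part of the patch): PMULUDQ/PMULDQ only read the even-numbered 32-bit elements of their inputs (the low dword of each 64-bit lane), which is why the code above only sets DemandedElts bit i*2 for each output lane i. A minimal IR sketch of the kind of pattern this lets SimplifyDemandedVectorElts clean up is below; the function name @pmuludq_odd_lane_example and the inserted constant are made up for illustration and are not in the committed tests.

    ; Illustrative only.
    declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>)

    define <2 x i64> @pmuludq_odd_lane_example(<4 x i32> %a, <4 x i32> %b) {
      ; Lane 1 is never read by pmuludq (only lanes 0 and 2 are demanded),
      ; so instcombine can drop the insertelement and pass %a through directly.
      %a.mod = insertelement <4 x i32> %a, i32 42, i32 1
      %r = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a.mod, <4 x i32> %b)
      ret <2 x i64> %r
    }

The same reasoning drives the updated CHECK lines in x86-muldq.ll: once only the even input elements are demanded, the shuffle feeding the first operand is no longer needed and the original %a0 can be used directly.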