Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43566,6 +43566,34 @@
   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
+  // If the input is an extend_invec and the SimplifyDemandedBits call didn't
+  // convert it to any_extend_invec, due to the LegalOperations check, do the
+  // conversion directly to a vector shuffle manually. This exposes combine
+  // opportunities missed by combineExtInVec not calling
+  // combineX86ShufflesRecursively on SSE4.1 targets.
+  // FIXME: This is basically a hack around several other issues related to
+  // ANY_EXTEND_VECTOR_INREG.
+  if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
+      (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+       LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+      LHS.getOperand(0).getValueType() == MVT::v4i32) {
+    SDLoc dl(N);
+    LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
+                               LHS.getOperand(0), { 0, -1, 1, -1 });
+    LHS = DAG.getBitcast(MVT::v2i64, LHS);
+    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+  }
+  if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
+      (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+       RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+      RHS.getOperand(0).getValueType() == MVT::v4i32) {
+    SDLoc dl(N);
+    RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
+                               RHS.getOperand(0), { 0, -1, 1, -1 });
+    RHS = DAG.getBitcast(MVT::v2i64, RHS);
+    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+  }
+
   return SDValue();
 }
 
Index: llvm/test/CodeGen/X86/pmul.ll
===================================================================
--- llvm/test/CodeGen/X86/pmul.ll
+++ llvm/test/CodeGen/X86/pmul.ll
@@ -1131,14 +1131,13 @@
 ;
 ; SSE41-LABEL: mul_v4i64_zero_lower:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm3, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mul_v4i64_zero_lower:
Index: llvm/test/CodeGen/X86/xop-ifma.ll
===================================================================
--- llvm/test/CodeGen/X86/xop-ifma.ll
+++ llvm/test/CodeGen/X86/xop-ifma.ll
@@ -67,15 +67,13 @@
 define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) {
 ; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64:
 ; XOP-AVX1:       # %bb.0:
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
-; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm1, %xmm4
-; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; XOP-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; XOP-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; XOP-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; XOP-AVX1-NEXT:    vpmacsdql %xmm5, %xmm1, %xmm0, %xmm0
-; XOP-AVX1-NEXT:    vpmacsdql %xmm2, %xmm4, %xmm3, %xmm1
+; XOP-AVX1-NEXT:    vpmacsdql %xmm2, %xmm3, %xmm4, %xmm1
 ; XOP-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; XOP-AVX1-NEXT:    retq
 ;