diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1041,6 +1041,7 @@
 
     setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
 
     setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
     setOperationAction(ISD::FABS, MVT::v2f64, Custom);
@@ -1255,6 +1256,7 @@
 
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+    setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
 
     // We directly match byte blends in the backend as they match the VSELECT
     // condition form.
@@ -32379,6 +32381,43 @@
     Results.push_back(Res);
     return;
   }
+  case ISD::SMULO:
+  case ISD::UMULO: {
+    EVT VT = N->getValueType(0);
+    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+           VT == MVT::v2i32 && "Unexpected VT!");
+    bool IsSigned = N->getOpcode() == ISD::SMULO;
+    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
+    SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
+    SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
+    // Extract the high 32 bits from each result using PSHUFD.
+    // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
+    SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
+    Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
+                     DAG.getIntPtrConstant(0, dl));
+
+    // Truncate the low bits of the result. This will become PSHUFD.
+    Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+
+    SDValue HiCmp;
+    if (IsSigned) {
+      // SMULO overflows if the high bits don't match the sign of the low.
+      HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
+    } else {
+      // UMULO overflows if the high bits are non-zero.
+      HiCmp = DAG.getConstant(0, dl, VT);
+    }
+    SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
+
+    // Widen the result by padding with undef.
+    Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
+                      DAG.getUNDEF(VT));
+    Results.push_back(Res);
+    Results.push_back(Ovf);
+    return;
+  }
   case X86ISD::VPMADDWD: {
     // Legalize types for X86ISD::VPMADDWD by widening.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
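The custom widening above implements, per 32-bit lane, the usual multiply-with-overflow check on a 64-bit product: the truncated low half is the result, and overflow is signalled when the high half is not the sign-extension of the low half (SMULO) or not zero (UMULO). The scalar sketch below only illustrates that logic; the helper names (smulo32, umulo32) are illustrative and not defined by this patch or by LLVM, and the signed case assumes an arithmetic right shift, mirroring the ISD::SRA node used above.

#include <cassert>
#include <cstdint>

// Signed 32-bit multiply-with-overflow: widen, multiply, split hi/lo.
static bool smulo32(int32_t A, int32_t B, int32_t &Lo) {
  int64_t Prod = (int64_t)A * (int64_t)B;    // SIGN_EXTEND + MUL to 64 bits.
  Lo = (int32_t)Prod;                        // TRUNCATE: low 32 bits are the result.
  int32_t Hi = (int32_t)(Prod >> 32);        // High 32 bits of the product.
  return Hi != (Lo >> 31);                   // Overflow if hi != sign of lo (SRA by 31).
}

// Unsigned 32-bit multiply-with-overflow: any nonzero high bit means overflow.
static bool umulo32(uint32_t A, uint32_t B, uint32_t &Lo) {
  uint64_t Prod = (uint64_t)A * (uint64_t)B; // ZERO_EXTEND + MUL to 64 bits.
  Lo = (uint32_t)Prod;                       // Low 32 bits are the result.
  return (uint32_t)(Prod >> 32) != 0;        // Overflow if the high half is non-zero.
}

int main() {
  int32_t SLo;
  uint32_t ULo;
  assert(!smulo32(3, -7, SLo) && SLo == -21); // Fits in i32: no overflow.
  assert(smulo32(0x40000000, 4, SLo));        // 2^32 overflows signed i32.
  assert(!umulo32(65535u, 65537u, ULo));      // 0xFFFFFFFF still fits in u32.
  assert(umulo32(65536u, 65536u, ULo));       // 2^32 overflows u32.
  return 0;
}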
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -103,68 +103,44 @@
 ;
 ; SSE41-LABEL: smulo_v2i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pmuldq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pmuldq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE41-NEXT:    movq %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: smulo_v2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm1, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: smulo_v2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm1, (%rdi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: smulo_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpsrad $31, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm2, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: smulo_v2i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vpmuldq %xmm3, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
-; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
-; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT:    vpcmpneqd %xmm0, %xmm4, %k1
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpsrad $31, %xmm2, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i32>, <2 x i1>} @llvm.smul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
   %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -50,104 +50,70 @@
 define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: umulo_v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT:    movq %xmm0, (%rdi)
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    movq %xmm1, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: umulo_v2i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT:    movq %xmm0, (%rdi)
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: umulo_v2i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm3, %xmm2
-; SSE41-NEXT:    pmulld %xmm1, %xmm0
-; SSE41-NEXT:    movq %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE41-NEXT:    movq %xmm1, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: umulo_v2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, (%rdi)
-; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: umulo_v2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: umulo_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT:    vmovq %xmm1, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: umulo_v2i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
-; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
-; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
-; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)