Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1811,6 +1811,10 @@
   setTargetDAGCombine(ISD::XOR);
   setTargetDAGCombine(ISD::MSCATTER);
   setTargetDAGCombine(ISD::MGATHER);
+  setTargetDAGCombine(ISD::SDIV);
+  setTargetDAGCombine(ISD::UDIV);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -40929,6 +40933,55 @@
   return SDValue();
 }
 
+/// Try to widen a vector division/remainder by a splat constant so that it is
+/// not scalarized.
+///
+/// Only fires before legalization, when the result type of \p N is a vector
+/// whose bit width evenly divides 128 and whose type-legalization action is
+/// TypeWidenVector. The dividend is padded with undef subvectors up to the
+/// widened type, the splat divisor is rebuilt at that type, the same opcode
+/// is emitted on the wide operands, and the low subvector of the original
+/// type is extracted from the result.
+///
+/// \returns the replacement value, or an empty SDValue if nothing was done.
+// TODO: Can we do something for non-splat?
+static SDValue combineDivRem(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  LLVMContext &Context = *DAG.getContext();
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || 128 % VT.getSizeInBits() != 0 ||
+      TLI.getTypeAction(Context, VT) != TargetLowering::TypeWidenVector)
+    return SDValue();
+
+  // The concatenation below always produces 128 bits, so the widened type
+  // must be exactly that wide for the CONCAT_VECTORS node to be well formed.
+  EVT ResVT = TLI.getTypeToTransformTo(Context, VT);
+  if (!ResVT.is128BitVector())
+    return SDValue();
+
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal))
+    return SDValue();
+
+  // Pad the dividend with undef subvectors; only the first one holds data.
+  unsigned NumConcats = 128 / VT.getSizeInBits();
+  SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
+  Ops0[0] = N->getOperand(0);
+
+  SDLoc dl(N);
+  SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
+  // Rebuild the divisor at the widened type from the same splat value, so
+  // the padding lanes use the same constant as the live lanes.
+  SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -41059,6 +41112,10 @@
   case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
   case X86ISD::PMULDQ:
   case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
+  case 
ISD::UDIV: + case ISD::SDIV: + case ISD::UREM: + case ISD::SREM: return combineDivRem(N, DAG, DCI); } return SDValue(); Index: test/CodeGen/X86/vector-idiv-v2i32.ll =================================================================== --- test/CodeGen/X86/vector-idiv-v2i32.ll +++ test/CodeGen/X86/vector-idiv-v2i32.ll @@ -65,56 +65,40 @@ ; X64_WIDEN-LABEL: test_udiv7_v2i32: ; X64_WIDEN: # %bb.0: ; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: subl %ecx, %eax -; X64_WIDEN-NEXT: shrl %eax -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: shrl $2, %eax -; X64_WIDEN-NEXT: movd %eax, %xmm1 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: subl %ecx, %eax -; X64_WIDEN-NEXT: shrl %eax -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: shrl $2, %eax -; X64_WIDEN-NEXT: movd %eax, %xmm0 -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64_WIDEN-NEXT: movq %xmm1, (%rsi) +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64_WIDEN-NEXT: psubd %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $1, %xmm0 +; X64_WIDEN-NEXT: paddd %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $2, %xmm0 +; X64_WIDEN-NEXT: movq %xmm0, (%rsi) ; X64_WIDEN-NEXT: retq ; ; X86_WIDEN-LABEL: test_udiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %edi -; X86_WIDEN-NEXT: pushl 
%esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %edi -; X86_WIDEN-NEXT: subl %edx, %ecx -; X86_WIDEN-NEXT: shrl %ecx -; X86_WIDEN-NEXT: addl %edx, %ecx -; X86_WIDEN-NEXT: shrl $2, %ecx -; X86_WIDEN-NEXT: movd %ecx, %xmm1 -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %edi -; X86_WIDEN-NEXT: subl %edx, %ecx -; X86_WIDEN-NEXT: shrl %ecx -; X86_WIDEN-NEXT: addl %edx, %ecx -; X86_WIDEN-NEXT: shrl $2, %ecx -; X86_WIDEN-NEXT: movd %ecx, %xmm0 -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86_WIDEN-NEXT: movq %xmm1, (%esi) -; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86_WIDEN-NEXT: psubd %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $1, %xmm0 +; X86_WIDEN-NEXT: paddd %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $2, %xmm0 +; X86_WIDEN-NEXT: movq %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, @@ -201,72 +185,50 @@ ; X64_WIDEN-LABEL: test_urem7_v2i32: ; X64_WIDEN: # %bb.0: ; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: imulq $613566757, %rax, %rcx # 
imm = 0x24924925 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: movl %eax, %edx -; X64_WIDEN-NEXT: subl %ecx, %edx -; X64_WIDEN-NEXT: shrl %edx -; X64_WIDEN-NEXT: addl %ecx, %edx -; X64_WIDEN-NEXT: shrl $2, %edx -; X64_WIDEN-NEXT: leal (,%rdx,8), %ecx -; X64_WIDEN-NEXT: subl %ecx, %edx -; X64_WIDEN-NEXT: addl %eax, %edx -; X64_WIDEN-NEXT: movd %edx, %xmm1 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: movl %eax, %edx -; X64_WIDEN-NEXT: subl %ecx, %edx -; X64_WIDEN-NEXT: shrl %edx -; X64_WIDEN-NEXT: addl %ecx, %edx -; X64_WIDEN-NEXT: shrl $2, %edx -; X64_WIDEN-NEXT: leal (,%rdx,8), %ecx -; X64_WIDEN-NEXT: subl %ecx, %edx -; X64_WIDEN-NEXT: addl %eax, %edx -; X64_WIDEN-NEXT: movd %edx, %xmm0 -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm1 +; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $1, %xmm1 +; X64_WIDEN-NEXT: paddd %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $2, %xmm1 +; X64_WIDEN-NEXT: movdqa %xmm1, %xmm2 +; X64_WIDEN-NEXT: pslld $3, %xmm2 +; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm1 ; X64_WIDEN-NEXT: movq %xmm1, (%rsi) ; X64_WIDEN-NEXT: retq ; ; X86_WIDEN-LABEL: test_urem7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %edi -; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %edi -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: subl %edx, %eax -; X86_WIDEN-NEXT: shrl %eax -; X86_WIDEN-NEXT: addl %edx, %eax -; X86_WIDEN-NEXT: shrl $2, %eax -; X86_WIDEN-NEXT: leal (,%eax,8), %edx -; X86_WIDEN-NEXT: subl %edx, %eax -; X86_WIDEN-NEXT: addl %ecx, %eax -; X86_WIDEN-NEXT: movd %eax, %xmm1 -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %edi -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: subl %edx, %eax -; X86_WIDEN-NEXT: shrl %eax -; X86_WIDEN-NEXT: addl %edx, %eax -; X86_WIDEN-NEXT: shrl $2, %eax -; X86_WIDEN-NEXT: leal (,%eax,8), %edx -; X86_WIDEN-NEXT: subl %edx, %eax -; X86_WIDEN-NEXT: addl %ecx, %eax -; X86_WIDEN-NEXT: movd %eax, %xmm0 -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86_WIDEN-NEXT: movq %xmm1, (%esi) -; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm1 +; X86_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $1, %xmm1 +; X86_WIDEN-NEXT: paddd %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $2, %xmm1 +; X86_WIDEN-NEXT: movdqa %xmm1, %xmm2 +; X86_WIDEN-NEXT: pslld $3, %xmm2 +; X86_WIDEN-NEXT: psubd 
%xmm2, %xmm1 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm1 +; X86_WIDEN-NEXT: movq %xmm1, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, @@ -344,64 +306,52 @@ ; X64_WIDEN-LABEL: test_sdiv7_v2i32: ; X64_WIDEN: # %bb.0: ; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: cltq -; X64_WIDEN-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: movl %eax, %ecx -; X64_WIDEN-NEXT: shrl $31, %ecx -; X64_WIDEN-NEXT: sarl $2, %eax -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: movd %eax, %xmm1 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: cltq -; X64_WIDEN-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: movl %eax, %ecx -; X64_WIDEN-NEXT: shrl $31, %ecx -; X64_WIDEN-NEXT: sarl $2, %eax -; X64_WIDEN-NEXT: addl %ecx, %eax -; X64_WIDEN-NEXT: movd %eax, %xmm0 -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64_WIDEN-NEXT: movq %xmm1, (%rsi) +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X64_WIDEN-NEXT: pand %xmm1, %xmm3 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $31, %xmm0 +; X64_WIDEN-NEXT: psrad $2, 
%xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movq %xmm2, (%rsi) ; X64_WIDEN-NEXT: retq ; ; X86_WIDEN-LABEL: test_sdiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi -; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, %esi -; X86_WIDEN-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: imull %ebx -; X86_WIDEN-NEXT: addl %esi, %edx -; X86_WIDEN-NEXT: movl %edx, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edx -; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: movd %edx, %xmm0 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %ebx -; X86_WIDEN-NEXT: addl %ecx, %edx -; X86_WIDEN-NEXT: movl %edx, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edx -; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: movd %edx, %xmm1 -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86_WIDEN-NEXT: movq %xmm1, (%edi) -; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X86_WIDEN-NEXT: pand %xmm1, %xmm3 +; 
X86_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $31, %xmm0 +; X86_WIDEN-NEXT: psrad $2, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movq %xmm2, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, @@ -491,76 +441,60 @@ ; X64_WIDEN-LABEL: test_srem7_v2i32: ; X64_WIDEN: # %bb.0: ; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: cltq -; X64_WIDEN-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: addl %eax, %ecx -; X64_WIDEN-NEXT: movl %ecx, %edx -; X64_WIDEN-NEXT: shrl $31, %edx -; X64_WIDEN-NEXT: sarl $2, %ecx -; X64_WIDEN-NEXT: addl %edx, %ecx -; X64_WIDEN-NEXT: leal (,%rcx,8), %edx -; X64_WIDEN-NEXT: subl %edx, %ecx -; X64_WIDEN-NEXT: addl %eax, %ecx -; X64_WIDEN-NEXT: movd %ecx, %xmm1 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64_WIDEN-NEXT: movd %xmm0, %eax -; X64_WIDEN-NEXT: cltq -; X64_WIDEN-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; X64_WIDEN-NEXT: shrq $32, %rcx -; X64_WIDEN-NEXT: addl %eax, %ecx -; X64_WIDEN-NEXT: movl %ecx, %edx -; X64_WIDEN-NEXT: shrl $31, %edx -; X64_WIDEN-NEXT: sarl $2, %ecx -; X64_WIDEN-NEXT: addl %edx, %ecx -; X64_WIDEN-NEXT: leal (,%rcx,8), %edx -; X64_WIDEN-NEXT: subl %edx, %ecx -; X64_WIDEN-NEXT: addl %eax, %ecx -; X64_WIDEN-NEXT: movd %ecx, %xmm0 -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64_WIDEN-NEXT: movq %xmm1, (%rsi) +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X64_WIDEN-NEXT: pand %xmm1, %xmm3 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $31, %xmm1 +; X64_WIDEN-NEXT: psrad $2, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm1, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X64_WIDEN-NEXT: pslld $3, %xmm1 +; X64_WIDEN-NEXT: psubd %xmm1, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movq %xmm2, (%rsi) ; X64_WIDEN-NEXT: retq ; ; X86_WIDEN-LABEL: test_srem7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi -; X86_WIDEN-NEXT: pushl %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movd %xmm0, %ecx -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, %esi -; X86_WIDEN-NEXT: movl $-1840700269, %edi # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: imull %edi -; X86_WIDEN-NEXT: addl %esi, %edx -; X86_WIDEN-NEXT: movl %edx, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edx -; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: leal (,%edx,8), %eax -; X86_WIDEN-NEXT: subl %eax, %edx -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86_WIDEN-NEXT: addl %esi, %edx -; X86_WIDEN-NEXT: movd %edx, %xmm0 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %edi -; X86_WIDEN-NEXT: addl %ecx, %edx -; X86_WIDEN-NEXT: movl %edx, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edx -; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: leal (,%edx,8), %eax -; X86_WIDEN-NEXT: subl %eax, %edx -; X86_WIDEN-NEXT: addl %ecx, %edx -; X86_WIDEN-NEXT: movd %edx, %xmm1 -; 
X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86_WIDEN-NEXT: movq %xmm1, (%ebx) -; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X86_WIDEN-NEXT: pand %xmm1, %xmm3 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $31, %xmm1 +; X86_WIDEN-NEXT: psrad $2, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm1, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X86_WIDEN-NEXT: pslld $3, %xmm1 +; X86_WIDEN-NEXT: psubd %xmm1, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movq %xmm2, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a,