diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4943,7 +4943,7 @@
                                  ISD::CondCode Cond, DAGCombinerInfo &DCI,
                                  const SDLoc &DL) const {
-  SmallVector<SDNode *, 2> Built;
+  SmallVector<SDNode *, 5> Built;
   if (SDValue Folded = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode,
                                          Cond, DCI, DL, Built)) {
     for (SDNode *N : Built)
@@ -4978,6 +4978,8 @@
   if (!isOperationLegalOrCustom(ISD::MUL, VT))
     return SDValue();
 
+  bool ComparingWithAllZeros = true;
+  bool AllComparisonsWithNonZerosAreTautological = true;
   bool HadTautologicalLanes = false;
   bool AllLanesAreTautological = true;
   bool HadEvenDivisor = false;
@@ -4993,6 +4995,8 @@
     const APInt &D = CDiv->getAPIntValue();
     const APInt &Cmp = CCmp->getAPIntValue();
 
+    ComparingWithAllZeros &= Cmp.isNullValue();
+
    // `x u% C1` is *always* less than C1. So given `x u% C1 == C2`,
    // if C2 is not less than C1, the comparison is always false.
    // But we will only be able to produce the comparison that will give the
@@ -5000,12 +5004,6 @@
     bool TautologicalInvertedLane = D.ule(Cmp);
     HadTautologicalInvertedLanes |= TautologicalInvertedLane;
 
-    // If we are checking that remainder is something smaller than the divisor,
-    // then this comparison isn't tautological. For now this is not handled,
-    // other than the comparison that remainder is zero.
-    if (!Cmp.isNullValue() && !TautologicalInvertedLane)
-      return false;
-
     // If all lanes are tautological (either all divisors are ones, or divisor
     // is not greater than the constant we are comparing with),
     // we will prefer to avoid the fold.
@@ -5013,6 +5011,12 @@
     HadTautologicalLanes |= TautologicalLane;
     AllLanesAreTautological &= TautologicalLane;
 
+    // If we are comparing with non-zero, we'll need to subtract said
+    // comparison value from the LHS. But there is no point in doing that if
+    // every lane where we are comparing with non-zero is tautological.
+    if (!Cmp.isNullValue())
+      AllComparisonsWithNonZerosAreTautological &= TautologicalLane;
+
     // Decompose D into D0 * 2^K
     unsigned K = D.countTrailingZeros();
     assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
@@ -5033,8 +5037,15 @@
     assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
     assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
 
-    // Q = floor((2^W - 1) / D)
-    APInt Q = APInt::getAllOnesValue(W).udiv(D);
+    // Q = floor((2^W - 1) u/ D)
+    // R = ((2^W - 1) u% D)
+    APInt Q, R;
+    APInt::udivrem(APInt::getAllOnesValue(W), D, Q, R);
+
+    // If we are comparing with zero, then that comparison constant (Q) is
+    // okay, else it may need to be one less than that.
+    if (Cmp.ugt(R))
+      Q -= 1;
 
     assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
            "We are expecting that K is always less than all-ones for ShSVT");
@@ -5093,6 +5104,14 @@
     QVal = QAmts[0];
   }
 
+  if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
+    if (!isOperationLegalOrCustom(ISD::SUB, VT))
+      return SDValue(); // FIXME: Could/should use `ISD::ADD`?
+    assert(CompTargetNode.getValueType() == N.getValueType() &&
+           "Expecting that the types on LHS and RHS of comparisons match.");
+    N = DAG.getNode(ISD::SUB, DL, VT, N, CompTargetNode);
+  }
+
   // (mul N, P)
   SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
   Created.push_back(Op0.getNode());
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
@@ -6,12 +6,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #33
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 3
 %cmp = icmp eq i32 %urem, 1
@@ -23,12 +21,11 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #33
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1431655766
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 3
 %cmp = icmp eq i32 %urem, 2
@@ -41,12 +38,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 1
@@ -58,12 +53,11 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1717986918
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 2
@@ -75,12 +69,11 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #3 // =3
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1717986919
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 3
@@ -92,12 +85,11 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #4 // =4
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-858993460
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 4
@@ -110,12 +102,13 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 1
@@ -127,12 +120,13 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1431655766
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 2
@@ -144,12 +138,13 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #3 // =3
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: sub w8, w8, #1 // =1
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 3
@@ -160,13 +155,15 @@
 ; CHECK-LABEL: t32_6_4:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: mov w9, #21844
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #4 // =4
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: movk w9, #21845, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 4
@@ -177,13 +174,15 @@
 ; CHECK-LABEL: t32_6_5:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: mov w9, #43689
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #5 // =5
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 5
@@ -199,12 +198,11 @@
 ; CHECK-NEXT: mov w9, #43691
 ; CHECK-NEXT: and w8, w0, #0xffff
 ; CHECK-NEXT: movk w9, #43690, lsl #16
-; CHECK-NEXT: umull x9, w8, w9
-; CHECK-NEXT: lsr x9, x9, #33
-; CHECK-NEXT: add w9, w9, w9, lsl #1
-; CHECK-NEXT: sub w8, w8, w9
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w10, #-1431655766
+; CHECK-NEXT: madd w8, w8, w9, w10
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i16 %X, 3
 %cmp = icmp eq i16 %urem, 2
@@ -217,12 +215,11 @@
 ; CHECK-NEXT: mov w9, #43691
 ; CHECK-NEXT: and w8, w0, #0xff
 ; CHECK-NEXT: movk w9, #43690, lsl #16
-; CHECK-NEXT: umull x9, w8, w9
-; CHECK-NEXT: lsr x9, x9, #33
-; CHECK-NEXT: add w9, w9, w9, lsl #1
-; CHECK-NEXT: sub w8, w8, w9
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w10, #-1431655766
+; CHECK-NEXT: madd w8, w8, w9, w10
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i8 %X, 3
 %cmp = icmp eq i8 %urem, 2
@@ -234,12 +231,11 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #-6148914691236517206
 ; CHECK-NEXT: movk x8, #43691
-; CHECK-NEXT: umulh x8, x0, x8
-; CHECK-NEXT: lsr x8, x8, #1
-; CHECK-NEXT: add x8, x8, x8, lsl #1
-; CHECK-NEXT: sub x8, x0, x8
-; CHECK-NEXT: cmp x8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov x9, #-6148914691236517206
+; CHECK-NEXT: madd x8, x0, x8, x9
+; CHECK-NEXT: mov x9, #6148914691236517205
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
 %urem = urem i64 %X, 3
 %cmp = icmp eq i64 %urem, 2
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -4,18 +4,16 @@
 define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_3:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: adrp x9, .LCPI0_1
 ; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI0_0
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #3
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %urem = urem <4 x i32> %X,
@@ -26,18 +24,17 @@
 define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_5:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI1_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #5
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: mov w9, #13106
+; CHECK-NEXT: movk w9, #13107, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: dup v1.4s, w9
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %urem = urem <4 x i32> %X,
diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -5,27 +5,18 @@
 define i1 @t32_3_1(i32 %X) nounwind {
 ; X86-LABEL: t32_3_1:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_3_1:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $33, %rcx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 3
 %cmp = icmp eq i32 %urem, 1
@@ -35,27 +26,18 @@
 define i1 @t32_3_2(i32 %X) nounwind {
 ; X86-LABEL: t32_3_2:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_3_2:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $33, %rcx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 3
 %cmp = icmp eq i32 %urem, 2
@@ -66,27 +48,18 @@
 define i1 @t32_5_1(i32 %X) nounwind {
 ; X86-LABEL: t32_5_1:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_5_1:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 1
@@ -96,27 +69,18 @@
 define i1 @t32_5_2(i32 %X) nounwind {
 ; X86-LABEL: t32_5_2:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $1717986918, %eax # imm = 0x66666666
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_5_2:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $1717986918, %eax # imm = 0x66666666
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 2
@@ -126,27 +90,18 @@
 define i1 @t32_5_3(i32 %X) nounwind {
 ; X86-LABEL: t32_5_3:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $3, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $-1717986919, %eax # imm = 0x99999999
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_5_3:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $3, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $-1717986919, %eax # imm = 0x99999999
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 3
@@ -156,27 +111,18 @@
 define i1 @t32_5_4(i32 %X) nounwind {
 ; X86-LABEL: t32_5_4:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $4, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_5_4:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $4, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 5
 %cmp = icmp eq i32 %urem, 4
@@ -187,29 +133,20 @@
 define i1 @t32_6_1(i32 %X) nounwind {
 ; X86-LABEL: t32_6_1:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_6_1:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 1
@@ -219,29 +156,20 @@
 define i1 @t32_6_2(i32 %X) nounwind {
 ; X86-LABEL: t32_6_2:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_6_2:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 2
@@ -251,29 +179,20 @@
 define i1 @t32_6_3(i32 %X) nounwind {
 ; X86-LABEL: t32_6_3:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $3, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: decl %eax
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_6_3:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $3, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: decl %eax
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 3
@@ -283,29 +202,20 @@
 define i1 @t32_6_4(i32 %X) nounwind {
 ; X86-LABEL: t32_6_4:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $4, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655764, %eax # imm = 0x55555554
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_6_4:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $4, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655764, %eax # imm = 0x55555554
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 4
@@ -315,29 +225,20 @@
 define i1 @t32_6_5(i32 %X) nounwind {
 ; X86-LABEL: t32_6_5:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $5, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t32_6_5:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $5, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i32 %X, 6
 %cmp = icmp eq i32 %urem, 5
@@ -350,24 +251,20 @@
 define i1 @t16_3_2(i16 %X) nounwind {
 ; X86-LABEL: t16_3_2:
 ; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $43691, %eax, %ecx # imm = 0xAAAB
-; X86-NEXT: shrl $17, %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: cmpw $2, %ax
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-21845, {{[0-9]+}}(%esp), %eax # imm = 0xAAAB
+; X86-NEXT: addl $-21846, %eax # imm = 0xAAAA
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: cmpl $21845, %eax # imm = 0x5555
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t16_3_2:
 ; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
-; X64-NEXT: shrl $17, %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpw $2, %di
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-21845, %edi, %eax # imm = 0xAAAB
+; X64-NEXT: addl $-21846, %eax # imm = 0xAAAA
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: cmpl $21845, %eax # imm = 0x5555
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i16 %X, 3
 %cmp = icmp eq i16 %urem, 2
@@ -377,24 +274,18 @@
 define i1 @t8_3_2(i8 %X) nounwind {
 ; X86-LABEL: t8_3_2:
 ; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subb %cl, %al
-; X86-NEXT: cmpb $2, %al
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-85, {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addb $-86, %al
+; X86-NEXT: cmpb $85, %al
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t8_3_2:
 ; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %ecx
-; X64-NEXT: subb %cl, %al
-; X64-NEXT: cmpb $2, %al
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-85, %edi, %eax
+; X64-NEXT: addb $-86, %al
+; X64-NEXT: cmpb $85, %al
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i8 %X, 3
 %cmp = icmp eq i8 %urem, 2
@@ -419,14 +310,13 @@
 ;
 ; X64-LABEL: t64_3_2:
 ; X64: # %bb.0:
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
-; X64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X64-NEXT: subq %rax, %rdi
-; X64-NEXT: cmpq $2, %rdi
-; X64-NEXT: sete %al
+; X64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: imulq %rdi, %rax
+; X64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; X64-NEXT: cmpq %rax, %rcx
+; X64-NEXT: setb %al
 ; X64-NEXT: retq
 %urem = urem i64 %X, 3
 %cmp = icmp eq i64 %urem, 2
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -8,77 +8,52 @@
 define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t32_3:
 ; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $1, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
 ; CHECK-SSE2-NEXT: retq
 ;
 ; CHECK-SSE41-LABEL: t32_3:
 ; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT: retq
 ;
 ; CHECK-AVX1-LABEL: t32_3:
 ; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT: retq
 ;
 ; CHECK-AVX2-LABEL: t32_3:
 ; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
 ; CHECK-AVX512VL-LABEL: t32_3:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: retq
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
@@ -88,77 +63,53 @@
 define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t32_5:
 ; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pslld $2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
 ; CHECK-SSE2-NEXT: retq
 ;
 ; CHECK-SSE41-LABEL: t32_5:
 ; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT: retq
 ;
 ; CHECK-AVX1-LABEL: t32_5:
 ; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT: retq
 ;
 ; CHECK-AVX2-LABEL: t32_5:
 ; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
 ; CHECK-AVX512VL-LABEL: t32_5:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: retq
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
@@ -233,16 +184,11 @@
 ;
 ; CHECK-AVX512VL-LABEL: t32_6_part0:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: retq
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
@@ -317,16 +263,11 @@
 ;
 ; CHECK-AVX512VL-LABEL: t32_6_part1:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: retq
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
@@ -415,18 +356,12 @@
 ;
 ; CHECK-AVX512VL-LABEL: t32_tautological:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; CHECK-AVX512VL-NEXT: retq
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
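
The arithmetic behind the updated CHECK lines can be sanity-checked independently of SelectionDAG. The following standalone C++ sketch (not part of the patch; `ror8` and `inverse8` are ad-hoc helper names) exhaustively confirms the equivalence the fold now emits, namely `x u% D == C` iff `ror((x - C) * P, K) u<= Q`, where `D = D0 * 2^K` with `D0` odd, `P` is the multiplicative inverse of `D0` modulo `2^W`, and `Q = floor((2^W - 1) u/ D)`, decremented by one when `C u> ((2^W - 1) u% D)`:

// Standalone sanity check (not part of the patch): exhaustively verify the
// rewritten comparison for every 8-bit divisor D, every non-tautological
// non-zero comparison constant C, and every input X.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Rotate an 8-bit value right by K (K in [0, 7]).
static uint8_t ror8(uint8_t V, unsigned K) {
  return K == 0 ? V : (uint8_t)((V >> K) | (V << (8 - K)));
}

// Multiplicative inverse of an odd D0 modulo 2^8, by brute force.
static uint8_t inverse8(uint8_t D0) {
  uint8_t P = 1;
  while ((uint8_t)(P * D0) != 1)
    ++P;
  return P;
}

int main() {
  for (unsigned D = 2; D <= 255; ++D) {   // divisor
    for (unsigned C = 1; C < D; ++C) {    // non-zero comparison constant
      // Decompose D into D0 * 2^K with D0 odd.
      unsigned K = 0;
      while (((D >> K) & 1) == 0)
        ++K;
      uint8_t D0 = (uint8_t)(D >> K);
      uint8_t P = inverse8(D0);
      // Q = floor((2^W - 1) u/ D); R = ((2^W - 1) u% D), with W = 8.
      uint8_t Q = (uint8_t)(255 / D);
      uint8_t R = (uint8_t)(255 % D);
      // If comparing with C u> R, the comparison constant is one less.
      if (C > R)
        --Q;
      for (unsigned X = 0; X <= 255; ++X) {
        bool Ref = X % D == C;
        // (X - C) wraps modulo 2^8, exactly like the new ISD::SUB node.
        bool Fold = ror8((uint8_t)((X - C) * P), K) <= Q;
        assert(Ref == Fold);
      }
    }
  }
  puts("OK: fold matches urem for all 8-bit (D, C, X)");
  return 0;
}

For instance, instantiating this at 32 bits for `urem i32 %X, 3` compared against `2` (t32_3_2 above) gives P = 0xAAAAAAAB, a pre-added `-2 * P = 0xAAAAAAAA` (the `addl` immediate), and Q = 0x55555554; the generated code instead checks `u< 0x55555555` via `setb`/`lo`, which is the same bound.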