Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3695,6 +3695,9 @@
                          SDValue N1, ISD::CondCode Cond,
                          DAGCombinerInfo &DCI, const SDLoc &DL) const;
+
+  SDValue BuildUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+                          DAGCombinerInfo &DCI, const SDLoc &DL) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2774,6 +2774,18 @@
       return V;
   }
 
+  // Fold remainder of division by a constant.
+  if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+    AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+
+    // When division is cheap or optimizing for minimum size,
+    // fall through to DIVREM creation by skipping this fold.
+    if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
+      if (SDValue Folded = BuildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
+        return Folded;
+  }
+
   // Fold away ALL boolean setcc's.
   SDValue Temp;
   if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
@@ -3703,6 +3715,80 @@
   return DAG.getSelect(dl, VT, IsOne, N0, Q);
 }
 
+/// Given an ISD::UREM node used only by an ISD::SETEQ or ISD::SETNE node,
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::BuildUREMEqFold(EVT VT, SDValue REMNode,
+                                        SDValue CompNode, ISD::CondCode Cond,
+                                        DAGCombinerInfo &DCI,
+                                        const SDLoc &DL) const {
+  // fold (seteq/ne (urem N, D), 0) -> (setule/setugt (rotr (mul N, P), K), Q)
+  // - D must be constant with D = D0 * 2^K where D0 is odd
+  // - P is the multiplicative inverse of D0 modulo 2^W
+  // - Q = floor((2^W - 1) / D0)
+  // where W is the width of the common type of N and D
+  SelectionDAG &DAG = DCI.DAG;
+  EVT REMVT = REMNode->getValueType(0);
+  bool IsEq = (Cond == ISD::SETEQ);
+
+  if (!isTypeLegal(REMVT))
+    return SDValue();
+
+  // TODO: Add non-uniform constant support.
+  ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
+  ConstantSDNode *CompTarget = isConstOrConstSplat(CompNode);
+  if (!Divisor || !CompTarget || Divisor->isNullValue() ||
+      !CompTarget->isNullValue())
+    return SDValue();
+
+  APInt D = Divisor->getAPIntValue();
+  unsigned W = D.getBitWidth();
+
+  // Rewrite D = D0 * 2^K.
+  unsigned K = D.countTrailingZeros();
+  APInt D0 = D.lshr(K);
+
+  // If D0 == 1, we cannot build this fold. Otherwise, D0 > 1, as needed.
+  if (D0.isOneValue())
+    return SDValue();
+
+  // Calculate the multiplicative inverse P of D0 using Newton's method.
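+  // Starting from P = D0 is valid because D0 * D0 == 1 (mod 8) for any odd
+  // D0, so P begins with at least three correct low-order bits, and each
+  // step P <- P * (2 - D0 * P) doubles the number of correct bits. The loop
+  // therefore terminates after O(log2 W) iterations.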
+  APInt tmp;
+  APInt P = D0;
+  while ((tmp = D0 * P) != 1)
+    P *= APInt(D0.getBitWidth(), 2) - tmp;
+
+  // Q = floor((2^W - 1) / D0)
+  APInt Q = APInt::getAllOnesValue(W);
+  Q = Q.udiv(D0);
+
+  SDValue PVal = DAG.getConstant(P, DL, REMVT);
+  SDValue QVal = DAG.getConstant(Q, DL, REMVT);
+  // (mul N, P)
+  SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
+  DCI.AddToWorklist(Op1.getNode());
+  // Will change in the signed case.
+  SDValue Op2 = QVal;
+
+  // Rotate right only if D was even and thus shifted (K > 0).
+  if (K) {
+    SDValue ShAmt =
+        DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
+    SDNodeFlags Flags;
+    Flags.setExact(true);
+    // UREM: (rotr (mul N, P), K)
+    // SREM: (rotr (add (mul N, P), Q'), K)
+    Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt, Flags);
+    DCI.AddToWorklist(Op1.getNode());
+  }
+
+  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+  Op1 = DAG.getSetCC(DL, VT, Op1, Op2, (IsEq ? ISD::SETULE : ISD::SETUGT));
+  return Op1;
+}
+
 bool TargetLowering::
 verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
   if (!isa<ConstantSDNode>(Op.getOperand(0))) {
Index: test/CodeGen/AArch64/urem-seteq-optsize.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/urem-seteq-optsize.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
+
+; On AArch64, division is expensive. BuildUREMEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK: udiv
+; CHECK-NOT: mul
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK: mul
+; CHECK-NOT: udiv
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
Index: test/CodeGen/AArch64/urem-seteq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/urem-seteq.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
+
+; This tests the BuildUREMEqFold optimization with UREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)]
+; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460]
+define i32 @test_urem_odd(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #52429
+; CHECK-NEXT:    movk w8, #52428, lsl #16
+; CHECK-NEXT:    mov w9, #13108
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    movk w9, #13107, lsl #16
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 30 set.
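+; Here Q = floor((2^32 - 1) / 1073741827) = 3, so the expected code compares
+; against 4 rather than against a large constant.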
+define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit30:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    movk w8, #27306, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    cmp w8, #4 // =4
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1073741827
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 31 set.
+define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit31:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    movk w8, #10922, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    cmp w8, #2 // =2
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 2147483651
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildUREMEqFold optimization with UREM, i16, even divisor, SETNE.
+; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087]
+; Q <- [Q >>rot K] <=> [Q >>rot 1]
+; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362]
+define i16 @test_urem_even(i16 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #28087
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    movk w9, #46811, lsl #16
+; CHECK-NEXT:    mul w8, w8, w9
+; CHECK-NEXT:    mov w9, #18724
+; CHECK-NEXT:    ror w8, w8, #1
+; CHECK-NEXT:    movk w9, #9362, lsl #16
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 30 set.
+define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit30:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #20165
+; CHECK-NEXT:    movk w8, #64748, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    ror w8, w8, #3
+; CHECK-NEXT:    cmp w8, #32 // =32
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 31 set.
+define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit31:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #64251
+; CHECK-NEXT:    movk w8, #47866, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    ror w8, w8, #1
+; CHECK-NEXT:    cmp w8, #4 // =4
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 2147483750
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We should not proceed with this fold if the divisor is 1 or -1.
+define i32 @test_urem_one(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_one:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr w0, wzr, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere;
+; also, BuildUREMEqFold does not work when the only odd factor of the divisor is 1.
+; This ensures we don't touch powers of two.
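+; (16 = 1 * 2^4, so D0 = 1 here, which is exactly the case BuildUREMEqFold
+; rejects.)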
+define i32 @test_urem_pow2(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_pow2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w0, #0xf
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 16
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@
 ; CHECK-NEXT:    jne .LBB12_8
 ; CHECK-NEXT:  # %bb.4: # %if.end29
 ; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT:    shrl $19, %ecx
-; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    leal (%ecx,%ecx,4), %ecx
-; CHECK-NEXT:    cmpw %cx, %ax
-; CHECK-NEXT:    jne .LBB12_5
+; CHECK-NEXT:    imull $-13107, %eax, %eax
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    cmpl $13108, %eax
+; CHECK-NEXT:    jae .LBB12_5
 ; CHECK-NEXT:  .LBB12_8: # %if.then44
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
Index: test/CodeGen/X86/urem-seteq-optsize.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/urem-seteq-optsize.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64,NOBMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 < %s | FileCheck %s --check-prefixes=CHECK,X64,BMI2
+
+; On X86, division is expensive. BuildUREMEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK: divl
+; CHECK-NOT: imull
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK: imull
+; CHECK-NOT: divl
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
Index: test/CodeGen/X86/urem-seteq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/urem-seteq.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64,NOBMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 < %s | FileCheck %s --check-prefixes=CHECK,X64,BMI2
+
+; This tests the BuildUREMEqFold optimization with UREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)]
+; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460]
+define i32 @test_urem_odd(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 30 set.
+define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd_bit30:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $4, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd_bit30:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $4, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 1073741827
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 31 set.
+define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd_bit31:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $2, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd_bit31:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $715827883, %edi, %ecx # imm = 0x2AAAAAAB
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $2, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 2147483651
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildUREMEqFold optimization with UREM, i16, even divisor, SETNE.
+; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087]
+; Q <- [Q >>rot K] <=> [Q >>rot 1]
+; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362]
+define i16 @test_urem_even(i16 %X) nounwind readnone {
+; X86-LABEL: test_urem_even:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7
+; X86-NEXT:    rorw $1, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $9362, %ecx # imm = 0x2492
+; X86-NEXT:    seta %al
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $28087, %edi, %eax # imm = 0x6DB7
+; X64-NEXT:    rorw $1, %ax
+; X64-NEXT:    movzwl %ax, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $9362, %ecx # imm = 0x2492
+; X64-NEXT:    seta %al
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 30 set.
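+; 1073741928 = 134217741 * 2^3, so K = 3 (the rotate amount below) and
+; Q = floor((2^32 - 1) / 134217741) = 31 (hence the compare against 32).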
+define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_even_bit30:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5
+; X86-NEXT:    rorl $3, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $32, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even_bit30:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5
+; X64-NEXT:    rorl $3, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $32, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 31 set.
+define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_even_bit31:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB
+; X86-NEXT:    rorl $1, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $4, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even_bit31:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-1157956869, %edi, %ecx # imm = 0xBAFAFAFB
+; X64-NEXT:    rorl $1, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $4, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 2147483750
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We should not proceed with this fold if the divisor is 1 or -1.
+define i32 @test_urem_one(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_one:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = urem i32 %X, 1
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere;
+; also, BuildUREMEqFold does not work when the only odd factor of the divisor is 1.
+; This ensures we don't touch powers of two.
+define i32 @test_urem_pow2(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_pow2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_pow2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $15, %dil
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+entry:
+  %0 = urem i32 %X, 16
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
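
As a sanity check on the constants above, here is a small standalone C++ sketch (not part of the patch; written for this review, with names of my choosing) that brute-forces the i16 even-divisor case from test_urem_even, D = 14 = 7 * 2^1. It reproduces the multiplier 28087 and the bound 9362 that appear in the CHECK lines:

// Brute-force check of: (N % 14 == 0)  <=>  (rotr(N * P, 1) <= Q)  in 16 bits.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t W = 16, Mask = 0xFFFF;
  const uint32_t D0 = 7, K = 1; // D = D0 * 2^K = 14

  // Multiplicative inverse P of D0 modulo 2^16, computed with the same
  // Newton iteration BuildUREMEqFold uses.
  uint32_t P = D0;
  while (((D0 * P) & Mask) != 1)
    P = (P * (2 - D0 * P)) & Mask;
  assert(P == 28087); // the multiplier the tests expect

  const uint32_t Q = Mask / D0; // floor((2^16 - 1) / 7) = 9362

  for (uint32_t N = 0; N <= Mask; ++N) {
    uint32_t R = (N * P) & Mask;
    R = ((R >> K) | (R << (W - K))) & Mask; // rotate right by K
    assert((N % 14 == 0) == (R <= Q));
  }
  return 0;
}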