Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3495,6 +3495,12 @@
                 bool IsAfterLegalization,
                 SmallVectorImpl<SDNode *> &Created) const;
 
+  //===--------------------------------------------------------------------===//
+  // REM equality fold
+  //
+  SDValue BuildREMEqFold(SDNode *N, SDNode *CondNode, SelectionDAG &DAG,
+                         std::vector<SDNode *> &Created) const;
+
   /// Targets may override this function to provide custom SDIV lowering for
   /// power-of-2 denominators.  If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -431,6 +431,7 @@
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
+    SDValue BuildREMEqFold(SDNode *REMNode, SDNode *CondNode);
     SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
     SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
     SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
@@ -3368,6 +3369,33 @@
     }
   }
 
+  // fold (rem lhs, rhs) -> lhs when |lhs| < |rhs|
+  // FIXME: this optimization is otherwise missed if BuildREMEqFold succeeds.
+  // Ref: test1 in test/CodeGen/X86/jump_sign.ll
+  if (N1C) {
+    KnownBits LHSBits;
+    DAG.computeKnownBits(N0, LHSBits);
+    LHSBits.makeNonNegative();
+    unsigned TypeWidth = N0.getValueSizeInBits();
+    unsigned LeadingZeros = LHSBits.countMinLeadingZeros();
+    APInt N1Value = N1C->getAPIntValue();
+    // If the possibly nonzero part of LHS is as wide as the whole type,
+    // we cannot conclude that |LHS| < |RHS|.
+    if (TypeWidth - LeadingZeros < N1Value.getBitWidth()) {
+      APInt LHSMaxValue(N1Value.getBitWidth(), 0);
+      LHSMaxValue.setLowBits(TypeWidth - LeadingZeros);
+      N1Value.clearSignBit();
+      // If |LHS| < |RHS|, the remainder is just LHS.
+      if (LHSMaxValue.ult(N1Value))
+        return N0;
+    }
+  }
+
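+  // If this REM's only user is an equality comparison against zero, try to
+  // replace the REM/SETCC pair with a multiply, a rotate, and an unsigned
+  // comparison (see BuildREMEqFold below).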
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::SETCC) {
+    if (SDValue Folded = BuildREMEqFold(N, *N->use_begin()))
+      return Folded;
+  }
+
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
   // If X/C can be simplified by the division-by-constant logic, lower
@@ -18096,6 +18124,36 @@
   return S;
 }
 
+/// Given an ISD::UREM or ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// node where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue DAGCombiner::BuildREMEqFold(SDNode *REMNode, SDNode *CondNode) {
+  // When optimizing for size, keep the UREM/SREM: it is shorter than any
+  // expansion we would create here.
+  if (DAG.getMachineFunction().getFunction().optForMinSize())
+    return SDValue();
+
+  SDValue CondCodeNode = CondNode->getOperand(2);
+  ISD::CondCode CondCode = cast<CondCodeSDNode>(CondCodeNode)->get();
+  if (CondCode != ISD::SETEQ && CondCode != ISD::SETNE)
+    return SDValue();
+
+  std::vector<SDNode *> Built;
+  SDValue NewCond = TLI.BuildREMEqFold(REMNode, CondNode, DAG, Built);
+
+  if (NewCond) {
+    SDNode *Last = Built.back();
+    for (SDNode *N : Built)
+      AddToWorklist(N);
+    AddToWorklist(NewCond.getNode());
+    DAG.ReplaceAllUsesWith(SDValue(CondNode, 0), NewCond);
+    return SDValue(Last, 0);
+  }
+
+  return SDValue();
+}
+
 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG
 /// expression that will generate the same value by multiplying by a magic
 /// number.
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3596,6 +3596,102 @@
   }
 }
 
+/// Given an ISD::UREM or ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// node where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::BuildREMEqFold(SDNode *REMNode, SDNode *CondNode,
+                                       SelectionDAG &DAG,
+                                       std::vector<SDNode *> &Created) const {
+  // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
+  // fold (seteq/ne (srem N, D), 0)
+  //  -> (setule/ugt (rotr (add (mul N, P), Q''), K), (srl Q'', K - 1))
+  // - D must be constant with D = D0 * 2^K where D0 is odd
+  // - P is the multiplicative inverse of D0 modulo 2^W
+  // - Q = floor((2^W - 1) / D)
+  // - Q' = floor((2^(W - 1) - 1) / D)
+  // - Q'' = Q' & -2^K
+  // where W is the width of the common type of N and D
+  EVT REMVT = REMNode->getValueType(0);
+  EVT CondVT = CondNode->getValueType(0);
+  SDLoc DL(REMNode);
+  bool IsEq =
+      cast<CondCodeSDNode>(CondNode->getOperand(2))->get() == ISD::SETEQ;
+  bool IsSigned = (REMNode->getOpcode() == ISD::SREM);
+
+  if (!isTypeLegal(REMVT))
+    return SDValue();
+
+  // TODO: Add non-uniform constant support.
+  ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
+  ConstantSDNode *CompTarget = isConstOrConstSplat(CondNode->getOperand(1));
+  if (!Divisor || !CompTarget || Divisor->isNullValue() ||
+      !CompTarget->isNullValue())
+    return SDValue();
+  APInt D = Divisor->getAPIntValue();
+
+  // Rewrite D = D0 * 2^K.
+  unsigned K = D.countTrailingZeros();
+  APInt D0 = D.ashr(K);
+
+  // Calculate the multiplicative inverse of D0 using Newton's method.
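+  // Each step P <- P * (2 - D0 * P) doubles the number of correct low-order
+  // bits: D0 is odd, so the seed P = D0 already satisfies D0 * P == 1 (mod 8),
+  // and the loop therefore takes at most about log2(W) iterations.
+  // Illustrative example (not part of the original algorithm description):
+  // for D0 = 7 and W = 32 the loop converges to P = 0xB6DB6DB7, and indeed
+  // 7 * 0xB6DB6DB7 == 1 (mod 2^32).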
+  APInt Tmp, P = D0;
+  while ((Tmp = D0 * P) != 1)
+    P *= APInt(D0.getBitWidth(), 2) - Tmp;
+
+  // Q = floor((2^W - 1) / D), or Q' = floor((2^(W - 1) - 1) / D) if signed
+  APInt Q = APInt::getAllOnesValue(D.getBitWidth());
+  if (IsSigned)
+    Q.clearBit(D.getBitWidth() - 1);
+  Q = Q.udiv(D);
+  if (IsSigned) {
+    // Q'' = Q' & -2^K
+    APInt Neg2K = APInt(D.getBitWidth(), 0);
+    Neg2K.setHighBits(D.getBitWidth() - K);
+    Q &= Neg2K;
+  }
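+
+  // Illustrative example (taken from test9 in test/CodeGen/X86/rem.ll below):
+  // for srem i8 %X, 12 we get W = 8, K = 2, D0 = 3, P = 0xAB, Q' = 10 and
+  // Q'' = 8.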
+
+  SDValue PVal = DAG.getConstant(P, DL, REMVT);
+  SDValue QVal = DAG.getConstant(Q, DL, REMVT);
+  // (mul N, P)
+  SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
+  Created.push_back(Op1.getNode());
+  // Will change in the signed case
+  SDValue Op2 = QVal;
+
+  if (IsSigned) {
+    // (add (mul N, P), Q'')
+    Op1 = DAG.getNode(ISD::ADD, DL, REMVT, Op1, QVal);
+    Created.push_back(Op1.getNode());
+  }
+
+  // Rotate right only if D was even and thus shifted (K > 0)
+  if (K) {
+    SDValue ShAmt =
+        DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
+    // UREM: (rotr (mul N, P), K)
+    // SREM: (rotr (add (mul N, P), Q''), K)
+    Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt);
+    Created.push_back(Op1.getNode());
+  }
+
+  // Don't insert a useless srl-by-zero node if K - 1 == 0
+  if (IsSigned && K != 1) {
+    if (K != 0)
+      Op2 = DAG.getConstant(Q.lshr(K - 1), DL, REMVT);
+    else
+      Op2 = DAG.getConstant(Q.shl(1), DL, REMVT);
+  }
+
+  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+  // SREM: (setule/setugt (rotr (add (mul N, P), Q''), K), (srl Q'', K - 1))
+  Op1 = DAG.getSetCC(DL, CondVT, Op1, Op2, (IsEq ? ISD::SETULE : ISD::SETUGT));
+  return Op1;
+}
+
 bool TargetLowering::
 verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
   if (!isa<ConstantSDNode>(Op.getOperand(0))) {
Index: test/CodeGen/Hexagon/swp-const-tc2.ll
===================================================================
--- test/CodeGen/Hexagon/swp-const-tc2.ll
+++ test/CodeGen/Hexagon/swp-const-tc2.ll
@@ -9,14 +9,14 @@
 ; CHECK: = mpy
 ; CHECK-NOT: = mpy
 
-define void @f0() {
+define i32 @f0(i32* %a0) {
 b0:
   br label %b1
 
 b1: ; preds = %b1, %b0
   %v0 = phi i32 [ 0, %b0 ], [ %v9, %b1 ]
   %v1 = phi i32 [ 0, %b0 ], [ %v8, %b1 ]
-  %v2 = load i32, i32* undef, align 4
+  %v2 = load i32, i32* %a0, align 4
   %v3 = add nsw i32 %v1, 1
   %v4 = srem i32 %v2, 3
   %v5 = icmp ne i32 %v4, 0
@@ -32,5 +32,5 @@
   br label %b3
 
 b3: ; preds = %b3, %b2
-  br label %b3
+  ret i32 %v11
 }
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@
 ; CHECK-NEXT: jne .LBB12_8
 ; CHECK-NEXT: # %bb.4: # %if.end29
 ; CHECK-NEXT: movzwl (%eax), %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT: shrl $19, %ecx
-; CHECK-NEXT: addl %ecx, %ecx
-; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx
-; CHECK-NEXT: cmpw %cx, %ax
-; CHECK-NEXT: jne .LBB12_5
+; CHECK-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD
+; CHECK-NEXT: rorw $1, %ax
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: cmpl $6554, %eax
+; CHECK-NEXT: jae .LBB12_5
 ; CHECK-NEXT: .LBB12_8: # %if.then44
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %al, %al
Index: test/CodeGen/X86/rem.ll
===================================================================
--- test/CodeGen/X86/rem.ll
+++ test/CodeGen/X86/rem.ll
@@ -78,3 +78,87 @@
   ret i32 %0
 }
 
+define i32 @test6(i32 %X) nounwind readnone {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: cmpl $858993460, %eax
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: jb [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: movl $-10, %eax
+; CHECK-NEXT: [[LOC]]: # %entry
+; CHECK-NEXT: retl
+
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i16 @test7(i16 %X) nounwind readnone {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: rorw $1, %ax
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: cmpl $4681, %eax
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: ja [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: movl $65526, %eax
+; CHECK-NEXT: [[LOC]]: # %entry
+; CHECK-NEXT: # kill: {{.*}}
+; CHECK-NEXT: retl
+
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = select i1 %cmp, i16 42, i16 -10
+  ret i16 %ret
+}
+
+define i32 @test8(i32 %X) nounwind readnone {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: imull $-1108378657, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: addl $69273666, %eax
+; CHECK-NEXT: cmpl $138547333, %eax
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: jb [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: movl $-10, %eax
+; CHECK-NEXT: [[LOC]]: # %entry
+; CHECK-NEXT: retl
+
+entry:
+  %0 = srem i32 %X, 31
+  %cmp = icmp eq i32 %0, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
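+; test9 exercises the signed path with an even divisor (K = 2): the multiply
+; by the inverse of D0 = 3 is followed by an add of Q'' and a rotate before
+; the final unsigned comparison.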
+define i8 @test9(i8 %X) nounwind readnone {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movb $-85, %cl
+; CHECK-NEXT: mulb %cl
+; CHECK-NEXT: addb $8, %al
+; CHECK-NEXT: rorb $2, %al
+; CHECK-NEXT: cmpb $4, %al
+; CHECK-NEXT: movb $42, %al
+; CHECK-NEXT: ja [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: movb $-10, %al
+; CHECK-NEXT: [[LOC]]: # %entry
+; CHECK-NEXT: retl
+
+entry:
+  %0 = srem i8 %X, 12
+  %cmp = icmp ne i8 %0, 0
+  %ret = select i1 %cmp, i8 42, i8 -10
+  ret i8 %ret
+}
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -85,17 +85,9 @@
 define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
 ; AVX1-LABEL: test3:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
-; AVX1-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmuldq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsrld $31, %xmm3, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpmulld LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddd LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpminud LCPI{{.*}}(%rip), %xmm0, %xmm3
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -106,19 +98,12 @@
 ;
 ; AVX2-LABEL: test3:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4
-; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]
-; AVX2-NEXT: vpmulld %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpmulld %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764]
+; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0