Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -231,7 +231,7 @@
   /// several shifts, adds, and multiplies for this target.
   /// The definition of "cheaper" may depend on whether we're optimizing
   /// for speed or for size.
-  virtual bool isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  virtual bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const {
     return false;
   }
 
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2369,14 +2369,14 @@
   // alternate sequence.  Targets may check function attributes for size/speed
   // trade-offs.
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     if (SDValue Op = BuildSDIV(N))
       return Op;
 
   // sdiv, srem -> sdivrem
   // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true.
   // Otherwise, we break the simplification logic in visitREM().
-  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     if (SDValue DivRem = useDivRem(N))
       return DivRem;
 
@@ -2434,14 +2434,14 @@
   // fold (udiv x, c) -> alternate
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr, false))
     if (SDValue Op = BuildUDIV(N))
       return Op;
 
   // sdiv, srem -> sdivrem
   // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true.
   // Otherwise, we break the simplification logic in visitREM().
-  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr, false))
     if (SDValue DivRem = useDivRem(N))
       return DivRem;
 
@@ -2506,7 +2506,7 @@
   // div is not cheap, combine will not return a DIVREM.  Regardless,
   // checking cheapness here makes sense since the simplification results in
   // fatter code.
-  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr, isSigned)) {
     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
     SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1);
     AddToWorklist(Div.getNode());
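Note: the extra Signed argument lets a target answer "is integer division cheap?"
separately for sdiv/srem and udiv/urem; the DAGCombiner changes above pass true on
the signed paths and false on the unsigned ones, and the BPF change further down
relies on exactly this distinction. As a rough illustration only (MyTargetLowering
and its policy are hypothetical, not part of this patch), a target with a hardware
unsigned divider but no signed one could now write:

  // Hypothetical override, not part of this patch: keep udiv/urem as real
  // divide instructions, but let the combiner expand sdiv/srem via BuildSDIV.
  bool MyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
                                       bool Signed) const {
    return !Signed && VT.isInteger();
  }
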
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2921,7 +2921,7 @@
                                     std::vector<SDNode *> *Created) const {
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     return SDValue(N,0); // Lower SDIV as SDIV
   return SDValue();
 }
@@ -2950,19 +2950,29 @@
   APInt::ms magics = Divisor.magic();
 
   // Multiply the numerator (operand 0) by the magic value
-  // FIXME: We should support doing a MUL in a wider type
+  // FIXME: expand using MULHS for vector types after addressing possible
+  // regressions in the X86 backend.
+  unsigned Opcode;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT)
+                          : isOperationLegalOrCustom(ISD::MULHS, VT))
+    Opcode = ISD::MULHS;
+  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT)
+                               : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+    Opcode = ISD::SMUL_LOHI;
+  else if (!IsAfterLegalization && !VT.isVector())
+    Opcode = ISD::MULHS;
+  else
+    return SDValue();
+
   SDValue Q;
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT) :
-                            isOperationLegalOrCustom(ISD::MULHS, VT))
+  if (Opcode == ISD::MULHS)
     Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
                     DAG.getConstant(magics.m, dl, VT));
-  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT) :
-                                 isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+  else
     Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
                             N->getOperand(0),
                             DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
-  else
-    return SDValue();       // No mulhs or equvialent
+
   // If d > 0 and m < 0, add the numerator
   if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
     Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
@@ -3029,16 +3039,25 @@
   }
 
   // Multiply the numerator (operand 0) by the magic value
-  // FIXME: We should support doing a MUL in a wider type
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
-                            isOperationLegalOrCustom(ISD::MULHU, VT))
+  // FIXME: expand using MULHU for vector types after addressing possible
+  // regressions in the X86 backend.
+  unsigned Opcode;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
+                          : isOperationLegalOrCustom(ISD::MULHU, VT))
+    Opcode = ISD::MULHU;
+  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
+                               : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+    Opcode = ISD::UMUL_LOHI;
+  else if (!IsAfterLegalization && !VT.isVector())
+    Opcode = ISD::MULHU;
+  else
+    return SDValue();
+
+  if (Opcode == ISD::MULHU)
     Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
-  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
-                                 isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+  else
     Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
                             DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
-  else
-    return SDValue();       // No mulhu or equivalent
 
   Created->push_back(Q.getNode());
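Note: for reference, the scalar code that BuildSDIV/BuildUDIV emit for a constant
divisor of 7 corresponds roughly to the C++ below. The 32-bit magic constants match
the AMDGPU tests added later in this patch (0x92492493 for sdiv, 0x24924925 for
udiv); the shift amounts and fix-up order follow the usual magic-number
construction and are a sketch, not code copied from the compiler:

  #include <cstdint>

  // Signed n / 7: take the high half of the widened multiply (MULHS), add the
  // numerator back because the magic constant is negative, then shift and add
  // the sign bit so the result rounds toward zero.
  int32_t sdiv7(int32_t n) {
    const int32_t Magic = (int32_t)0x92492493;          // magics.m
    int32_t q = (int32_t)(((int64_t)n * Magic) >> 32);  // MULHS
    q += n;                                             // d > 0 and m < 0
    q >>= 2;                                            // magics.s
    q += (uint32_t)n >> 31;                             // fix up negative n
    return q;
  }

  // Unsigned n / 7: this magic constant needs the "add" fix-up, so the
  // expansion is q = MULHU(n, magic); t = ((n - q) >> 1) + q; result = t >> 2.
  uint32_t udiv7(uint32_t n) {
    uint32_t q = (uint32_t)(((uint64_t)n * 0x24924925u) >> 32);  // MULHU
    uint32_t t = ((n - q) >> 1) + q;
    return t >> 2;
  }
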
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -402,7 +402,7 @@
     return AArch64::X1;
   }
 
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
 
   bool isCheapToSpeculateCttz() const override {
     return true;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7611,7 +7611,7 @@
                                      SelectionDAG &DAG,
                                      std::vector<SDNode *> *Created) const {
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (isIntDivCheap(N->getValueType(0), Attr))
+  if (isIntDivCheap(N->getValueType(0), Attr, true))
     return SDValue(N,0); // Lower SDIV as SDIV
 
   // fold (sdiv X, pow2)
@@ -10621,7 +10621,8 @@
   }
 }
 
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                          bool Signed) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -910,6 +910,12 @@
   (S_ADD_U32 $src0, $src1)
 >;
 
+// Similarly for V_SUB_I32/S_SUB_U32.
+def : Pat <
+  (i32 (subc i32:$src0, i32:$src1)),
+  (S_SUB_U32 $src0, $src1)
+>;
+
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple
 // outputs.
@@ -932,7 +938,6 @@
 >;
 
 
-
 //===----------------------------------------------------------------------===//
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
Index: lib/Target/BPF/BPFISelLowering.h
===================================================================
--- lib/Target/BPF/BPFISelLowering.h
+++ lib/Target/BPF/BPFISelLowering.h
@@ -46,6 +46,8 @@
       EmitInstrWithCustomInserter(MachineInstr &MI,
                                   MachineBasicBlock *BB) const override;
 
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
+
 private:
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/BPF/BPFISelLowering.cpp
===================================================================
--- lib/Target/BPF/BPFISelLowering.cpp
+++ lib/Target/BPF/BPFISelLowering.cpp
@@ -132,6 +132,14 @@
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
 }
 
+bool BPFTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                      bool Signed) const {
+  // We don't want to apply optimizations to SDIV, so that the resulting
+  // error messages about not having signed division do not depend on
+  // optimizations.
+  return Signed;
+}
+
 SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   case ISD::BR_CC:
Index: lib/Target/WebAssembly/WebAssemblyISelLowering.h
===================================================================
--- lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -58,7 +58,7 @@
                      unsigned AS) const override;
   bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
                                       bool *Fast) const override;
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
Index: lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
===================================================================
--- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -253,7 +253,8 @@
   return true;
 }
 
-bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                              bool Signed) const {
   // The current thinking is that wasm engines will perform this optimization,
   // so we can save on code size.
   return true;
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1030,7 +1030,7 @@
     /// \brief Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
 
-    bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+    bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
 
     bool supportSwiftError() const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33383,7 +33383,8 @@
   return -1;
 }
 
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                      bool Signed) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
Index: test/CodeGen/AMDGPU/sdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/sdiv.ll
+++ test/CodeGen/AMDGPU/sdiv.ll
@@ -136,6 +136,27 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}sdiv_i32_const:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI-NOT: v_rcp
+define void @sdiv_i32_const(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1)* %in
+  %result = sdiv i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i64_const:
+; SI-DAG: s_mov_b32 [[MAGIC_LO:s[0-9]+]], 0x24924925
+; SI-DAG: s_mov_b32 [[MAGIC_HI:s[0-9]+]], 0x49249249
+; SI-NOT: v_rcp
+define void @sdiv_i64_const(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64, i64 addrspace(1)* %in
+  %result = sdiv i64 %num, 7
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
 ; Tests for 64-bit divide bypass.
 ; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
Index: test/CodeGen/AMDGPU/udiv.ll
===================================================================
--- test/CodeGen/AMDGPU/udiv.ll
+++ test/CodeGen/AMDGPU/udiv.ll
@@ -158,3 +158,24 @@
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
+
+; FUNC-LABEL: {{^}}udiv_i32_const:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
+; SI-NOT: v_rcp
+define void @udiv_i32_const(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1)* %in
+  %result = udiv i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv_i64_const:
+; SI-DAG: s_mov_b32 [[MAGIC_HI:s[0-9]+]], 0x24924924
+; SI-DAG: s_mov_b32 [[MAGIC_LO:s[0-9]+]], 0x92492493
+; SI-NOT: v_rcp
+define void @udiv_i64_const(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64, i64 addrspace(1)* %in
+  %result = udiv i64 %num, 7
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
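Note: the SPARC test update below is a consequence of the new
!IsAfterLegalization fallback in BuildUDIV. Neither MULHU nor UMUL_LOHI appears
to be legal for i64 on that target, so the multiply-by-magic for the remainder
by 1021 is now emitted anyway and later expanded by legalization into 32-bit
pieces, which is what the long mulx/srlx/addcc sequence in the new CHECK lines
computes. A rough C++ model of that high-half multiply (the helper name and
structure are illustrative, not taken from the backend):

  #include <cstdint>

  // High 64 bits of a 64x64 unsigned multiply, assembled from four
  // 32x32->64 partial products, mirroring the expanded MULHU.
  uint64_t mulhu64(uint64_t a, uint64_t b) {
    uint64_t aLo = (uint32_t)a, aHi = a >> 32;
    uint64_t bLo = (uint32_t)b, bHi = b >> 32;
    uint64_t loLo = aLo * bLo;
    uint64_t loHi = aLo * bHi;
    uint64_t hiLo = aHi * bLo;
    uint64_t hiHi = aHi * bHi;
    // Carries out of the middle 32-bit column.
    uint64_t mid = (loLo >> 32) + (uint32_t)loHi + (uint32_t)hiLo;
    return hiHi + (loHi >> 32) + (hiLo >> 32) + (mid >> 32);
  }
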
Index: test/CodeGen/SPARC/rem.ll
===================================================================
--- test/CodeGen/SPARC/rem.ll
+++ test/CodeGen/SPARC/rem.ll
@@ -24,12 +24,39 @@
 
 ; PR18150
 ; CHECK-LABEL: test3
-; CHECK: sethi 2545, [[R0:%[gilo][0-7]]]
-; CHECK: or [[R0]], 379, [[R1:%[gilo][0-7]]]
-; CHECK: mulx %o0, [[R1]], [[R2:%[gilo][0-7]]]
-; CHECK: udivx [[R2]], 1021, [[R3:%[gilo][0-7]]]
-; CHECK: mulx [[R3]], 1021, [[R4:%[gilo][0-7]]]
-; CHECK: sub [[R2]], [[R4]], %o0
+; CHECK: sethi 2545, %o1
+; CHECK-NEXT: or %o1, 379, %o1
+; CHECK-NEXT: mulx %o0, %o1, %o0
+; CHECK-NEXT: srl %o0, 0, %o1
+; CHECK-NEXT: sethi 12324, %o2
+; CHECK-NEXT: or %o2, 108, %o2
+; CHECK-NEXT: mulx %o1, %o2, %o3
+; CHECK-NEXT: sethi 1331003, %o4
+; CHECK-NEXT: or %o4, 435, %o4
+; CHECK-NEXT: mulx %o1, %o4, %o1
+; CHECK-NEXT: srlx %o1, 32, %o1
+; CHECK-NEXT: add %o1, %o3, %o1
+; CHECK-NEXT: srlx %o1, 32, %o3
+; CHECK-NEXT: srlx %o0, 32, %o5
+; CHECK-NEXT: mulx %o5, %o4, %o4
+; CHECK-NEXT: srlx %o4, 32, %g2
+; CHECK-NEXT: mulx %o5, %o2, %o2
+; CHECK-NEXT: srlx %o2, 32, %o5
+; CHECK-NEXT: addcc %o1, %o4, %o1
+; CHECK-NEXT: addxcc %o3, %g2, %o1
+; CHECK-NEXT: addxcc %o5, 0, %o3
+; CHECK-NEXT: sllx %o3, 32, %o3
+; CHECK-NEXT: srl %o2, 0, %o2
+; CHECK-NEXT: or %o2, %o3, %o2
+; CHECK-NEXT: srl %o1, 0, %o1
+; CHECK-NEXT: add %o1, %o2, %o1
+; CHECK-NEXT: sub %o0, %o1, %o2
+; CHECK-NEXT: srlx %o2, 1, %o2
+; CHECK-NEXT: add %o2, %o1, %o1
+; CHECK-NEXT: srlx %o1, 9, %o1
+; CHECK-NEXT: mulx %o1, 1021, %o1
+; CHECK-NEXT: retl
+; CHECK-NEXT: sub %o0, %o1, %o0
 define i64 @test3(i64 %b) {
 entry: