Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -13262,6 +13262,73 @@
       %res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 1)  ; %res = 4 (1 x 2 = 2)
 
 
+'``llvm.umul.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.umul.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.umul.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.umul.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.umul.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.umul.fix.sat``' family of intrinsic functions perform unsigned
+fixed point saturation multiplication on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point multiplication on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest unsigned value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest unsigned value representable by this bit width (zero).
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 2, i32 0)  ; %res = 6 (2 x 3 = 6)
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 2, i32 1)  ; %res = 3 (1.5 x 1 = 1.5)
+
+      ; The result in the following could be rounded up to 2.5 or down to 2
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 3, i32 1)  ; %res = 8 (or 9) (1.5 x 1.5 = 2.25)
+
+      ; Saturation
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 7, i4 2, i32 0)  ; %res = 7
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 7, i4 2, i32 2)  ; %res = 7
+
+      ; Scale can affect the saturation result
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 2, i4 4, i32 0)  ; %res = 7 (2 x 4 -> clamped to 7)
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 2, i4 4, i32 1)  ; %res = 4 (1 x 2 = 2)
+
+
 Specialised Arithmetic Intrinsics
 ---------------------------------
 
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -281,7 +281,7 @@
     /// Same as the corresponding unsaturated fixed point instructions, but the
     /// result is clamped between the min and max values representable by the
     /// bits of the first 2 operands.
-    SMULFIXSAT,
+    SMULFIXSAT, UMULFIXSAT,
 
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -838,6 +838,7 @@
     case ISD::SMULFIX:
     case ISD::SMULFIXSAT:
     case ISD::UMULFIX:
+    case ISD::UMULFIXSAT:
       Supported = isSupportedFixedPointOperation(Op, VT, Scale);
       break;
     }
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -858,6 +858,9 @@
 def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty],
                                  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
                                  [IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty],
+                                 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                                 [IntrNoMem, IntrSpeculatable, Commutative]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -393,6 +393,7 @@
 def smulfix    : SDNode<"ISD::SMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
 def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
 def umulfix    : SDNode<"ISD::UMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
+def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1129,7 +1129,8 @@
   }
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
-  case ISD::UMULFIX: {
+  case ISD::UMULFIX:
+  case ISD::UMULFIXSAT: {
     unsigned Scale = Node->getConstantOperandVal(2);
     Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
                                               Node->getValueType(0), Scale);
@@ -3294,6 +3295,7 @@
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
+  case ISD::UMULFIXSAT:
     Results.push_back(TLI.expandFixedPointMul(Node, DAG));
     break;
   case ISD::SADDO:
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -150,7 +150,8 @@
   case ISD::USUBSAT:     Res = PromoteIntRes_ADDSUBSAT(N); break;
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
-  case ISD::UMULFIX:     Res = PromoteIntRes_MULFIX(N); break;
+  case ISD::UMULFIX:
+  case ISD::UMULFIXSAT:  Res = PromoteIntRes_MULFIX(N); break;
 
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
@@ -1100,7 +1101,8 @@
 
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
-  case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
+  case ISD::UMULFIX:
+  case ISD::UMULFIXSAT: Res = PromoteIntOp_MULFIX(N); break;
 
   case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
   }
@@ -1633,7 +1635,8 @@
 
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
-  case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
+  case ISD::UMULFIX:
+  case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2609,7 +2612,10 @@
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   uint64_t Scale = N->getConstantOperandVal(2);
-  bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+  bool Saturating =
+      (N->getOpcode() == ISD::SMULFIXSAT || N->getOpcode() == ISD::UMULFIXSAT);
+  bool Signed =
+      (N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT);
   EVT BoolVT =
       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   SDValue Zero = DAG.getConstant(0, dl, VT);
@@ -2618,17 +2624,28 @@
     if (!Saturating) {
       Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
     } else {
-      Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+      unsigned MulOp = Signed ? ISD::SMULO : ISD::UMULO;
+      Result = DAG.getNode(MulOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
       SDValue Product = Result.getValue(0);
       SDValue Overflow = Result.getValue(1);
 
-      APInt MinVal = APInt::getSignedMinValue(VTSize);
-      APInt MaxVal = APInt::getSignedMaxValue(VTSize);
-      SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
-      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
-      SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
-      Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
-      Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+      if (Signed) {
+        APInt MinVal = APInt::getSignedMinValue(VTSize);
+        APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+
+        SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+        SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+        SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+        Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+        Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+      } else {
+        // For unsigned multiplication, we only need to check the max since we
+        // can't really overflow towards zero.
+        APInt MaxVal = APInt::getMaxValue(VTSize);
+
+        SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+        Result = DAG.getSelect(dl, VT, Overflow, SatMax, Product);
+      }
     }
     SplitInteger(Result, Lo, Hi);
     return;
@@ -2640,8 +2657,6 @@
   GetExpandedInteger(RHS, RL, RH);
   SmallVector<SDValue, 4> Result;
 
-  bool Signed = (N->getOpcode() == ISD::SMULFIX ||
-                 N->getOpcode() == ISD::SMULFIXSAT);
   unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
   if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
                           TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
@@ -2704,7 +2719,7 @@
     // the max value. If these top bits are < -1, then we overflowed past the
     // min value. Otherwise, we did not overflow.
     if (Saturating) {
-      unsigned OverflowBits = VTSize - Scale + 1;
+      unsigned OverflowBits = Signed ? (VTSize - Scale + 1) : (VTSize - Scale);
       assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
              "Extent of overflow bits must start within HL");
       SDValue HLHiMask = DAG.getConstant(
@@ -2713,20 +2728,27 @@
           APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
 
       // HH > 0 or HH == 0 && HL > HLLoMask
-      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero,
+                                   Signed ? ISD::SETGT : ISD::SETUGT);
       SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
       SDValue HLPos =
           DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
       SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
                            DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLPos));
 
-      // HH < -1 or HH == -1 && HL < HLHiMask
-      SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
-      SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
-      SDValue HLNeg =
-          DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
-      SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
-                           DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg));
+      if (Signed) {
+        // HH < -1 or HH == -1 && HL < HLHiMask
+        SDValue HHNeg =
+            DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+        SDValue HHNeg1 =
+            DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+        SDValue HLNeg =
+            DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
+        SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+                             DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg));
+      } else {
+        SatMin = HHZero;
+      }
     }
   } else if (Scale == NVTSize) {
     // If the scales are equal, Lo and Hi are ResultLH and Result HL,
@@ -2737,17 +2759,26 @@
     // We overflow max if HH > 0 or HH == 0 && HL sign is negative.
     // We overflow min if HH < -1 or HH == -1 && HL sign is 0.
     if (Saturating) {
-      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero,
+                                   Signed ? ISD::SETGT : ISD::SETUGT);
       SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
-      SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
-      SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
-                           DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg));
-
-      SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
-      SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
-      SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGT);
-      SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
-                           DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos));
+      if (Signed) {
+        SDValue HLNeg =
+            DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
+        SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
+                             DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg));
+
+        SDValue HHNeg =
+            DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+        SDValue HHNeg1 =
+            DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+        SDValue HLPos =
+            DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGT);
+        SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+                             DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos));
+      } else {
+        SatMax = HHZero;
+      }
     }
   } else if (Scale < VTSize) {
     // If the scale is instead less than the old VT size, but greater than or
@@ -2767,14 +2798,18 @@
     // This is similar to the case when we saturate if Scale < NVTSize, but we
     // only need to chech HH.
     if (Saturating) {
-      unsigned OverflowBits = VTSize - Scale + 1;
-      SDValue HHHiMask = DAG.getConstant(
-          APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
+      unsigned OverflowBits = Signed ? (VTSize - Scale + 1) : (VTSize - Scale);
       SDValue HHLoMask = DAG.getConstant(
           APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT);
 
-      SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT);
-      SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
+      SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask,
+                            Signed ? ISD::SETGT : ISD::SETUGT);
+
+      if (Signed) {
+        SDValue HHHiMask = DAG.getConstant(
+            APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
+        SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
+      }
     }
   } else if (Scale == VTSize) {
     assert(
@@ -2789,13 +2824,17 @@
   }
 
   if (Saturating) {
-    APInt LHMax = APInt::getSignedMaxValue(NVTSize);
+    APInt LHMax = Signed ? APInt::getSignedMaxValue(NVTSize)
+                         : APInt::getMaxValue(NVTSize);
     APInt LLMax = APInt::getAllOnesValue(NVTSize);
-    APInt LHMin = APInt::getSignedMinValue(NVTSize);
     Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LHMax, dl, NVT), Hi);
-    Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi);
     Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LLMax, dl, NVT), Lo);
-    Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
+
+    if (Signed) {
+      APInt LHMin = APInt::getSignedMinValue(NVTSize);
+      Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi);
+      Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
+    }
   }
 }
 
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -427,7 +427,8 @@
     break;
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
-  case ISD::UMULFIX: {
+  case ISD::UMULFIX:
+  case ISD::UMULFIXSAT: {
     unsigned Scale = Node->getConstantOperandVal(2);
     Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
                                               Node->getValueType(0), Scale);
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -174,6 +174,7 @@
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
+  case ISD::UMULFIXSAT:
     R = ScalarizeVecRes_MULFIX(N);
     break;
   }
@@ -863,6 +864,7 @@
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
+  case ISD::UMULFIXSAT:
     SplitVecRes_MULFIX(N, Lo, Hi);
     break;
   }
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5908,6 +5908,14 @@
                              Op3));
     return nullptr;
   }
+  case Intrinsic::umul_fix_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    setValue(&I, DAG.getNode(ISD::UMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
+                             Op3));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -303,6 +303,7 @@
   case ISD::SMULFIX:                    return "smulfix";
   case ISD::SMULFIXSAT:                 return "smulfixsat";
   case ISD::UMULFIX:                    return "umulfix";
+  case ISD::UMULFIXSAT:                 return "umulfixsat";
 
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5420,7 +5420,10 @@
   SDValue RHS = Node->getOperand(1);
   EVT VT = LHS.getValueType();
   unsigned Scale = Node->getConstantOperandVal(2);
-  bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
+  bool Saturating = (Node->getOpcode() == ISD::SMULFIXSAT ||
+                     Node->getOpcode() == ISD::UMULFIXSAT);
+  bool Signed =
+      Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
   EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   unsigned VTSize = VT.getScalarSizeInBits();
 
@@ -5428,7 +5431,7 @@
     // [us]mul.fix(a, b, 0) -> mul(a, b)
     if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
       return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
-    } else if (isOperationLegalOrCustom(ISD::SMULO, VT)) {
+    } else if (Signed && isOperationLegalOrCustom(ISD::SMULO, VT)) {
       SDValue Result =
           DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
       SDValue Product = Result.getValue(0);
@@ -5442,12 +5445,19 @@
       SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
       Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
       return DAG.getSelect(dl, VT, Overflow, Result, Product);
+    } else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) {
+      SDValue Result =
+          DAG.getNode(ISD::UMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+      SDValue Product = Result.getValue(0);
+      SDValue Overflow = Result.getValue(1);
+
+      APInt MaxVal = APInt::getMaxValue(VTSize);
+      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+      return DAG.getSelect(dl, VT, Overflow, SatMax, Product);
     }
     return SDValue();
   }
 
-  bool Signed =
-      Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
   assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
          "Expected scale to be less than the number of bits if signed or at "
          "most the number of bits if unsigned.");
@@ -5490,17 +5500,24 @@
   if (!Saturating)
     return Result;
 
-  unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
-  SDValue HiMask =
-      DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
+  unsigned OverflowBits = Signed ? (VTSize - Scale + 1) : (VTSize - Scale);
   SDValue LoMask = DAG.getConstant(
       APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
-  SDValue SatMax = DAG.getSetCC(dl, BoolVT, Hi, LoMask, ISD::SETGT);
-  SDValue SatMin = DAG.getSetCC(dl, BoolVT, Hi, HiMask, ISD::SETLT);
-
-  APInt MaxVal = APInt::getSignedMaxValue(VTSize);
-  APInt MinVal = APInt::getSignedMinValue(VTSize);
+  SDValue SatMax =
+      DAG.getSetCC(dl, BoolVT, Hi, LoMask, Signed ? ISD::SETGT : ISD::SETUGT);
+  APInt MaxVal =
+      Signed ? APInt::getSignedMaxValue(VTSize) : APInt::getMaxValue(VTSize);
   Result =
       DAG.getSelect(dl, VT, SatMax, DAG.getConstant(MaxVal, dl, VT), Result);
-  return DAG.getSelect(dl, VT, SatMin, DAG.getConstant(MinVal, dl, VT), Result);
+
+  if (Signed) {
+    SDValue HiMask =
+        DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
+    SDValue SatMin = DAG.getSetCC(dl, BoolVT, Hi, HiMask, ISD::SETLT);
+    APInt MinVal = APInt::getSignedMinValue(VTSize);
+    Result =
+        DAG.getSelect(dl, VT, SatMin, DAG.getConstant(MinVal, dl, VT), Result);
+  }
+
+  return Result;
 }
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -626,6 +626,7 @@
     setOperationAction(ISD::SMULFIX, VT, Expand);
     setOperationAction(ISD::SMULFIXSAT, VT, Expand);
     setOperationAction(ISD::UMULFIX, VT, Expand);
+    setOperationAction(ISD::UMULFIXSAT, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
Index: llvm/lib/IR/Verifier.cpp
===================================================================
--- llvm/lib/IR/Verifier.cpp
+++ llvm/lib/IR/Verifier.cpp
@@ -4576,7 +4576,8 @@
   }
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
-  case Intrinsic::umul_fix: {
+  case Intrinsic::umul_fix:
+  case Intrinsic::umul_fix_sat: {
     Value *Op1 = Call.getArgOperand(0);
     Value *Op2 = Call.getArgOperand(1);
     Assert(Op1->getType()->isIntOrIntVectorTy(),
Index: llvm/test/CodeGen/X86/umul_fix_sat.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -0,0 +1,547 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare  i4  @llvm.umul.fix.sat.i4   (i4,  i4, i32)
+declare  i32 @llvm.umul.fix.sat.i32  (i32, i32, i32)
+declare  i64 @llvm.umul.fix.sat.i64  (i64, i64, i32)
+declare  <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shldl $30, %ecx, %edx
+; X64-NEXT:    cmpl $3, %eax
+; X64-NEXT:    movl $-1, %eax
+; X64-NEXT:    cmovbel %edx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 2);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    shrdq $2, %rdx, %rax
+; X64-NEXT:    cmpq $3, %rdx
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    cmovaq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    setne %cl
+; X86-NEXT:    sete %ch
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    seta %bl
+; X86-NEXT:    andb %ch, %bl
+; X86-NEXT:    orb %cl, %bl
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $30, %ecx, %eax
+; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 2);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func3:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $3, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movb $-1, %al
+; X64-NEXT:    ja .LBB2_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    andb $15, %dil
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull %esi, %eax
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:  .LBB2_2:
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func3:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $3, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movb $-1, %al
+; X86-NEXT:    ja .LBB2_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 2);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shldl $30, %ecx, %edx
+; X64-NEXT:    cmpl $3, %eax
+; X64-NEXT:    movl $-1, %eax
+; X64-NEXT:    cmoval %eax, %edx
+; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %edx
+; X64-NEXT:    imulq %rcx, %rdx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    shldl $30, %edx, %esi
+; X64-NEXT:    cmpl $3, %ecx
+; X64-NEXT:    cmoval %eax, %esi
+; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    imulq %rcx, %rdx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    shldl $30, %edx, %esi
+; X64-NEXT:    cmpl $3, %ecx
+; X64-NEXT:    cmoval %eax, %esi
+; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    imulq %rcx, %rdx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    shldl $30, %edx, %esi
+; X64-NEXT:    cmpl $3, %ecx
+; X64-NEXT:    cmoval %eax, %esi
+; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrdl $2, %edx, %esi
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmoval %ecx, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    shrdl $2, %edx, %ebp
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    cmoval %ecx, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl $2, %edx, %ebx
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    cmoval %ecx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    movl %ebx, 8(%edi)
+; X86-NEXT:    movl %ebp, 4(%edi)
+; X86-NEXT:    movl %esi, (%edi)
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2);
+  ret <4 x i32> %tmp;
+}
+
+; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    mull %esi
+; X64-NEXT:    movl $-1, %ecx
+; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func4:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 0);
+  ret i32 %tmp;
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; X64-LABEL: func5:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    cmovoq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func5:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    setne %dl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    seto %bl
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    seto %ch
+; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    orb %ch, %bl
+; X86-NEXT:    orb %cl, %bl
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 0);
+  ret i64 %tmp;
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func6:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $15, %al
+; X64-NEXT:    andb $15, %sil
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    mulb %sil
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movb $-1, %al
+; X64-NEXT:    jo .LBB6_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:  .LBB6_2:
+; X64-NEXT:    retq
+;
+; X86-LABEL: func6:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    mulb %cl
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movb $-1, %al
+; X86-NEXT:    jo .LBB6_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:  .LBB6_2:
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 0);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec2:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    mull %ecx
+; X64-NEXT:    movl $-1, %ecx
+; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %edx
+; X64-NEXT:    mull %edx
+; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    mull %edx
+; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    mull %edx
+; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovol %esi, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    cmovol %esi, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    cmovol %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebx, 4(%ecx)
+; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0);
+  ret <4 x i32> %tmp;
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func7:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    shrdq $32, %rdx, %rax
+; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    cmovaq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func7:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl $1, %ebx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 32);
+  ret i64 %tmp;
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func8:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    shrdq $63, %rdx, %rax
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    cmovaq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func8:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    shrdl $31, %edx, %eax
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    cmovsl %edx, %ecx
+; X86-NEXT:    cmovsl %edx, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 63);
+  ret i64 %tmp;
+}