Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -526,6 +526,61 @@
   return Builder.CreateOr(V, Y);
 }
 
+/// We want to turn any of the possible patterns to ((a > b) ? a : b) - b) which
+/// is the cannonical for unsigned saturation subtraction pattern.
+/// TODO: Also support a - UMIN(a,b) patterns.
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
+                                            const Value *TrueVal,
+                                            const Value *FalseVal,
+                                            InstCombiner::BuilderTy &Builder) {
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  if (!ICmpInst::isUnsigned(Pred))
+    return nullptr;
+
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+
+  // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
+  if (match(TrueVal, m_Zero())) {
+    Pred = ICmpInst::getInversePredicate(Pred);
+    std::swap(TrueVal, FalseVal);
+  }
+
+  if (!match(FalseVal, m_Zero()))
+    return nullptr;
+
+  // If sub is used anywhere else, transformation will duplicate the instruction
+  if (!TrueVal->hasOneUse())
+    return nullptr;
+
+  if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
+    // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
+    std::swap(CmpLHS, CmpRHS);
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
+         "Unexpected isUnsigned predicate!");
+
+  bool IsNegative = false;
+
+  // Notice negative form of psubus ((a > b) ? b - a : 0)
+  if (match(TrueVal, m_Sub(m_Specific(CmpRHS), m_Specific(CmpLHS)))) {
+    IsNegative = true;
+    // The psubus should be (a - b) after transform.
+    std::swap(CmpLHS, CmpRHS);
+  } else if (!match(TrueVal, m_Sub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
+    return nullptr;
+
+  // All check passed, convert to cannonical unsigned saturated
+  // subtraction form sub(max())
+  // (a > b) ? a - b : 0 -> ((a > b) ? a : b) - b)
+  Value *Sel = Builder.CreateSelect(Builder.CreateICmp(Pred, CmpLHS, CmpRHS),
+                                    CmpLHS, CmpRHS);
+  Value *Sub = Builder.CreateSub(Sel, CmpRHS);
+  return IsNegative ? Builder.CreateNeg(Sub) : Sub;
+}
+
 /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
 /// call to cttz/ctlz with flag 'is_zero_undef' cleared.
 ///
@@ -848,10 +903,12 @@
   if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
     return replaceInstUsesWith(SI, V);
 
+  if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
+    return replaceInstUsesWith(SI, V);
+
   return Changed ? &SI : nullptr;
 }
 
-
 /// SI is a select whose condition is a PHI node (but the two may be in
 /// different blocks). See if the true/false values (V) are live in all of the
 /// predecessor blocks of the PHI. For example, cases like this can't be mapped:
Index: test/Transforms/InstCombine/unsigned_saturated_sub.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/unsigned_saturated_sub.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Canonicalization of unsigned saturated subtraction idiom is tested here.
+; The backend recognition of pattern test is in test/CodeGen/X86/psubus.ll.
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define <4 x i32> @max_sub_ugt_128(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: @max_sub_ugt_128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt <4 x i32> [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[LHS]], <4 x i32> [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+entry:
+  %cmp = icmp ugt <4 x i32> %lhs, %rhs
+  %sub = sub <4 x i32> %lhs, %rhs
+  %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> zeroinitializer
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @max_sub_ugt_256(<4 x i64> %lhs, <4 x i64> %rhs) {
+; CHECK-LABEL: @max_sub_ugt_256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt <4 x i64> [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[LHS]], <4 x i64> [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i64> [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+entry:
+  %cmp = icmp ugt <4 x i64> %lhs, %rhs
+  %sub = sub <4 x i64> %lhs, %rhs
+  %sel = select <4 x i1> %cmp, <4 x i64> %sub, <4 x i64> zeroinitializer
+  ret <4 x i64> %sel
+}
+
+define <8 x i64> @max_sub_ugt_512(<8 x i64> %lhs, <8 x i64> %rhs) {
+; CHECK-LABEL: @max_sub_ugt_512(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i64> [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select <8 x i1> [[TMP0]], <8 x i64> [[LHS]], <8 x i64> [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i64> [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP2]]
+;
+entry:
+  %cmp = icmp ugt <8 x i64> %lhs, %rhs
+  %sub = sub <8 x i64> %lhs, %rhs
+  %sel = select <8 x i1> %cmp, <8 x i64> %sub, <8 x i64> zeroinitializer
+  ret <8 x i64> %sel
+}
+
+define i64 @max_sub_ugt(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: @max_sub_ugt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[LHS]], i64 [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry:
+  %cmp = icmp ugt i64 %lhs, %rhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 %sub ,i64 0
+  ret i64 %sel
+}
+
+define i64 @max_sub_uge(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: @max_sub_uge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[RHS]], i64 [[LHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry:
+  %cmp = icmp uge i64 %lhs, %rhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 %sub ,i64 0
+  ret i64 %sel
+}
+
+define i64 @max_sub_ult(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: @max_sub_ult(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[RHS]], i64 [[LHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry:
+  %cmp = icmp ult i64 %lhs, %rhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 0, i64 %sub
+  ret i64 %sel
+}
+
+define i64 @max_sub_ule(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: @max_sub_ule(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[LHS]], i64 [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[RHS]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry:
+  %cmp = icmp ule i64 %lhs, %rhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 0, i64 %sub
+  ret i64 %sel
+}
+
+; a - sub(max) case
+define i64 @sub_neg_max(i64 %lhs, i64 %rhs, i64 %sb) {
+; CHECK-LABEL: @sub_neg_max(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[LHS]], i64 [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[RHS]]
+; CHECK-NEXT:    [[SUB2:%.*]] = add i64 [[TMP2]], [[SB:%.*]]
+; CHECK-NEXT:    ret i64 [[SUB2]]
+;
+entry:
+  %cmp = icmp ule i64 %rhs, %lhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 0, i64 %sub
+  %sub2 = sub i64 %sb, %sel
+  ret i64 %sub2
+}
+
+; a + sub(max) case
+define i64 @add_neg_max(i64 %lhs, i64 %rhs, i64 %sb) {
+; CHECK-LABEL: @add_neg_max(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[LHS:%.*]], [[RHS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[LHS]], i64 [[RHS]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[RHS]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[TMP2]], [[SB:%.*]]
+; CHECK-NEXT:    ret i64 [[ADD]]
+;
+entry:
+  %cmp = icmp ule i64 %rhs, %lhs
+  %sub = sub i64 %lhs, %rhs
+  %sel = select i1 %cmp, i64 0, i64 %sub
+  %add = add i64 %sb, %sel
+  ret i64 %add
+}