Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -328,6 +328,7 @@
   bool NoInfs : 1;
   bool NoSignedZeros : 1;
   bool AllowReciprocal : 1;
+  bool Reduction : 1;
 
 public:
   /// Default constructor turns off all optimization flags.
@@ -340,6 +341,7 @@
     NoInfs = false;
     NoSignedZeros = false;
     AllowReciprocal = false;
+    Reduction = false;
   }
 
   // These are mutators for each flag.
@@ -351,6 +353,7 @@
   void setNoInfs(bool b) { NoInfs = b; }
   void setNoSignedZeros(bool b) { NoSignedZeros = b; }
   void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setReduction(bool b) { Reduction = b; }
 
   // These are accessors for each flag.
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@@ -361,6 +364,7 @@
   bool hasNoInfs() const { return NoInfs; }
   bool hasNoSignedZeros() const { return NoSignedZeros; }
   bool hasAllowReciprocal() const { return AllowReciprocal; }
+  bool hasReduction() const { return Reduction; }
 
   /// Return a raw encoding of the flags.
   /// This function should only be used to add data to the NodeID value.
Index: include/llvm/IR/Instructions.h
===================================================================
--- include/llvm/IR/Instructions.h
+++ include/llvm/IR/Instructions.h
@@ -2399,9 +2399,13 @@
 //
 class PHINode : public Instruction {
   void *operator new(size_t, unsigned) = delete;
+
   /// ReservedSpace - The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
-  unsigned ReservedSpace;
+  unsigned ReservedSpace : 31;
+  /// IsReduction - Whether this is a reduction PHI (unsigned: packs on MSVC).
+  unsigned IsReduction : 1;
+
   PHINode(const PHINode &PN);
   // allocate space for exactly zero operands
   void *operator new(size_t s) {
@@ -2409,17 +2413,17 @@
   }
   explicit PHINode(Type *Ty, unsigned NumReservedValues,
                    const Twine &NameStr = "",
-                   Instruction *InsertBefore = nullptr)
+                   Instruction *InsertBefore = nullptr, bool IsRdx = false)
     : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore),
-      ReservedSpace(NumReservedValues) {
+      ReservedSpace(NumReservedValues), IsReduction(IsRdx) {
     setName(NameStr);
     allocHungoffUses(ReservedSpace);
   }
 
   PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr,
-          BasicBlock *InsertAtEnd)
+          BasicBlock *InsertAtEnd, bool IsRdx = false)
     : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertAtEnd),
-      ReservedSpace(NumReservedValues) {
+      ReservedSpace(NumReservedValues), IsReduction(IsRdx) {
     setName(NameStr);
     allocHungoffUses(ReservedSpace);
   }
@@ -2441,12 +2445,14 @@
   /// edges that this phi node will have (use 0 if you really have no idea).
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
                          const Twine &NameStr = "",
-                         Instruction *InsertBefore = nullptr) {
-    return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
+                         Instruction *InsertBefore = nullptr,
+                         bool IsRdx = false) {
+    return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore, IsRdx);
   }
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
-                         const Twine &NameStr, BasicBlock *InsertAtEnd) {
-    return new PHINode(Ty, NumReservedValues, NameStr, InsertAtEnd);
+                         const Twine &NameStr, BasicBlock *InsertAtEnd,
+                         bool IsRdx = false) {
+    return new PHINode(Ty, NumReservedValues, NameStr, InsertAtEnd, IsRdx);
   }
 
   /// Provide fast operand accessors
@@ -2577,6 +2583,8 @@
   /// same value, return the value, otherwise return null.
   Value *hasConstantValue() const;
 
+  bool isReduction() const { return IsReduction; }
+
   /// Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::PHI;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2317,6 +2317,7 @@
   bool nuw = false;
   bool nsw = false;
   bool exact = false;
+  bool reduction = false;
   FastMathFlags FMF;
 
   if (const OverflowingBinaryOperator *OFBinOp =
@@ -2330,10 +2331,20 @@
   if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
     FMF = FPOp->getFastMathFlags();
 
+  // Check if this binary op is a reduction.
+  {
+    const PHINode *PN;
+    if ((PN = dyn_cast<PHINode>(I.getOperand(0))))
+      reduction = PN->isReduction();
+    if (!reduction && (PN = dyn_cast<PHINode>(I.getOperand(1))))
+      reduction = PN->isReduction();
+  }
+
   SDNodeFlags Flags;
   Flags.setExact(exact);
   Flags.setNoSignedWrap(nsw);
   Flags.setNoUnsignedWrap(nuw);
+  Flags.setReduction(reduction);
   if (EnableFMFInDAG) {
     Flags.setAllowReciprocal(FMF.allowReciprocal());
     Flags.setNoInfs(FMF.noInfs());
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26621,9 +26621,122 @@
                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
 }
 
+/// Try to rewrite a reduction add of a sum-of-absolute-differences pattern on
+/// zero-extended i8 vectors into X86ISD::PSADBW followed by an add.
+///
+/// \p N is the ISD::ADD node being combined; the caller only invokes this for
+/// adds carrying the Reduction flag. Returns the replacement value, or an
+/// empty SDValue when the pattern does not match.
+static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // PSADBW needs SSE2. The replacement below bitcasts the PSADBW result to
+  // v4i32 and adds it to the phi, so any other result type would build an ADD
+  // whose operand types disagree with VT.
+  // TODO: Pad the v4i32 partial sums with zeros to support wider result types.
+  if (!Subtarget->hasSSE2() || VT != MVT::v4i32)
+    return SDValue();
+
+  // Detect the following pattern:
+  //
+  // 1:    %2 = zext <4 x i8> %0 to <4 x i32>
+  // 2:    %3 = zext <4 x i8> %1 to <4 x i32>
+  // 3:    %4 = sub nsw <4 x i32> %2, %3
+  // 4:    %5 = icmp sgt <4 x i32> %4, zeroinitializer
+  // 5:    %6 = sub nsw <4 x i32> zeroinitializer, %4
+  // 6:    %7 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> %6
+  // 7:    %8 = add nsw <4 x i32> %7, %vec.phi
+  //
+  // The last instruction must be a reduction add. Instructions 3-6 form an
+  // ABSDIFF pattern.
+
+  SDValue SelectOp, Phi;
+  if (Op0.getOpcode() == ISD::CopyFromReg && Op1.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op1;
+    Phi = Op0;
+  } else if (Op1.getOpcode() == ISD::CopyFromReg &&
+             Op0.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op0;
+    Phi = Op1;
+  } else
+    return SDValue();
+
+  SDValue SetCC = SelectOp->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC)
+    return SDValue();
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  if (CC != ISD::SETGT)
+    return SDValue();
+
+  Op0 = SelectOp->getOperand(1);
+  Op1 = SelectOp->getOperand(2);
+  if (Op0.getOpcode() != ISD::SUB || Op1.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  // The compare must test the difference, and the false arm must negate it.
+  if (SetCC.getOperand(0) != Op0 || Op1.getOperand(1) != Op0)
+    return SDValue();
+
+  // A lambda checking the given SDValue is a constant vector with all zeros.
+  auto IsConstVectorOfZeros = [](SDValue V) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C || !C->isNullValue())
+        return false;
+    }
+    return true;
+  };
+
+  if (!IsConstVectorOfZeros(SetCC.getOperand(1)) ||
+      !IsConstVectorOfZeros(Op1.getOperand(0)))
+    return SDValue();
+
+  Op1 = Op0.getOperand(1);
+  Op0 = Op0.getOperand(0);
+
+  // Check the opcodes before touching the operands: an arbitrary node need not
+  // have a vector-typed (or any) operand 0.
+  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+      Op1.getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  EVT SrcVT = Op0.getOperand(0).getValueType();
+  if (SrcVT != Op1.getOperand(0).getValueType() ||
+      SrcVT.getVectorElementType() != MVT::i8)
+    return SDValue();
+
+  // Pad both i8 inputs with zeros up to the 128 bits PSADBW operates on.
+  unsigned NumConcat = 128 / SrcVT.getSizeInBits();
+  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
+  Ops[0] = Op0.getOperand(0);
+  Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Ops);
+  Ops[0] = Op1.getOperand(0);
+  Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Ops);
+
+  // PSADBW yields one partial sum per 64-bit half. Adding those to the phi
+  // lanes preserves the sum over all lanes, which is all a reduction needs.
+  SDValue SAD = DAG.getNode(X86ISD::PSADBW, DL, MVT::v16i8, Op0, Op1);
+  SAD = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, SAD);
+  return DAG.getNode(ISD::ADD, DL, VT, SAD, Phi);
+}
+
 /// PerformADDCombine - Do target-specific dag combines on integer adds.
 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
+  const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+  if (Flags->hasReduction()) {
+    SDValue SAD = detectSADPattern(N, DAG, Subtarget);
+    if (SAD.getNode())
+      return SAD;
+  }
+
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4062,6 +4062,8 @@
                              SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                              SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PSADBW  : PDI_binop_all<0xF6, "psadbw", X86psadbw, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 
 // Intrinsic forms
 defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
@@ -4082,8 +4084,6 @@
                                  int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
 defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                  int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
-defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
-                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
 let Predicates = [HasAVX2] in
   def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -278,6 +278,7 @@
   X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
   X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
   X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
@@ -1694,6 +1695,7 @@
   X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(sse2_pmulhu_w,     INTR_TYPE_2OP, ISD::MULHU, 0),
   X86_INTRINSIC_DATA(sse2_pmulu_dq,     INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(sse2_psad_bw,      INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(sse2_pshuf_d,      INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
   X86_INTRINSIC_DATA(sse2_pshufh_w,     INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
   X86_INTRINSIC_DATA(sse2_pshufl_w,     INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3612,7 +3612,8 @@
       Type *VecTy = (VF == 1) ? PN->getType() :
       VectorType::get(PN->getType(), VF);
       Entry[part] = PHINode::Create(
-          VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
+          VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt(),
+          true /* IsReduction */);
     }
     PV->push_back(P);
     return;