Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -328,6 +328,7 @@
   bool NoInfs : 1;
   bool NoSignedZeros : 1;
   bool AllowReciprocal : 1;
+  bool Reduction : 1;
 
 public:
   /// Default constructor turns off all optimization flags.
@@ -340,6 +341,7 @@
     NoInfs = false;
     NoSignedZeros = false;
     AllowReciprocal = false;
+    Reduction = false;
   }
 
   // These are mutators for each flag.
@@ -351,6 +353,7 @@
   void setNoInfs(bool b) { NoInfs = b; }
   void setNoSignedZeros(bool b) { NoSignedZeros = b; }
   void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setReduction(bool b) { Reduction = b; }
 
   // These are accessors for each flag.
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@@ -361,6 +364,7 @@
   bool hasNoInfs() const { return NoInfs; }
   bool hasNoSignedZeros() const { return NoSignedZeros; }
   bool hasAllowReciprocal() const { return AllowReciprocal; }
+  bool hasReduction() const { return Reduction; }
 
   /// Return a raw encoding of the flags.
   /// This function should only be used to add data to the NodeID value.
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2322,6 +2322,7 @@
   bool nuw = false;
   bool nsw = false;
   bool exact = false;
+  bool reduction = false;
   FastMathFlags FMF;
 
   if (const OverflowingBinaryOperator *OFBinOp =
@@ -2335,10 +2336,41 @@
   if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
     FMF = FPOp->getFastMathFlags();
 
+  // Check if this binary operation performs a reduction.
+  // FIXME: Support more reduction operations for more pattern recognitions.
+  if (OpCode == ISD::ADD) {
+    // Check if this binary op is a reduction.
+    auto IsReductionPHI = [](const Value *V) {
+      const PHINode *PN = dyn_cast<PHINode>(V);
+      return PN && PN->getMetadata("llvm.loop.vectorize.reduction");
+    };
+
+    // This lambda checks if an instruction is a reduction one. It is a
+    // reduction if one of its operands is either a reduction PHI or another
+    // reduction operation.
+    std::function<bool(const User *)> IsReductionOp = [&](const User *I) {
+      if (I == nullptr)
+        return false;
+      const BinaryOperator *BOp = dyn_cast<BinaryOperator>(I);
+      if (BOp == nullptr)
+        return false;
+      Instruction::BinaryOps OpCode = BOp->getOpcode();
+      if (OpCode != Instruction::Add)
+        return false;
+      return IsReductionPHI(I->getOperand(0)) ||
+             IsReductionPHI(I->getOperand(1)) ||
+             IsReductionOp(dyn_cast<User>(I->getOperand(0))) ||
+             IsReductionOp(dyn_cast<User>(I->getOperand(1)));
+    };
+
+    reduction = IsReductionOp(&I);
+  }
+
   SDNodeFlags Flags;
   Flags.setExact(exact);
   Flags.setNoSignedWrap(nsw);
   Flags.setNoUnsignedWrap(nuw);
+  Flags.setReduction(reduction);
   if (EnableFMFInDAG) {
     Flags.setAllowReciprocal(FMF.allowReciprocal());
     Flags.setNoInfs(FMF.noInfs());
@@ -2346,8 +2378,8 @@
     Flags.setNoSignedZeros(FMF.noSignedZeros());
     Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
   }
-  SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
-                                     Op1, Op2, &Flags);
+  SDValue BinNodeValue =
+      DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, &Flags);
   setValue(&I, BinNodeValue);
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26926,9 +26926,163 @@
                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
 }
 
+// Check if the given SDValue is a constant vector with all Val.
+static bool isConstVectorOf(SDValue V, int Val) {
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+  if (!BV || !BV->isConstant())
+    return false;
+  auto NumOperands = V.getNumOperands();
+  for (unsigned i = 0; i < NumOperands; i++) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+    if (!C || C->getSExtValue() != Val)
+      return false;
+  }
+  return true;
+}
+
+static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (!(VT.getVectorElementType() == MVT::i32 && isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  unsigned RegSize = 128;
+  if (Subtarget->hasAVX512())
+    RegSize = 512;
+  else if (Subtarget->hasAVX2())
+    RegSize = 256;
+
+  if (VT.getSizeInBits() > RegSize)
+    return SDValue();
+
+  // Detect the following pattern:
+  //
+  // 1:    %2 = zext <N x i8> %0 to <N x i32>
+  // 2:    %3 = zext <N x i8> %1 to <N x i32>
+  // 3:    %4 = sub nsw <N x i32> %2, %3
+  // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+  // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
+  // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+  // 7:    %8 = add nsw <N x i32> %7, %vec.phi
+  //
+  // The last instruction must be a reduction add. The instructions 3-6 forms an
+  // ABSDIFF pattern.
+
+  // The two operands of reduction add are from PHI and a select-op as in line 7
+  // above.
+  SDValue SelectOp, Phi;
+  if (Op0.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op0;
+    Phi = Op1;
+  } else if (Op1.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op1;
+    Phi = Op0;
+  } else
+    return SDValue();
+
+  // Check the condition of the select instruction is greater-than.
+  SDValue SetCC = SelectOp->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC)
+    return SDValue();
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  if (CC != ISD::SETGT)
+    return SDValue();
+
+  Op0 = SelectOp->getOperand(1);
+  Op1 = SelectOp->getOperand(2);
+
+  // The second operand of SelectOp Op1 is the negation of the first operand
+  // Op0, which is implementes as 0 - Op0.
+  if (!(Op1.getOpcode() == ISD::SUB && isConstVectorOf(Op1.getOperand(0), 0) &&
+        Op1.getOperand(1) == Op0))
+    return SDValue();
+
+  // The first operand of SetCC is the first operand of SelectOp, which is the
+  // difference between two input vectors.
+  if (SetCC.getOperand(0) != Op0)
+    return SDValue();
+
+  // The second operand of > comparison can be either -1 or 0.
+  if (!(isConstVectorOf(SetCC.getOperand(1), 0) ||
+        isConstVectorOf(SetCC.getOperand(1), -1)))
+    return SDValue();
+
+  // The first operand of SelectOp is the difference between two input vectors.
+  if (Op0.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  Op1 = Op0.getOperand(1);
+  Op0 = Op0.getOperand(0);
+
+  // Check if the operands of the diff are zero-extended from vectors of i8.
+  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+      Op1.getOpcode() != ISD::ZERO_EXTEND ||
+      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+    return SDValue();
+
+  // SAD pattern detected. Now build a SAD instruction and an addition for
+  // reduction. Note that the number of elments of the result of SAD is less
+  // than the number of elements of its input. Therefore, we could only update
+  // part of elements in the reduction vector.
+
+  // Legalize the type of the inputs of PSADBW.
+  EVT InVT = Op0.getOperand(0).getValueType();
+  if (InVT.getSizeInBits() <= 128)
+    RegSize = 128;
+  else if (InVT.getSizeInBits() <= 256)
+    RegSize = 256;
+
+  unsigned NumConcat =
+      RegSize / Op0.getOperand(0).getValueType().getSizeInBits();
+  SmallVector<SDValue, 16> Ops(
+      NumConcat, DAG.getConstant(0, DL, Op0.getOperand(0).getValueType()));
+  Ops[0] = Op0.getOperand(0);
+  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+  Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+  Ops[0] = Op1.getOperand(0);
+  Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+  // The output of PSADBW is a vector of i64.
+  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+  SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
+
+  // We need to turn the vector of i64 into a vector of i32.
+  MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+  Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+
+  NumConcat = VT.getSizeInBits() / ResVT.getSizeInBits();
+  if (NumConcat > 1) {
+    // Update part of elements of the reduction vector. This is done by first
+    // extracting a sub-vector from it, updating this sub-vector, and inserting
+    // it back.
+    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
+                                 DAG.getIntPtrConstant(0, DL));
+    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
+                       DAG.getIntPtrConstant(0, DL));
+  } else
+    return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+}
+
 /// PerformADDCombine - Do target-specific dag combines on integer adds.
 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
+  const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+  if (Flags->hasReduction()) {
+    SDValue Sad = detectSADPattern(N, DAG, Subtarget);
+    if (Sad.getNode())
+      return Sad;
+  }
+
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4066,6 +4066,10 @@
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
+                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
+defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
+                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 
 let Predicates = [HasAVX] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3618,8 +3618,11 @@
       // This is phase one of vectorizing PHIs.
       Type *VecTy = (VF == 1) ? PN->getType() :
       VectorType::get(PN->getType(), VF);
-      Entry[part] = PHINode::Create(
+      PHINode *PHI = PHINode::Create(
           VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
+      MDNode *MD = MDNode::get(PHI->getContext(), None);
+      PHI->setMetadata("llvm.loop.vectorize.reduction", MD);
+      Entry[part] = PHI;
     }
     PV->push_back(P);
     return;
Index: test/CodeGen/X86/sad.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/sad.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=AVX512BW
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=SSE2-UNROLL
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=AVX2-UNROLL
+; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=AVX512BW-UNROLL
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @sad() {
+; SSE2-LABEL: sad
+; SSE2: psadbw %xmm1, %xmm2
+; SSE2: paddd %xmm2, %xmm0
+;
+; AVX2-LABEL: sad
+; AVX2: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX2: vpaddd %xmm0, %xmm1, %xmm1
+;
+; AVX512BW-LABEL: sad
+; AVX512BW: vpsadbw {{.*}}, %xmm1, %xmm1
+; AVX512BW: vpaddd %xmm0, %xmm1, %xmm1
+;
+; SSE2-UNROLL-LABEL: sad
+; SSE2-UNROLL: psadbw %xmm1, %xmm2
+; SSE2-UNROLL: paddd %xmm0, %xmm2
+; SSE2-UNROLL: psadbw %xmm1, %xmm0
+; SSE2-UNROLL: paddd %xmm2, %xmm0
+;
+; AVX2-UNROLL-LABEL: sad
+; AVX2-UNROLL: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX2-UNROLL: vpaddd %xmm0, %xmm1, %xmm1
+; AVX2-UNROLL: vpsadbw %xmm2, %xmm3, %xmm2
+; AVX2-UNROLL: vpaddd %xmm1, %xmm2, %xmm1
+;
+; AVX512BW-UNROLL-LABEL: sad
+; AVX512BW-UNROLL: vpsadbw {{.*}}, %xmm1, %xmm1
+; AVX512BW-UNROLL: vpaddd %xmm0, %xmm1, %xmm1
+; AVX512BW-UNROLL: vpsadbw {{.*}}, %xmm1, %xmm1
+; AVX512BW-UNROLL: vpaddd %xmm0, %xmm1, %xmm1
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: test/Transforms/LoopVectorize/vectorize-once.ll
===================================================================
--- test/Transforms/LoopVectorize/vectorize-once.ll
+++ test/Transforms/LoopVectorize/vectorize-once.ll
@@ -68,11 +68,11 @@
 
 attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
 
-; CHECK: !0 = distinct !{!0, !1, !2}
-; CHECK: !1 = !{!"llvm.loop.vectorize.width", i32 1}
-; CHECK: !2 = !{!"llvm.loop.interleave.count", i32 1}
-; CHECK: !3 = distinct !{!3, !4, !1, !2}
-; CHECK: !4 = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: !1 = distinct !{!1, !2, !3}
+; CHECK: !2 = !{!"llvm.loop.vectorize.width", i32 1}
+; CHECK: !3 = !{!"llvm.loop.interleave.count", i32 1}
+; CHECK: !4 = distinct !{!4, !5, !2, !3}
+; CHECK: !5 = !{!"llvm.loop.unroll.runtime.disable"}
 
 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.width", i32 1}