Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2650,6 +2650,32 @@
       const SmallVectorImpl<CCValAssign> &ArgLocs,
       const SmallVectorImpl<SDValue> &OutVals) const;
 
+  /// Return true if \p Extract is the root of a basic sum-of-absolute-
+  /// differences pattern: an extract of element 0 from a shuffle + add
+  /// reduction pyramid whose input (optionally behind one more extend) is a
+  /// VSELECT accepted by detectExtAbsDiff. On success \p Zext0 and \p Zext1
+  /// are set to the two extend nodes feeding the subtraction.
+  static bool isBasicSADPattern(SelectionDAG &DAG, SDNode *Extract,
+                                SDValue &Zext0, SDValue &Zext1,
+                                ArrayRef<EVT> CandidateDataTypes,
+                                ArrayRef<ISD::NodeType> CandidateExtOps);
+
+  /// Return true if \p Select computes abs(ext(a) - ext(b)) written as
+  /// select(setcc(d, C, gt/lt), d, 0 - d), where both operands of the
+  /// subtraction use the same opcode from \p CandidateExtOps and extend from
+  /// an element type in \p CandidateDataTypes. On success \p Op0 and \p Op1
+  /// hold the two operands of the subtraction.
+  static bool detectExtAbsDiff(const SDValue &Select, SDValue &Op0,
+                               SDValue &Op1, ArrayRef<EVT> CandidateDataTypes,
+                               ArrayRef<ISD::NodeType> CandidateExtOps);
+
+  /// Match a binop + shuffle pyramid ending in an extract of element 0, i.e.
+  /// a horizontal reduction. Returns the vector being reduced and sets
+  /// \p BinOp to the matched opcode (one of \p CandidateBinOps), or returns
+  /// SDValue() if no reduction was matched.
+  static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
+                                     ArrayRef<ISD::NodeType> CandidateBinOps);
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4291,3 +4291,182 @@
   }
   return SDValue();
 }
+
+// Recognize a horizontal reduction expressed as a pyramid of binops fed by
+// shuffles and terminated by an extract of element 0.
+// On success the vector being reduced is returned and BinOp receives the
+// matched opcode; otherwise SDValue() is returned and BinOp is left alone.
+SDValue
+TargetLowering::matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
+                                    ArrayRef<ISD::NodeType> CandidateBinOps) {
+  // Only an extract of lane 0 can sit at the tip of the pyramid.
+  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  if (!isNullConstant(Extract->getOperand(1)))
+    return SDValue();
+
+  SDValue Cur = Extract->getOperand(0);
+  unsigned NumStages = Log2_32(Cur.getValueType().getVectorNumElements());
+
+  // The opcode feeding the extract must be one the caller asked for.
+  const unsigned ReduceOpc = Cur.getOpcode();
+  if (llvm::none_of(CandidateBinOps, [ReduceOpc](ISD::NodeType Candidate) {
+        return ReduceOpc == unsigned(Candidate);
+      }))
+    return SDValue();
+
+  // Walking from the extract towards the input, each stage must look like:
+  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+  //                               i32 undef, i32 undef, i32 undef, i32 undef>
+  // %a = binop <8 x i32> %op, %s
+  // with a mask that depends on the stage. For a 3-stage pyramid the masks
+  // are, listed from the input down to the extract:
+  // <4,5,6,7,u,u,u,u>
+  // <2,3,u,u,u,u,u,u>
+  // <1,u,u,u,u,u,u,u>
+  for (unsigned Stage = 0; Stage != NumStages; ++Stage) {
+    if (Cur.getOpcode() != ReduceOpc)
+      return SDValue();
+
+    // The shuffle may be either operand; the other one is the next level.
+    const SDValue LHS = Cur.getOperand(0);
+    const SDValue RHS = Cur.getOperand(1);
+    auto *Shuf = dyn_cast<ShuffleVectorSDNode>(LHS.getNode());
+    Cur = Shuf ? RHS : LHS;
+    if (!Shuf)
+      Shuf = dyn_cast<ShuffleVectorSDNode>(RHS.getNode());
+
+    // The shuffle has to permute that same next level.
+    if (!Shuf || Shuf->getOperand(0) != Cur)
+      return SDValue();
+
+    // Lanes [0, Width) must read lanes [Width, 2 * Width) at this stage.
+    const int Width = 1 << Stage;
+    for (int Lane = 0; Lane != Width; ++Lane)
+      if (Shuf->getMaskElt(Lane) != Width + Lane)
+        return SDValue();
+  }
+
+  BinOp = ReduceOpc;
+  return Cur;
+}
+
+// Given a select, detect the following pattern, where zext and i8 stand for
+// one opcode from CandidateExtOps and one type from CandidateDataTypes:
+// 1:    %2 = zext <N x i8> %0 to <N x i32>
+// 2:    %3 = zext <N x i8> %1 to <N x i32>
+// 3:    %4 = sub nsw <N x i32> %2, %3
+// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
+// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// On a match Op0/Op1 hold %2/%3. This is the input into a SAD pattern.
+bool
+TargetLowering::detectExtAbsDiff(const SDValue &Select, SDValue &Op0,
+                                 SDValue &Op1,
+                                 ArrayRef<EVT> CandidateDataTypes,
+                                 ArrayRef<ISD::NodeType> CandidateExtOps) {
+  // Check the condition of the select is a greater-than or less-than setcc.
+  SDValue SetCC = Select->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC)
+    return false;
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
+    return false;
+
+  SDValue SelectOp1 = Select->getOperand(1);
+  SDValue SelectOp2 = Select->getOperand(2);
+
+  // The checks below assume SelectOp1 is the difference and SelectOp2 its
+  // negation; for SETLT the select operands are the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
+  // The second operand of the select should be the negation of the first
+  // operand, which is implemented as 0 - SelectOp1.
+  if (!(SelectOp2.getOpcode() == ISD::SUB &&
+        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
+        SelectOp2.getOperand(1) == SelectOp1))
+    return false;
+
+  // The first operand of SetCC is the first operand of the select, which is the
+  // difference between the two input vectors.
+  if (SetCC.getOperand(0) != SelectOp1)
+    return false;
+
+  // In SetLT case, The second operand of the comparison can be either 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal.isOneValue()) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In SetGT case, The second operand of the comparison can be either -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+    return false;
+
+  // The first operand of the select is the difference between the two input
+  // vectors.
+  if (SelectOp1.getOpcode() != ISD::SUB)
+    return false;
+
+  Op0 = SelectOp1.getOperand(0);
+  Op1 = SelectOp1.getOperand(1);
+
+  // Both inputs must come from the same candidate extension. Check this
+  // before reading operand 0, which only exists once they are extends.
+  if (Op0.getOpcode() != Op1.getOpcode() ||
+      llvm::none_of(CandidateExtOps, [Op0](ISD::NodeType ExtOp) {
+        return Op0.getOpcode() == unsigned(ExtOp);
+      }))
+    return false;
+
+  // The inputs must be extended from the same type, and its element type
+  // must be one of the candidate data types.
+  EVT SrcVT = Op0.getOperand(0).getValueType();
+  if (SrcVT != Op1.getOperand(0).getValueType() ||
+      llvm::none_of(CandidateDataTypes, [SrcVT](EVT DT) {
+        return SrcVT.getVectorElementType() == DT;
+      }))
+    return false;
+
+  return true;
+}
+
+bool
+TargetLowering::isBasicSADPattern(SelectionDAG &DAG, SDNode *Extract,
+                                  SDValue &Zext0, SDValue &Zext1,
+                                  ArrayRef<EVT> CandidateDataTypes,
+                                  ArrayRef<ISD::NodeType> CandidateExtOps) {
+  // The extract must terminate a shuffle + add reduction pyramid.
+  unsigned ReduceOpc = 0;
+  SDValue AbsDiff = matchBinOpReduction(Extract, ReduceOpc, {ISD::ADD});
+  if (!AbsDiff)
+    return false;
+
+  // The inputs are extended by one of CandidateExtOps from one of
+  // CandidateDataTypes (verified in detectExtAbsDiff). To reach i64 and
+  // above, one more any/zero/sign extend is expected on top of the select.
+  // A zero extend from 32 bit has no mathematical effect on the result, and
+  // a sign extend behaves like a zero extend here because the sign bit it
+  // replicates is zero, so that extend can simply be looked through.
+  switch (AbsDiff.getOpcode()) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    AbsDiff = AbsDiff.getOperand(0);
+    break;
+  default:
+    break;
+  }
+
+  // What remains must be a vector select rooting an abs-diff pattern; the
+  // two extended inputs are reported through Zext0 and Zext1.
+  if (AbsDiff.getOpcode() != ISD::VSELECT)
+    return false;
+  return detectExtAbsDiff(AbsDiff, Zext0, Zext1, CandidateDataTypes,
+                          CandidateExtOps);
+}
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1023,6 +1023,8 @@
     setTargetDAGCombine(ISD::FSQRT);
   }
 
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
   // Darwin long double math library functions have $LDBL128 appended.
   if (Subtarget.isDarwin()) {
     setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -12090,6 +12092,107 @@
   return SDValue();
 }
 
+// Try to replace Extract, the end of a horizontal add reduction over
+// |ext(a) - ext(b)|, with a vector max/min/subtract sequence followed by
+// vector sum-across instructions. a and b must be v16i8 or v8i16 and both
+// sign- or both zero-extended.
+// Returns the replacement value, or SDValue() if the pattern does not apply.
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) {
+  // Currently, we support the SAD pattern only on ppc64le with Altivec.
+  if (!(Subtarget.hasAltivec() && Subtarget.isPPC64() &&
+        Subtarget.isLittleEndian()))
+    return SDValue();
+
+  // Verify the type we're extracting from is any integer type above i16.
+  EVT VT = Extract->getOperand(0).getValueType();
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
+    return SDValue();
+
+  // Only reductions over 16 or 8 elements are handled; together with the
+  // source type check below this means v16i8 or v8i16 inputs.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getVectorNumElements() != 16 && VT.getVectorNumElements() != 8)
+    return SDValue();
+
+  SDValue Zext0, Zext1;
+  if (!TargetLowering::isBasicSADPattern(DAG, Extract, Zext0, Zext1,
+                                         {MVT::i8, MVT::i16},
+                                         {ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}))
+    return SDValue();
+
+  // Bail out on unsupported source types before creating any node.
+  SDValue In0 = Zext0.getOperand(0);
+  SDValue In1 = Zext1.getOperand(0);
+  EVT SrcVT = In0.getValueType();
+  const bool IsByte = (SrcVT == MVT::v16i8);
+  if (!IsByte && SrcVT != MVT::v8i16)
+    return SDValue();
+
+  // |a - b| == max(a, b) - min(a, b). Computed modulo the element width, the
+  // difference is exact when read as an unsigned element.
+  const bool IsSigned = (Zext0.getOpcode() == ISD::SIGN_EXTEND);
+  unsigned MaxOpc, MinOpc;
+  if (IsByte) {
+    MaxOpc = IsSigned ? PPC::VMAXSB : PPC::VMAXUB;
+    MinOpc = IsSigned ? PPC::VMINSB : PPC::VMINUB;
+  } else {
+    MaxOpc = IsSigned ? PPC::VMAXSH : PPC::VMAXUH;
+    MinOpc = IsSigned ? PPC::VMINSH : PPC::VMINUH;
+  }
+  const unsigned SubOpc = IsByte ? PPC::VSUBUBM : PPC::VSUBUHM;
+
+  // Zero vector: second operand of the sums and zero source for VPERM.
+  SDLoc DL(Extract);
+  SDValue VZero = SDValue(DAG.getMachineNode(PPC::V_SET0, DL, MVT::v4i32), 0);
+  SDValue Max = SDValue(DAG.getMachineNode(MaxOpc, DL, SrcVT, In0, In1), 0);
+  SDValue Min = SDValue(DAG.getMachineNode(MinOpc, DL, SrcVT, In0, In1), 0);
+  SDValue Abs = SDValue(DAG.getMachineNode(SubOpc, DL, SrcVT, Max, Min), 0);
+
+  SDNode *Sum2Node;
+  if (IsByte) {
+    // Sum every group of four bytes into a word, then sum across the words.
+    SDNode *Sum1Node =
+        DAG.getMachineNode(PPC::VSUM4UBS, DL, MVT::v4i32, Abs, VZero);
+    Sum2Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                  SDValue(Sum1Node, 0), VZero);
+  } else {
+    // We cannot use VSUM4SHS since the absolute value in Abs is unsigned.
+    // So we create two zero-extended v4i32 vectors from the v8i16 vector
+    // and execute two VSUMSWS instructions.
+    SmallVector<SDValue, 16> Mask1, Mask2;
+    for (unsigned i = 0; i < 16; i++) {
+      // For bytes whose index has bit 1 set, Mask1 selects a byte of VZero
+      // and Mask2 clears the byte; the other bytes are taken from Abs.
+      const bool Clear = (i & 2) != 0;
+      Mask1.push_back(DAG.getConstant(Clear ? 0 : 29 - i, DL, MVT::i32));
+      Mask2.push_back(DAG.getConstant(Clear ? 0 : 0xFF, DL, MVT::i32));
+    }
+    SDValue VMask1 = DAG.getBuildVector(MVT::v16i8, DL, Mask1);
+    SDValue VMask2 = DAG.getBuildVector(MVT::v16i8, DL, Mask2);
+
+    // Split Abs into its even and odd halfwords, zero-extended to words.
+    SDNode *AbsOddNode =
+        DAG.getMachineNode(PPC::VPERM, DL, MVT::v8i16, VZero, Abs, VMask1);
+    SDNode *AbsEvenNode =
+        DAG.getMachineNode(PPC::VAND, DL, MVT::v8i16, Abs, VMask2);
+    SDNode *Sum1Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                          SDValue(AbsEvenNode, 0), VZero);
+    Sum2Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                  SDValue(AbsOddNode, 0),
+                                  SDValue(Sum1Node, 0));
+  }
+
+  // The sum is produced as an i32, but Extract has the element type of the
+  // reduced vector, which may be wider (e.g. i64). The node returned from a
+  // combine must have the type of the node it replaces, so zero-extend the
+  // (non-negative) sum as needed.
+  SDValue Sum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                            SDValue(Sum2Node, 0), Extract->getOperand(1));
+  return DAG.getZExtOrTrunc(Sum, DL, Extract->getValueType(0));
+}
+
 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
 // builtins) into loads with swaps.
 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
@@ -12928,8 +13031,12 @@
   }
   case ISD::BUILD_VECTOR:
     return DAGCombineBuildVector(N, DCI);
+
+  case ISD::EXTRACT_VECTOR_ELT:
+    return combineBasicSADPattern(N, DAG, Subtarget);
   }
 
+
   return SDValue();
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31184,136 +31184,6 @@
   return SDValue();
 }
 
-// Match a binop + shuffle pyramid that represents a horizontal reduction over
-// the elements of a vector.
-// Returns the vector that is being reduced on, or SDValue() if a reduction
-// was not matched.
-static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
-                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
-  // The pattern must end in an extract from index 0.
-  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
-      !isNullConstant(Extract->getOperand(1)))
-    return SDValue();
-
-  SDValue Op = Extract->getOperand(0);
-  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
-
-  // Match against one of the candidate binary ops.
-  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
-        return Op.getOpcode() == unsigned(BinOp);
-      }))
-    return SDValue();
-
-  // At each stage, we're looking for something that looks like:
-  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
-  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
-  //                               i32 undef, i32 undef, i32 undef, i32 undef>
-  // %a = binop <8 x i32> %op, %s
-  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
-  // we expect something like:
-  // <4,5,6,7,u,u,u,u>
-  // <2,3,u,u,u,u,u,u>
-  // <1,u,u,u,u,u,u,u>
-  unsigned CandidateBinOp = Op.getOpcode();
-  for (unsigned i = 0; i < Stages; ++i) {
-    if (Op.getOpcode() != CandidateBinOp)
-      return SDValue();
-
-    ShuffleVectorSDNode *Shuffle =
-        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
-    if (Shuffle) {
-      Op = Op.getOperand(1);
-    } else {
-      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
-      Op = Op.getOperand(0);
-    }
-
-    // The first operand of the shuffle should be the same as the other operand
-    // of the binop.
-    if (!Shuffle || Shuffle->getOperand(0) != Op)
-      return SDValue();
-
-    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
-    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
-      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
-        return SDValue();
-  }
-
-  BinOp = CandidateBinOp;
-  return Op;
-}
-
-// Given a select, detect the following pattern:
-// 1:    %2 = zext <N x i8> %0 to <N x i32>
-// 2:    %3 = zext <N x i8> %1 to <N x i32>
-// 3:    %4 = sub nsw <N x i32> %2, %3
-// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
-// This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
-                              SDValue &Op1) {
-  // Check the condition of the select instruction is greater-than.
-  SDValue SetCC = Select->getOperand(0);
-  if (SetCC.getOpcode() != ISD::SETCC)
-    return false;
-  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT && CC != ISD::SETLT)
-    return false;
-
-  SDValue SelectOp1 = Select->getOperand(1);
-  SDValue SelectOp2 = Select->getOperand(2);
-
-  // The following instructions assume SelectOp1 is the subtraction operand
-  // and SelectOp2 is the negation operand.
-  // In the case of SETLT this is the other way around.
-  if (CC == ISD::SETLT)
-    std::swap(SelectOp1, SelectOp2);
-
-  // The second operand of the select should be the negation of the first
-  // operand, which is implemented as 0 - SelectOp1.
-  if (!(SelectOp2.getOpcode() == ISD::SUB &&
-        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
-        SelectOp2.getOperand(1) == SelectOp1))
-    return false;
-
-  // The first operand of SetCC is the first operand of the select, which is the
-  // difference between the two input vectors.
-  if (SetCC.getOperand(0) != SelectOp1)
-    return false;
-
-  // In SetLT case, The second operand of the comparison can be either 1 or 0.
-  APInt SplatVal;
-  if ((CC == ISD::SETLT) &&
-      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
-         SplatVal.isOneValue()) ||
-        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
-    return false;
-
-  // In SetGT case, The second operand of the comparison can be either -1 or 0.
-  if ((CC == ISD::SETGT) &&
-      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
-        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
-    return false;
-
-  // The first operand of the select is the difference between the two input
-  // vectors.
-  if (SelectOp1.getOpcode() != ISD::SUB)
-    return false;
-
-  Op0 = SelectOp1.getOperand(0);
-  Op1 = SelectOp1.getOperand(1);
-
-  // Check if the operands of the sub are zero-extended from vectors of i8.
-  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
-      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
-      Op1.getOpcode() != ISD::ZERO_EXTEND ||
-      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
-    return false;
-
-  return true;
-}
-
 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
 // to these zexts.
 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
@@ -31358,7 +31228,7 @@
 
   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
   unsigned BinOp;
-  SDValue Src = matchBinOpReduction(
+  SDValue Src = TargetLowering::matchBinOpReduction(
       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
   if (!Src)
     return SDValue();
@@ -31438,7 +31308,7 @@
 
   // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
   unsigned BinOp = 0;
-  SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+  SDValue Match = TargetLowering::matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
   if (!Match)
     return SDValue();
 
@@ -31496,6 +31366,12 @@
   return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
 }
 
+static bool detectZextAbsDiff(const SDValue &SelectOp, SDValue &Op0,
+                              SDValue &Op1) {
+  const ISD::NodeType ZExt = ISD::ZERO_EXTEND; // matcher limited to zext'ed i8
+  return TargetLowering::detectExtAbsDiff(SelectOp, Op0, Op1, {MVT::i8}, ZExt);
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
@@ -31519,31 +31395,9 @@
   if (RegSize / VT.getVectorNumElements() < 8)
     return SDValue();
 
-  // Match shuffle + add pyramid.
-  unsigned BinOp = 0;
-  SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
-
-  // The operand is expected to be zero extended from i8
-  // (verified in detectZextAbsDiff).
-  // In order to convert to i64 and above, additional any/zero/sign
-  // extend is expected.
-  // The zero extend from 32 bit has no mathematical effect on the result.
-  // Also the sign extend is basically zero extend
-  // (extends the sign bit which is zero).
-  // So it is correct to skip the sign/zero extend instruction.
-  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
-    Root.getOpcode() == ISD::ZERO_EXTEND ||
-    Root.getOpcode() == ISD::ANY_EXTEND))
-    Root = Root.getOperand(0);
-
-  // If there was a match, we want Root to be a select that is the root of an
-  // abs-diff pattern.
-  if (!Root || (Root.getOpcode() != ISD::VSELECT))
-    return SDValue();
-
-  // Check whether we have an abs-diff pattern feeding into the select.
   SDValue Zext0, Zext1;
-  if (!detectZextAbsDiff(Root, Zext0, Zext1))
+  if (!TargetLowering::isBasicSADPattern(DAG, Extract, Zext0, Zext1, {MVT::i8},
+                                         {ISD::ZERO_EXTEND}))
     return SDValue();
 
   // Create the SAD instruction.
Index: test/CodeGen/PowerPC/ppc64_basicSAD.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/ppc64_basicSAD.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+define zeroext i32 @func8s(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
+; CHECK-LABEL: @func8s
+; CHECK-DAG: vminsb [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxsb [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsububm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK: vsum4ubs [[SUM1:[0-9]+]], [[ABS]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[SUM1]], [[ZERO]]
+; CHECK: mfvsrwz {{[0-9]+}}
+entry: ; sum of |a - b| over 16 sign-extended i8 lanes
+  %0 = bitcast i8* %pix1 to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = sext <16 x i8> %1 to <16 x i32>
+  %3 = bitcast i8* %pix2 to <16 x i8>*
+  %4 = load <16 x i8>, <16 x i8>* %3, align 1
+  %5 = sext <16 x i8> %4 to <16 x i32>
+  %6 = sub nsw <16 x i32> %2, %5 ; d = a - b
+  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <16 x i32> zeroinitializer, %6 ; -d
+  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 ; |d|
+  %rdx.shuf = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <16 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <16 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <16 x i32> %bin.rdx13, %rdx.shuf14
+  %rdx.shuf16 = shufflevector <16 x i32> %bin.rdx15, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx17 = add nsw <16 x i32> %bin.rdx15, %rdx.shuf16
+  %10 = extractelement <16 x i32> %bin.rdx17, i32 0 ; lane 0 holds the reduction result
+  ret i32 %10
+}
+
+define zeroext i32 @func8u(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
+; CHECK-LABEL: @func8u
+; CHECK-DAG: vminub [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxub [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsububm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK: vsum4ubs [[SUM1:[0-9]+]], [[ABS]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[SUM1]], [[ZERO]]
+; CHECK: mfvsrwz {{[0-9]+}}
+entry: ; sum of |a - b| over 16 zero-extended i8 lanes
+  %0 = bitcast i8* %pix1 to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = zext <16 x i8> %1 to <16 x i32>
+  %3 = bitcast i8* %pix2 to <16 x i8>*
+  %4 = load <16 x i8>, <16 x i8>* %3, align 1
+  %5 = zext <16 x i8> %4 to <16 x i32>
+  %6 = sub nsw <16 x i32> %2, %5 ; d = a - b
+  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <16 x i32> zeroinitializer, %6 ; -d
+  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 ; |d|
+  %rdx.shuf = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add <16 x i32> %bin.rdx13, %rdx.shuf14
+  %rdx.shuf16 = shufflevector <16 x i32> %bin.rdx15, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx17 = add <16 x i32> %bin.rdx15, %rdx.shuf16
+  %10 = extractelement <16 x i32> %bin.rdx17, i32 0 ; lane 0 holds the reduction result
+  ret i32 %10
+}
+
+
+define signext i32 @func16s(i16* nocapture readonly %pix1, i16* nocapture readonly %pix2) {
+; CHECK-LABEL: @func16s
+; CHECK-DAG: vminsh [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxsh [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsubuhm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK-DAG: vand [[EVEN:[0-9]+]], [[ABS]], {{[0-9]+}}
+; CHECK-DAG: vperm [[ODD:[0-9]+]], [[ZERO]], [[ABS]], {{[0-9]+}}
+; CHECK: vsumsws [[SUM1:[0-9]+]], [[EVEN]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[ODD]], [[SUM1]]
+; CHECK: mfvsrwz {{[0-9]+}}
+
+entry: ; sum of |a - b| over 8 sign-extended i16 lanes
+  %0 = bitcast i16* %pix1 to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = sext <8 x i16> %1 to <8 x i32>
+  %3 = bitcast i16* %pix2 to <8 x i16>*
+  %4 = load <8 x i16>, <8 x i16>* %3, align 2
+  %5 = sext <8 x i16> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5 ; d = a - b
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6 ; -d
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8 ; |d|
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
+  %10 = extractelement <8 x i32> %bin.rdx15, i32 0 ; lane 0 holds the reduction result
+  ret i32 %10
+}
+
+define signext i32 @func16u(i16* nocapture readonly %pix1, i16* nocapture readonly %pix2) {
+; CHECK-LABEL: @func16u
+; CHECK-DAG: vminuh [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxuh [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsubuhm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK-DAG: vand [[EVEN:[0-9]+]], [[ABS]], {{[0-9]+}}
+; CHECK-DAG: vperm [[ODD:[0-9]+]], [[ZERO]], [[ABS]], {{[0-9]+}}
+; CHECK: vsumsws [[SUM1:[0-9]+]], [[EVEN]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[ODD]], [[SUM1]]
+; CHECK: mfvsrwz {{[0-9]+}}
+
+entry: ; sum of |a - b| over 8 zero-extended i16 lanes
+  %0 = bitcast i16* %pix1 to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = zext <8 x i16> %1 to <8 x i32>
+  %3 = bitcast i16* %pix2 to <8 x i16>*
+  %4 = load <8 x i16>, <8 x i16>* %3, align 2
+  %5 = zext <8 x i16> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5 ; d = a - b
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6 ; -d
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8 ; |d|
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
+  %10 = extractelement <8 x i32> %bin.rdx15, i32 0 ; lane 0 holds the reduction result
+  ret i32 %10
+}