diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -368,7 +368,6 @@ bool NoInfs : 1; bool NoSignedZeros : 1; bool AllowReciprocal : 1; - bool VectorReduction : 1; bool AllowContract : 1; bool ApproximateFuncs : 1; bool AllowReassociation : 1; @@ -385,7 +384,7 @@ SDNodeFlags() : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false), NoInfs(false), - NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false), + NoSignedZeros(false), AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false), AllowReassociation(false), NoFPExcept(false) {} @@ -434,10 +433,6 @@ setDefined(); AllowReciprocal = b; } - void setVectorReduction(bool b) { - setDefined(); - VectorReduction = b; - } void setAllowContract(bool b) { setDefined(); AllowContract = b; @@ -463,7 +458,6 @@ bool hasNoInfs() const { return NoInfs; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } - bool hasVectorReduction() const { return VectorReduction; } bool hasAllowContract() const { return AllowContract; } bool hasApproximateFuncs() const { return ApproximateFuncs; } bool hasAllowReassociation() const { return AllowReassociation; } @@ -481,7 +475,6 @@ NoInfs &= Flags.NoInfs; NoSignedZeros &= Flags.NoSignedZeros; AllowReciprocal &= Flags.AllowReciprocal; - VectorReduction &= Flags.VectorReduction; AllowContract &= Flags.AllowContract; ApproximateFuncs &= Flags.ApproximateFuncs; AllowReassociation &= Flags.AllowReassociation; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -967,10 +967,6 @@ if (N0.getOpcode() != Opc) return SDValue(); - // Don't reassociate reductions. - if (N0->getFlags().hasVectorReduction()) - return SDValue(); - if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) @@ -995,9 +991,6 @@ SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags) { assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative."); - // Don't reassociate reductions. - if (Flags.hasVectorReduction()) - return SDValue(); // Floating-point reassociation is not allowed without loose FP math. if (N0.getValueType().isFloatingPoint() || diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2998,133 +2998,6 @@ visitBinary(I, ISD::FSUB); } -/// Checks if the given instruction performs a vector reduction, in which case -/// we have the freedom to alter the elements in the result as long as the -/// reduction of them stays unchanged. 
-static bool isVectorReductionOp(const User *I) { - const Instruction *Inst = dyn_cast(I); - if (!Inst || !Inst->getType()->isVectorTy()) - return false; - - auto OpCode = Inst->getOpcode(); - switch (OpCode) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - break; - case Instruction::FAdd: - case Instruction::FMul: - if (const FPMathOperator *FPOp = dyn_cast(Inst)) - if (FPOp->getFastMathFlags().isFast()) - break; - LLVM_FALLTHROUGH; - default: - return false; - } - - unsigned ElemNum = Inst->getType()->getVectorNumElements(); - // Ensure the reduction size is a power of 2. - if (!isPowerOf2_32(ElemNum)) - return false; - - unsigned ElemNumToReduce = ElemNum; - - // Do DFS search on the def-use chain from the given instruction. We only - // allow four kinds of operations during the search until we reach the - // instruction that extracts the first element from the vector: - // - // 1. The reduction operation of the same opcode as the given instruction. - // - // 2. PHI node. - // - // 3. ShuffleVector instruction together with a reduction operation that - // does a partial reduction. - // - // 4. ExtractElement that extracts the first element from the vector, and we - // stop searching the def-use chain here. - // - // 3 & 4 above perform a reduction on all elements of the vector. We push defs - // from 1-3 to the stack to continue the DFS. The given instruction is not - // a reduction operation if we meet any other instructions other than those - // listed above. - - SmallVector UsersToVisit{Inst}; - SmallPtrSet Visited; - bool ReduxExtracted = false; - - while (!UsersToVisit.empty()) { - auto User = UsersToVisit.back(); - UsersToVisit.pop_back(); - if (!Visited.insert(User).second) - continue; - - for (const auto *U : User->users()) { - auto Inst = dyn_cast(U); - if (!Inst) - return false; - - if (Inst->getOpcode() == OpCode || isa(U)) { - if (const FPMathOperator *FPOp = dyn_cast(Inst)) - if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) - return false; - UsersToVisit.push_back(U); - } else if (const ShuffleVectorInst *ShufInst = - dyn_cast(U)) { - // Detect the following pattern: A ShuffleVector instruction together - // with a reduction that do partial reduction on the first and second - // ElemNumToReduce / 2 elements, and store the result in - // ElemNumToReduce / 2 elements in another vector. - - unsigned ResultElements = ShufInst->getType()->getVectorNumElements(); - if (ResultElements < ElemNum) - return false; - - if (ElemNumToReduce == 1) - return false; - if (!isa(U->getOperand(1))) - return false; - for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) - if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) - return false; - for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) - if (ShufInst->getMaskValue(i) != -1) - return false; - - // There is only one user of this ShuffleVector instruction, which - // must be a reduction operation. - if (!U->hasOneUse()) - return false; - - auto U2 = dyn_cast(*U->user_begin()); - if (!U2 || U2->getOpcode() != OpCode) - return false; - - // Check operands of the reduction operation. - if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || - (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { - UsersToVisit.push_back(U2); - ElemNumToReduce /= 2; - } else - return false; - } else if (isa(U)) { - // At this moment we should have reduced all elements in the vector. 
- if (ElemNumToReduce != 1) - return false; - - const ConstantInt *Val = dyn_cast(U->getOperand(1)); - if (!Val || !Val->isZero()) - return false; - - ReduxExtracted = true; - } else - return false; - } - } - return ReduxExtracted; -} - void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; @@ -3143,17 +3016,6 @@ if (auto *ExactOp = dyn_cast(&I)) { Flags.setExact(ExactOp->isExact()); } - if (isVectorReductionOp(&I)) { - Flags.setVectorReduction(true); - LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n"); - - // If no flags are set we will propagate the incoming flags, if any flags - // are set, we will intersect them with the incoming flag and so we need to - // copy the FMF flags here. - if (auto *FPOp = dyn_cast(&I)) { - Flags.copyFMF(*FPOp); - } - } SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -553,9 +553,6 @@ if (getFlags().hasAllowReassociation()) OS << " reassoc"; - if (getFlags().hasVectorReduction()) - OS << " vector-reduction"; - if (getFlags().hasNoFPExcept()) OS << " nofpexcept"; diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -56,6 +56,7 @@ X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp + X86PartialReduction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp X86RetpolineThunks.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -133,6 +133,11 @@ /// fp exceptions when strict-fp enabled. FunctionPass *createX86InsertX87waitPass(); +/// This pass optimizes arithmetic based on knowledge that is only used by +/// a reduction sequence and is therefore safe to reassociate in interesting +/// ways. +FunctionPass *createX86PartialReductionPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); @@ -154,6 +159,7 @@ void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); +void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); namespace X86AS { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45905,131 +45905,6 @@ DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); } -static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // If the vector size is less than 128, or greater than the supported RegSize, - // do not use PMADD. 
- if (!VT.isVector() || VT.getVectorNumElements() < 8) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - auto UsePMADDWD = [&](SDValue Op) { - ShrinkMode Mode; - return Op.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op.getNode(), DAG, Mode) && - Mode != ShrinkMode::MULU16 && - (!Subtarget.hasSSE41() || - (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - Op->isOnlyUserOf(Op.getOperand(1).getNode()))); - }; - - SDValue MulOp, OtherOp; - if (UsePMADDWD(Op0)) { - MulOp = Op0; - OtherOp = Op1; - } else if (UsePMADDWD(Op1)) { - MulOp = Op1; - OtherOp = Op0; - } else - return SDValue(); - - SDLoc DL(N); - EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - VT.getVectorNumElements()); - EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - VT.getVectorNumElements() / 2); - - // Shrink the operands of mul. - SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); - - // Madd vector size is half of the original vector size - auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); - }; - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); -} - -static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) || - VT.getVectorElementType() != MVT::i32) - return SDValue(); - - // We know N is a reduction add. To match SAD, we need one of the operands to - // be an ABS. - SDValue AbsOp = N->getOperand(0); - SDValue OtherOp = N->getOperand(1); - if (AbsOp.getOpcode() != ISD::ABS) - std::swap(AbsOp, OtherOp); - if (AbsOp.getOpcode() != ISD::ABS) - return SDValue(); - - // Check whether we have an abs-diff pattern feeding into the select. - SDValue SadOp0, SadOp1; - if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) - return SDValue(); - - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. - SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. 
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. - assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); - unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); - SmallVector Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); - Ops[0] = Sad; - Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); - } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { - Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); -} - static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -46119,30 +45994,25 @@ Mode == ShrinkMode::MULU16) return SDValue(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements() * 2); + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - // Shrink by adding truncate nodes and let DAGCombine fold with the - // sources. EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i32 && - "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); - EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - InVT.getVectorNumElements()); - return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Mul.getOperand(0), Mul.getOperand(1) }, - PMADDBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. -// (mul (add (sext (build_vector)), (sext (build_vector))), -// (add (sext (build_vector)), (sext (build_vector))) +// (add (mul (sext (build_vector)), (sext (build_vector))), +// (mul (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -46264,13 +46134,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - const SDNodeFlags Flags = N->getFlags(); - if (Flags.hasVectorReduction()) { - if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) - return Sad; - if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) - return MAdd; - } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -0,0 +1,460 @@ +//===-- X86PartialReduction.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass looks for add instructions used by a horizontal reduction to see +// if we might be able to use pmaddwd or psadbw. Some cases of this require +// cross basic block knowledge and can't be done in SelectionDAG. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-partial-reduction" + +namespace { + +class X86PartialReduction : public FunctionPass { + const DataLayout *DL; + const X86Subtarget *ST; + +public: + static char ID; // Pass identification, replacement for typeid. + + X86PartialReduction() : FunctionPass(ID) { } + + bool runOnFunction(Function &Fn) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + StringRef getPassName() const override { + return "X86 Partial Reduction"; + } + +private: + bool tryMAddPattern(BinaryOperator *BO); + bool tryMAddReplacement(Value *Op, BinaryOperator *Add); + + bool trySADPattern(BinaryOperator *BO); + bool trySADReplacement(Value *Op, BinaryOperator *Add); +}; +} + +FunctionPass *llvm::createX86PartialReductionPass() { + return new X86PartialReduction(); +} + +char X86PartialReduction::ID = 0; + +INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, + "X86 Partial Reduction", false, false) + +static bool isVectorReductionOp(const BinaryOperator &BO) { + if (!BO.getType()->isVectorTy()) + return false; + + unsigned Opcode = BO.getOpcode(); + + switch (Opcode) { + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + break; + case Instruction::FAdd: + case Instruction::FMul: + if (auto *FPOp = dyn_cast(&BO)) + if (FPOp->getFastMathFlags().isFast()) + break; + LLVM_FALLTHROUGH; + default: + return false; + } + + unsigned ElemNum = BO.getType()->getVectorNumElements(); + // Ensure the reduction size is a power of 2. + if (!isPowerOf2_32(ElemNum)) + return false; + + unsigned ElemNumToReduce = ElemNum; + + // Do DFS search on the def-use chain from the given instruction. We only + // allow four kinds of operations during the search until we reach the + // instruction that extracts the first element from the vector: + // + // 1. The reduction operation of the same opcode as the given instruction. + // + // 2. PHI node. + // + // 3. ShuffleVector instruction together with a reduction operation that + // does a partial reduction. + // + // 4. ExtractElement that extracts the first element from the vector, and we + // stop searching the def-use chain here. + // + // 3 & 4 above perform a reduction on all elements of the vector. We push defs + // from 1-3 to the stack to continue the DFS. The given instruction is not + // a reduction operation if we meet any other instructions other than those + // listed above. 
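+  //
+  // As an illustration (not taken from this patch's tests), a 4-element
+  // integer reduction that this DFS accepts ends in IR such as:
+  //
+  //   %shuf  = shufflevector <4 x i32> %sum, <4 x i32> undef,
+  //                          <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  //   %sum1  = add <4 x i32> %sum, %shuf
+  //   %shuf1 = shufflevector <4 x i32> %sum1, <4 x i32> undef,
+  //                          <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  //   %sum2  = add <4 x i32> %sum1, %shuf1
+  //   %res   = extractelement <4 x i32> %sum2, i32 0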
+
+  SmallVector<const User *, 16> UsersToVisit{&BO};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    auto User = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(User).second)
+      continue;
+
+    for (const auto *U : User->users()) {
+      auto *Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        return false;
+
+      if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
+        if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
+            return false;
+        UsersToVisit.push_back(U);
+      } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that does a partial reduction on the first and
+        // second ElemNumToReduce / 2 elements, and store the result in
+        // ElemNumToReduce / 2 elements in another vector.
+
+        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+        if (ResultElements < ElemNum)
+          return false;
+
+        if (ElemNumToReduce == 1)
+          return false;
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which
+        // must be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto *U2 = dyn_cast<Instruction>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != Opcode)
+          return false;
+
+        // Check operands of the reduction operation.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || !Val->isZero())
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
+  BasicBlock *BB = Add->getParent();
+
+  auto *BO = dyn_cast<BinaryOperator>(Op);
+  if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
+      BO->getParent() != BB)
+    return false;
+
+  Value *LHS = BO->getOperand(0);
+  Value *RHS = BO->getOperand(1);
+
+  // LHS and RHS should be only used once or if they are the same then only
+  // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions, otherwise we use punpck to emulate zero extend in stages.
+  // The trunc we need to do likely won't introduce new instructions in that
+  // case.
+  if (ST->hasSSE41()) {
+    if (LHS == RHS) {
+      if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+        return false;
+    } else {
+      if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+        return false;
+      if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+        return false;
+    }
+  }
+
+  auto canShrinkOp = [&](Value *Op) {
+    if (isa<Constant>(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+      return true;
+    if (auto *Cast = dyn_cast<CastInst>(Op)) {
+      if (Cast->getParent() == BB &&
+          (Cast->getOpcode() == Instruction::SExt ||
+           Cast->getOpcode() == Instruction::ZExt) &&
+          ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+        return true;
+    }
+
+    return false;
+  };
+
+  // Both Ops need to be shrinkable.
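+  // (pmaddwd multiplies signed 16-bit lanes into 32-bit results, so an
+  // operand counts as "shrinkable" here when ComputeNumSignBits shows more
+  // than 16 sign bits, i.e. it survives a trunc to i16 plus sign extend.)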
+  if (!canShrinkOp(LHS) && !canShrinkOp(RHS))
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *MulTy = Op->getType();
+  unsigned NumElts = MulTy->getVectorNumElements();
+
+  // Extract even elements and odd elements and add them together. This will
+  // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+  // half the original width.
+  SmallVector<uint32_t, 16> EvenMask(NumElts / 2);
+  SmallVector<uint32_t, 16> OddMask(NumElts / 2);
+  for (int i = 0, e = NumElts / 2; i != e; ++i) {
+    EvenMask[i] = i * 2;
+    OddMask[i] = i * 2 + 1;
+  }
+  Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
+  Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+  Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+  // Concatenate zeroes to extend back to the original type.
+  SmallVector<uint32_t, 32> ConcatMask(NumElts);
+  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+  Value *Zero = Constant::getNullValue(MAdd->getType());
+  Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+  // Replace the use of mul in the original Add with the pmaddwd and zeroes.
+  Add->replaceUsesOfWith(BO, Concat);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+// Try to replace operands of this add with pmaddwd patterns.
+bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // Need at least 8 elements.
+  if (BO->getType()->getVectorNumElements() < 8)
+    return false;
+
+  // Element type should be i32.
+  if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+    return false;
+
+  bool Changed = false;
+  Changed |= tryMAddReplacement(BO->getOperand(0), BO);
+  Changed |= tryMAddReplacement(BO->getOperand(1), BO);
+  return Changed;
+}
+
+bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
+  // Operand should be a select.
+  auto *SI = dyn_cast<SelectInst>(Op);
+  if (!SI)
+    return false;
+
+  // Select needs to implement absolute value.
+  Value *LHS, *RHS;
+  auto SPR = matchSelectPattern(SI, LHS, RHS);
+  if (SPR.Flavor != SPF_ABS)
+    return false;
+
+  // Need a subtract of two values.
+  auto *Sub = dyn_cast<BinaryOperator>(LHS);
+  if (!Sub || Sub->getOpcode() != Instruction::Sub)
+    return false;
+
+  // Look for zero extend from i8.
+  auto getZeroExtendedVal = [](Value *Op) -> Value * {
+    if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+      if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8))
+        return ZExt->getOperand(0);
+
+    return nullptr;
+  };
+
+  // Both operands of the subtract should be extends from vXi8.
+  Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+  Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+  if (!Op0 || !Op1)
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *OpTy = Op->getType();
+  unsigned NumElts = OpTy->getVectorNumElements();
+
+  unsigned IntrinsicNumElts;
+  Intrinsic::ID IID;
+  if (ST->hasBWI() && NumElts >= 64) {
+    IID = Intrinsic::x86_avx512_psad_bw_512;
+    IntrinsicNumElts = 64;
+  } else if (ST->hasAVX2() && NumElts >= 32) {
+    IID = Intrinsic::x86_avx2_psad_bw;
+    IntrinsicNumElts = 32;
+  } else {
+    IID = Intrinsic::x86_sse2_psad_bw;
+    IntrinsicNumElts = 16;
+  }
+
+  Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+
+  if (NumElts < 16) {
+    // Pad input with zeroes.
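+    // For example, <4 x i8> inputs are widened with the mask
+    // <0,1,2,3, 4,5,6,7, 4,5,6,7, 4,5,6,7>, where indices >= 4 select from
+    // the all-zeroes vector, giving a <16 x i8> value whose upper twelve
+    // bytes are zero and therefore do not contribute to the psadbw sum.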
+    SmallVector<uint32_t, 16> ConcatMask(16);
+    for (unsigned i = 0; i != NumElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = NumElts; i != 16; ++i)
+      ConcatMask[i] = (i % NumElts) + NumElts;
+
+    Value *Zero = Constant::getNullValue(Op0->getType());
+    Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+    Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+    NumElts = 16;
+  }
+
+  // Intrinsics produce vXi64 and need to be cast to vXi32.
+  Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+  assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+  unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+  // First collect the pieces we need.
+  SmallVector<Value *, 4> Ops(NumSplits);
+  for (unsigned i = 0; i != NumSplits; ++i) {
+    SmallVector<uint32_t, 64> ExtractMask(IntrinsicNumElts);
+    std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+    Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+    Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+    Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+  }
+
+  assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+  unsigned Stages = Log2_32(NumSplits);
+  for (unsigned s = Stages; s > 0; --s) {
+    unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2;
+    for (unsigned i = 0; i != 1 << (s - 1); ++i) {
+      SmallVector<uint32_t, 32> ConcatMask(NumConcatElts);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+    }
+  }
+
+  // At this point the final value should be in Ops[0]. Now we need to adjust
+  // it to the original type.
+  NumElts = OpTy->getVectorNumElements();
+  if (NumElts == 2) {
+    // Extract down to 2 elements.
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1});
+  } else if (NumElts >= 8) {
+    SmallVector<uint32_t, 32> ConcatMask(NumElts);
+    unsigned SubElts = Ops[0]->getType()->getVectorNumElements();
+    for (unsigned i = 0; i != SubElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = SubElts; i != NumElts; ++i)
+      ConcatMask[i] = (i % SubElts) + SubElts;
+
+    Value *Zero = Constant::getNullValue(Ops[0]->getType());
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+  }
+
+  // Replace the uses of Op in Add with the new sequence.
+  Add->replaceUsesOfWith(Op, Ops[0]);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
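+  // (psadbw sums the absolute differences of eight bytes into one 64-bit
+  // lane; trySADReplacement above bitcasts that result to i32 lanes before
+  // adding it into the i32 reduction vector.)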
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32)) + return false; + + bool Changed = false; + Changed |= trySADReplacement(BO->getOperand(0), BO); + Changed |= trySADReplacement(BO->getOperand(1), BO); + return Changed; +} + +bool X86PartialReduction::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + auto &TM = TPC->getTM(); + ST = TM.getSubtargetImpl(F); + + DL = &F.getParent()->getDataLayout(); + + bool MadeChange = false; + for (auto &BB : F) { + for (auto &I : BB) { + auto *BO = dyn_cast(&I); + if (!BO) + continue; + + if (!isVectorReductionOp(*BO)) + continue; + + if (BO->getOpcode() == Instruction::Add) { + if (tryMAddPattern(BO)) { + MadeChange = true; + continue; + } + if (trySADPattern(BO)) { + MadeChange = true; + continue; + } + } + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -84,6 +84,7 @@ initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); initializeX86OptimizeLEAPassPass(PR); + initializeX86PartialReductionPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -400,8 +401,10 @@ TargetPassConfig::addIRPasses(); - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + addPass(createX86PartialReductionPass()); + } // Add passes that handle indirect branch removal and insertion of a retpoline // thunk. These will be a no-op unless a function subtarget has the retpoline diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -51,6 +51,7 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Interleaved Access Pass +; CHECK-NEXT: X86 Partial Reduction ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -236,10 +236,10 @@ ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 ; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx @@ -407,16 +407,16 @@ ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 ; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 ; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 +; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 ; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpaddd 
%xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx @@ -453,10 +453,10 @@ ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 ; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB3_1 @@ -779,18 +779,18 @@ ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE2-NEXT: psraw $8, %xmm6 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: pmaddwd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -814,16 +814,16 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB6_1 @@ -943,34 +943,34 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0 -; 
SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm6 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm0 ; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 @@ -999,26 +999,26 @@ ; AVX1-NEXT: 
.p2align 4, 0x90 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4 -; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5 -; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB7_1 @@ -1051,14 +1051,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB7_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 -; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3 -; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $32, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB7_1 @@ -1913,9 +1913,9 @@ ; ; AVX1-LABEL: pmaddwd_16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1944,16 +1944,16 @@ ; ; AVX1-LABEL: pmaddwd_32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: 
vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddwd_32: @@ -1964,9 +1964,9 @@ ; ; AVX512F-LABEL: pmaddwd_32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -2126,9 +2126,9 @@ ; ; AVX1-LABEL: jumbled_indices8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2157,16 +2157,16 @@ ; ; AVX1-LABEL: jumbled_indices16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: jumbled_indices16: @@ -2177,9 +2177,9 @@ ; ; AVX512F-LABEL: jumbled_indices16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -2221,26 +2221,26 @@ ; ; AVX1-LABEL: jumbled_indices32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12 -; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12 -; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12 -; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12 -; AVX1-NEXT: vpmaddwd %xmm12, %xmm11, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8 +; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: retq ; ; AVX2-LABEL: jumbled_indices32: @@ -2656,7 +2656,7 @@ ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] @@ -2698,14 +2698,14 @@ ; SSE2-NEXT: movdqu (%r8), %xmm0 ; SSE2-NEXT: movdqu (%r9), %xmm3 ; SSE2-NEXT: pmaddwd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm3 ; SSE2-NEXT: movdqu (%rax), %xmm0 -; SSE2-NEXT: movdqu (%r10), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqu (%r10), %xmm1 +; SSE2-NEXT: pmaddwd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -2721,11 +2721,11 @@ ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%r8), %xmm2 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rax), %xmm2 ; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -180,14 +180,14 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB8_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 -; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3 -; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 ; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-NEXT: addq $32, %rcx ; 
CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: jne .LBB8_1 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -158,12 +158,12 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 -; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block @@ -188,14 +188,14 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB1_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2 -; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3 -; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: addq $4, %rax ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %middle.block @@ -320,15 +320,15 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm5 -; SSE2-NEXT: psadbw b+1056(%rax), %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movdqa a+1040(%rax), %xmm5 -; SSE2-NEXT: psadbw b+1040(%rax), %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm5 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1056(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm5 ; SSE2-NEXT: psadbw b+1072(%rax), %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm1 @@ -364,22 +364,22 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3 -; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4 -; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3 -; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4 -; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4 +; AVX1-NEXT: vpsadbw 
b+1040(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5 +; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6 +; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: addq $4, %rax ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %middle.block @@ -416,12 +416,12 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3 -; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3 +; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %middle.block @@ -449,11 +449,11 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB2_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2 -; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3 -; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2 +; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3 +; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 @@ -554,10 +554,10 @@ ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psadbw %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: psadbw %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block @@ -576,8 +576,8 @@ ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX-NEXT: addq $4, %rax @@ -649,7 +649,7 @@ ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: addq $4, %rax ; AVX-NEXT: jne .LBB4_1 @@ -1112,7 +1112,7 @@ ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd 
%xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]