Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -328,6 +328,7 @@
   bool NoInfs : 1;
   bool NoSignedZeros : 1;
   bool AllowReciprocal : 1;
+  bool VectorReduction : 1;
 
 public:
   /// Default constructor turns off all optimization flags.
@@ -340,6 +341,7 @@
     NoInfs = false;
     NoSignedZeros = false;
     AllowReciprocal = false;
+    VectorReduction = false;
   }
 
   // These are mutators for each flag.
@@ -351,6 +353,7 @@
   void setNoInfs(bool b) { NoInfs = b; }
   void setNoSignedZeros(bool b) { NoSignedZeros = b; }
   void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setVectorReduction(bool b) { VectorReduction = b; }
 
   // These are accessors for each flag.
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@@ -361,6 +364,7 @@
   bool hasNoInfs() const { return NoInfs; }
   bool hasNoSignedZeros() const { return NoSignedZeros; }
   bool hasAllowReciprocal() const { return AllowReciprocal; }
+  bool hasVectorReduction() const { return VectorReduction; }
 
   /// Return a raw encoding of the flags.
   /// This function should only be used to add data to the NodeID value.
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2315,6 +2315,115 @@
   visitBinary(I, ISD::FSUB);
 }
 
+/// Checks if the given instruction performs a vector reduction, in which case
+/// we have the freedom to reorganize the elements in the result as long as the
+/// reduction of them stay unchanged.
+static bool isVectorReductionOp(const User *I) {
+  const Instruction *Inst = dyn_cast<Instruction>(I);
+  if (!Inst || !Inst->getType()->isVectorTy())
+    return false;
+
+  auto OpCode = Inst->getOpcode();
+  switch (OpCode) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ElemNum = Inst->getType()->getVectorNumElements();
+  unsigned ElemNumToReduce = ElemNum;
+
+  // Do DFS search on the def-use chain from the given instruction. We only
+  // allow four kinds of operations during the search until we reach the
+  // instruction that extracts the first element from the vector:
+  //
+  //   1. The reduction operation of the same arithmetic as the given
+  //      instruction.
+  //
+  //   2. PHI node.
+  //
+  //   3. ShuffleVector instruction together with a reduction operation that do
+  //      partial reduction.
+  //
+  //   4. ExtractElement that extracts the first element from the vector, and we
+  //      stop searching the def-use chain here.
+  //
+  //   3 & 4 above perform reduction on all elements of the vector. We push defs
+  //   from 1-3 to the stack to continue the DFS. The given instruction is not
+  //   supposed to be a reduction operation if we meet any other insturctions
+  //   other than those listed above.
+
+  SmallVector<const User *, 16> UsersToVisit{Inst};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    auto User = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(User).second)
+      continue;
+
+    for (const auto &U : User->users()) {
+      auto Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        return false;
+
+      if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
+        UsersToVisit.push_back(U);
+      } else if (const ShuffleVectorInst *ShufInst =
+                     dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that do partial reduction on the first and second
+        // ElemNumToReduce / 2 elements, and store the result in
+        // ElemNumToReduce / 2 elements in another vector.
+
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which must
+        // be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto U2 = dyn_cast<Instruction>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != OpCode)
+          return false;
+
+        // Check operands of the reduction operation.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || Val->getZExtValue() != 0)
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
@@ -2322,6 +2431,7 @@
   bool nuw = false;
   bool nsw = false;
   bool exact = false;
+  bool vec_redux = false;
   FastMathFlags FMF;
 
   if (const OverflowingBinaryOperator *OFBinOp =
@@ -2335,10 +2445,16 @@
   if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
     FMF = FPOp->getFastMathFlags();
 
+  if (isVectorReductionOp(&I)) {
+    vec_redux = true;
+    DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+  }
+
   SDNodeFlags Flags;
   Flags.setExact(exact);
   Flags.setNoSignedWrap(nsw);
   Flags.setNoUnsignedWrap(nuw);
+  Flags.setVectorReduction(vec_redux);
   if (EnableFMFInDAG) {
     Flags.setAllowReciprocal(FMF.allowReciprocal());
     Flags.setNoInfs(FMF.noInfs());
Index: test/CodeGen/Generic/vector-redux.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Generic/vector-redux.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
+
+@a = global [1024 x i32] zeroinitializer, align 16
+
+define i32 @reduce_add() {
+; CHECK-LABEL: reduce_add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+
+min.iters.checked:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
+  %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
+  %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
+  %2 = getelementptr i32, i32* %0, i64 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
+  %4 = add nsw <4 x i32> %wide.load, %vec.phi
+  %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
+  %index.next = add nuw nsw i64 %index, 8
+  %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
+  %8 = getelementptr i32, i32* %6, i64 4
+  %9 = bitcast i32* %8 to <4 x i32>*
+  %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
+  %10 = add nsw <4 x i32> %wide.load.1, %4
+  %11 = add nsw <4 x i32> %wide.load5.1, %5
+  %index.next.1 = add nsw i64 %index, 16
+  %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
+  %13 = bitcast i32* %12 to <4 x i32>*
+  %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
+  %14 = getelementptr i32, i32* %12, i64 4
+  %15 = bitcast i32* %14 to <4 x i32>*
+  %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
+  %16 = add nsw <4 x i32> %wide.load.2, %10
+  %17 = add nsw <4 x i32> %wide.load5.2, %11
+  %index.next.2 = add nsw i64 %index, 24
+  %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
+  %19 = bitcast i32* %18 to <4 x i32>*
+  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
+  %20 = getelementptr i32, i32* %18, i64 4
+  %21 = bitcast i32* %20 to <4 x i32>*
+  %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
+  %22 = add nsw <4 x i32> %wide.load.3, %16
+  %23 = add nsw <4 x i32> %wide.load5.3, %17
+  %index.next.3 = add nsw i64 %index, 32
+  %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
+  %25 = bitcast i32* %24 to <4 x i32>*
+  %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
+  %26 = getelementptr i32, i32* %24, i64 4
+  %27 = bitcast i32* %26 to <4 x i32>*
+  %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
+  %28 = add nsw <4 x i32> %wide.load.4, %22
+  %29 = add nsw <4 x i32> %wide.load5.4, %23
+  %index.next.4 = add nsw i64 %index, 40
+  %30 = icmp eq i64 %index.next.4, 1000
+  br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+  %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
+  %.lcssa = phi <4 x i32> [ %28, %vector.body ]
+  %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
+  %31 = extractelement <4 x i32> %bin.rdx8, i32 0
+  ret i32 %31
+}
+
+define i32 @reduce_and() {
+; CHECK-LABEL: reduce_and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+
+entry:
+  br label %vector.body
+
+vector.body:
+  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
+  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
+  %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
+  %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
+  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
+  %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
+  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
+  %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
+  %0 = and <4 x i32> %wide.load, %vec.phi
+  %1 = and <4 x i32> %wide.load10, %vec.phi9
+  %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
+  %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
+  %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
+  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
+  %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
+  %2 = and <4 x i32> %wide.load.1, %0
+  %3 = and <4 x i32> %wide.load10.1, %1
+  %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
+  %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
+  %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
+  %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
+  %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
+  %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
+  %4 = and <4 x i32> %wide.load.2, %2
+  %5 = and <4 x i32> %wide.load10.2, %3
+  %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
+  %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
+  %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
+  %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
+  %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
+  %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
+  %6 = and <4 x i32> %wide.load.3, %4
+  %7 = and <4 x i32> %wide.load10.3, %5
+  %lsr.iv.next = add nsw i64 %lsr.iv, 128
+  %8 = icmp eq i64 %lsr.iv.next, 0
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:
+  %bin.rdx = and <4 x i32> %7, %6
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
+  %9 = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %9
+}