Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -46,6 +46,7 @@
 #include <algorithm>
 #include <map>
 #include <memory>
+#include <climits>
 
 using namespace llvm;
 
@@ -74,6 +75,10 @@
 
 static const unsigned RecursionMaxDepth = 12;
 
+static bool IsReturn = false;
+
+static bool IsHAdd = false;
+
 /// \returns the parent basic block if all of the instructions in \p VL
 /// are in the same block or null otherwise.
 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
@@ -1620,6 +1625,10 @@
   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
         VectorizableTree.size() << " is fully vectorizable .\n");
 
+  // Return true if a tree of size 1 feeds into a return and is a horizontal add.
+  if (VectorizableTree.size() == 1 && IsReturn && IsHAdd)
+    return true;
+
   // We only handle trees of height 2.
   if (VectorizableTree.size() != 2)
     return false;
@@ -3315,6 +3324,115 @@
       : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
         ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
 
+  /// Try to find a flat horizontal reduction. The tree structure of such
+  /// addition of type a[0]+a[1]+a[2]+a[3].... will be like
+  /// (+(+(+ a[0], a[1]), a[2]), a[3])....
+  /// Try to vectorize such a tree for integer types only.
+  bool matchFlatReduction(PHINode *Phi, BinaryOperator *B,
+                          const DataLayout *DL) {
+    if (!B)
+      return false;
+
+    if (B->getType()->isVectorTy() || !B->getType()->isIntegerTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeAllocSizeInBits(B->getType());
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if(ReduxWidth < 4)
+      return false;
+
+    if(ReductionOpcode != Instruction::Add)
+      return false;
+
+    SmallVector<BinaryOperator *, 32> Stack;
+    Value* Op0 = B->getOperand(0);
+    Value* Op1 = B->getOperand(1);
+
+    BinaryOperator* Op0Bin = dyn_cast<BinaryOperator>(Op0);
+    BinaryOperator* Op1Bin = dyn_cast<BinaryOperator>(Op1);
+
+    // None of the operands are binary.
+    if(!Op0Bin && !Op1Bin)
+      return false;
+
+    // Both of the operands are binary, we do not handle it here.
+    if(Op0Bin && Op1Bin)
+      return false;
+
+    if(Op0Bin) {
+      Stack.push_back(Op0Bin);
+      ReducedVals.push_back(Op1);
+    }
+    else {
+      Stack.push_back(Op1Bin);
+      ReducedVals.push_back(Op0);
+    }
+
+    ReductionOps.push_back(B);
+    ReductionOpcode = B->getOpcode();
+
+    // Traversal of the tree.
+    while(!Stack.empty()) {
+      BinaryOperator *Bin = Stack.back();
+
+      if(Bin->getParent() != B->getParent())
+        return false;
+
+      Value* Op0 = Bin->getOperand(0);
+      Value* Op1 = Bin->getOperand(1);
+
+      BinaryOperator* Op0Bin = dyn_cast<BinaryOperator>(Op0);
+      BinaryOperator* Op1Bin = dyn_cast<BinaryOperator>(Op1);
+
+      Stack.pop_back();
+
+      if(!Op0Bin && !Op1Bin) {
+        ReducedVals.push_back(Op1);
+        ReducedVals.push_back(Op0);
+        ReductionOps.push_back(Bin);
+        continue;
+      }
+
+      if(Op0Bin) {
+        if (Op0Bin->getOpcode() != ReductionOpcode)
+          return false;
+        Stack.push_back(Op0Bin);
+        ReducedVals.push_back(Op1);
+        ReductionOps.push_back(Op0Bin);
+      }
+
+      if(Op1Bin) {
+        if(Op1Bin->getOpcode() != ReductionOpcode)
+          return false;
+        Stack.push_back(Op1Bin);
+        ReducedVals.push_back(Op0);
+        ReductionOps.push_back(Op1Bin);
+      }
+
+    }
+
+    SmallVector<Value *, 16> Temp1;
+
+    // Reverse the loads from a[3], a[2], a[1], a[0]
+    // to a[0], a[1], a[2], a[3] for checking incremental
+    // consecutiveness further ahead.
+    while(!ReducedVals.empty())
+      Temp1.push_back(ReducedVals.pop_back_val());
+
+    ReducedVals.clear();
+
+    for(unsigned i = 0, e = Temp1.size(); i < e; ++i)
+      ReducedVals.push_back(Temp1[i]);
+
+    IsHAdd = true;
+    return true;
+  }
+
@@ -3380,11 +3489,31 @@
     Type *ScalarTy = ReducedVals[0]->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
 
+    int HAddCost = INT_MAX;
+    // If a horizontal addition pattern is identified, calculate the cost.
+    // Such horizontal additions can be modeled as a combination of
+    // sub-vector shuffles and vector adds plus one single extractelement
+    // from the last resultant vector.
+    // e.g.
+    //      a[0]+a[1]+a[2]+a[3] can be modeled as
+    //      %1 = load <4 x> %0
+    //      %2 = shuffle %1 <2, 3, undef, undef>
+    //      %3 = add <4 x> %1, %2
+    //      %4 = shuffle %3 <1, undef, undef, undef>
+    //      %5 = add <4 x> %3, %4
+    //      %6 = extractelement %5 <0>
+    if(IsHAdd) {
+      unsigned VecElem = VecTy->getVectorNumElements();
+      unsigned NumRedxLevel = Log2_32(VecElem);
+      HAddCost = NumRedxLevel *
+                     (TTI->getArithmeticInstrCost(ReductionOpcode, VecTy) +
+                      TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                          VecTy, VecElem/2, VecTy)) +
+                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 0);
+    }
+
     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
 
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+    VecReduxCost = HAddCost < VecReduxCost ? HAddCost : VecReduxCost;
 
     int ScalarReduxCost =
         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
@@ -3702,8 +3843,12 @@
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(RI->getOperand(0))) {
         DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
-        if (tryToVectorizePair(BinOp->getOperand(0),
-                               BinOp->getOperand(1), R)) {
+        HorizontalReduction HorRdx;
+        IsReturn = true;
+        if ((HorRdx.matchFlatReduction(nullptr, BinOp, DL) &&
+             HorRdx.tryToReduce(R, TTI)) ||
+            tryToVectorizePair(BinOp->getOperand(0),
+                               BinOp->getOperand(1), R)) {
           Changed = true;
           it = BB->begin();
           e = BB->end();
Index: test/Transforms/SLPVectorizer/AArch64/flatadd.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/flatadd.ll
+++ test/Transforms/SLPVectorizer/AArch64/flatadd.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; return
+;   a[0]+a[1]+a[2]+a[3]
+
+; CHECK-LABEL: @flatadd1
+; CHECK: load <4 x i32>*
+; CHECK: shufflevector <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: extractelement <4 x i32>
+define i32 @flatadd1(i32* nocapture readonly %a) {
+entry:
+  %0 = load i32* %a, align 4
+  %arrayidx1 = getelementptr inbounds i32* %a, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 2
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 3
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  ret i32 %add5
+}
+
+; return a[0]+a[1]+a[2]+a[3]+a[4]+a[5]+a[6]+a[7]
+
+; CHECK-LABEL: @flatadd2
+; CHECK: load <4 x i32>*
+; CHECK: shufflevector <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: extractelement <4 x i32>
+define i32 @flatadd2(i32* nocapture readonly %a) {
+entry:
+  %0 = load i32* %a, align 4
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 2
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %a, i64 3
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  %arrayidx6 = getelementptr inbounds i32* %a, i64 4
+  %4 = load i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %add5, %4
+  %arrayidx8 = getelementptr inbounds i32* %a, i64 5
+  %5 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %add7, %5
+  %arrayidx10 = getelementptr inbounds i32* %a, i64 6
+  %6 = load i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add9, %6
+  %arrayidx12 = getelementptr inbounds i32* %a, i64 7
+  %7 = load i32* %arrayidx12, align 4
+  %add13 = add nsw i32 %add11, %7
+  ret i32 %add13
+}