Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3498,6 +3498,8 @@ /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + unsigned VecRegSize = 128, + bool vectorizeStoreChain = false, ArrayRef<Value *> BuildVector = None, bool allowReorder = false); @@ -3515,11 +3517,7 @@ /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); - bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, - BoUpSLP &R, unsigned VecRegSize); - - bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, - BoUpSLP &R); + bool vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R); /// The store instructions in a basic block organized by base pointer. StoreListMap Stores; @@ -3548,56 +3546,7 @@ return !std::equal(VL.begin(), VL.end(), VH.begin()); } -bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, - int CostThreshold, BoUpSLP &R, - unsigned VecRegSize) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - unsigned Sz = R.getVectorElementSize(Chain[0]); - unsigned VF = VecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - - // Check that a previous iteration of this loop did not delete the Value. 
- if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) - continue; - - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - R.buildTree(Operands); - R.computeMinimumValueSizes(); - - int Cost = R.getTreeCost(); - - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.vectorizeTree(); - - // Move to the next bundle. - i += VF - 1; - Changed = true; - } - } - - return Changed; -} - -bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, - int costThreshold, BoUpSLP &R) { +bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { SetVector<StoreInst *> Heads, Tails; SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; @@ -3652,8 +3601,9 @@ // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) { - if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(Operands, R, VecRegSize, true)) { // Mark the vectorized stores so that we don't vectorize them again. 
VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; @@ -3707,10 +3657,17 @@ if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, None, true); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(VL, R, VecRegSize, false, None, true)) + return true; + } + return false; } bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + unsigned VecRegSize, + bool vectorizeStoreChain, ArrayRef<Value *> BuildVector, bool allowReorder) { if (VL.size() < 2) @@ -3724,19 +3681,18 @@ return false; unsigned Opcode0 = I0->getOpcode(); - - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. unsigned Sz = R.getVectorElementSize(I0); - unsigned VF = MinVecRegSize / Sz; + unsigned VF = VecRegSize / Sz; - for (Value *V : VL) { - Type *Ty = V->getType(); - if (!isValidElementType(Ty)) - return false; - Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst || Inst->getOpcode() != Opcode0) - return false; + if (vectorizeStoreChain == false) { + for (Value *V : VL) { + Type *Ty = V->getType(); + if (!isValidElementType(Ty)) + return false; + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst || Inst->getOpcode() != Opcode0) + return false; + } } bool Changed = false; @@ -3745,12 +3701,14 @@ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); for (unsigned i = 0, e = VL.size(); i < e; ++i) { - unsigned OpsWidth = 0; + unsigned OpsWidth = VF; - if (i + VF > e) - OpsWidth = e - i; - else - OpsWidth = VF; + if (i + VF > e) { + if (vectorizeStoreChain) + break; + else + OpsWidth = e - i; + } if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; @@ -3936,7 +3894,8 @@ MinVecRegSize(MinVecRegSize) {} /// \brief Try to find a reduction tree. 
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, + unsigned VecRegSize) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); @@ -3964,9 +3923,7 @@ const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. - ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); + ReduxWidth = VecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -4267,7 +4224,7 @@ return false; HorizontalReduction HorRdx(MinRegSize); - if (!HorRdx.matchAssociativeReduction(P, BI)) + if (!HorRdx.matchAssociativeReduction(P, BI, MinRegSize)) return false; // If there is a sufficient number of reduction values, reduce @@ -4318,13 +4275,18 @@ // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { - // Success start over because instructions might have been changed. - HaveVectorizedPhiNodes = true; - Changed = true; - break; + if (NumElts > 1) { + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, VecRegSize)) { + HaveVectorizedPhiNodes = true; + Changed = true; + break; + } + } + if (Changed) + break; } - // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } @@ -4354,7 +4316,15 @@ continue; // Try to match and vectorize a horizontal reduction. 
- if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) { + bool SuccessToMatchHorizontalReduction = false; + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (canMatchHorizontalReduction(P, BI, R, TTI, VecRegSize)) { + SuccessToMatchHorizontalReduction = true; + break; + } + } + if (SuccessToMatchHorizontalReduction) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4381,9 +4351,16 @@ if (StoreInst *SI = dyn_cast<StoreInst>(it)) if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - MinVecRegSize) || - tryToVectorize(BinOp, R)) { + bool SuccessToMatchHorizontalReduction = false; + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, + VecRegSize)) { + SuccessToMatchHorizontalReduction = true; + break; + } + } + if (SuccessToMatchHorizontalReduction || tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4442,12 +4419,16 @@ // Vectorize starting with the build vector operands ignoring the // BuildVector instructions for the purpose of scheduling and user // extraction. - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { - Changed = true; - it = BB->begin(); - e = BB->end(); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(BuildVectorOpds, R, VecRegSize, false, + BuildVector, false)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + break; + } } - continue; } } @@ -4532,7 +4513,13 @@ // performed in parallel. It's likely that detecting this pattern in a // bottom-up phase will be simpler and less costly than building a // full-blown top-down phase beginning at the consecutive loads. 
- Changed |= tryToVectorizeList(Bundle, R); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(Bundle, R, VecRegSize)) { + Changed = true; + break; + } + } } } return Changed; @@ -4555,8 +4542,7 @@ // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), - -SLPCostThreshold, R); + Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); } } return Changed; Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll @@ -0,0 +1,37 @@ +;RUN: opt -S -slp-vectorizer -slp-max-reg-size=128 -slp-min-reg-size=64 -slp-threshold=-13 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK: @foo +; CHECK: add nsw <2 x i64> +; CHECK: add nsw <2 x i64> + +define i64 @foo(i64* nocapture readonly %a) #0 { +entry: + %idx1 = getelementptr inbounds i64, i64* %a, i64 1 + %idx2 = getelementptr inbounds i64, i64* %a, i64 2 + %idx3 = getelementptr inbounds i64, i64* %a, i64 3 + %idx4 = getelementptr inbounds i64, i64* %a, i64 4 + %idx5 = getelementptr inbounds i64, i64* %a, i64 5 + %idx6 = getelementptr inbounds i64, i64* %a, i64 6 + %idx7 = getelementptr inbounds i64, i64* %a, i64 7 + %0 = load i64, i64* %a, align 4 + %1 = load i64, i64* %idx1, align 4 + %2 = load i64, i64* %idx2, align 4 + %3 = load i64, i64* %idx3, align 4 + %4 = load i64, i64* %idx4, align 4 + %5 = load i64, i64* %idx5, align 4 + %6 = load i64, i64* %idx6, align 4 + %7 = load i64, i64* %idx7, align 4 + %add = add nsw i64 %1, %0 + %add1 = add nsw i64 %3, %2 + %add2 = add nsw i64 %5, %4 + %add3 = add nsw i64 %7, %6 + %add8 = add nsw i64 
%add1, %add + %add9 = add nsw i64 %add3, %add2 + %add12 = add nsw i64 %add9, %add8 + ret i64 %add12 +} + +