Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3419,6 +3419,18 @@
   else
     MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
+  // If the target is AArch64, MinVecRegSize can be 64 bits (vectorizing
+  // into a 64-bit D register) rather than 128 bits (vectorizing into a
+  // 128-bit Q register). This exposes more vectorization opportunities:
+  // <2 x 32-bit>, <4 x 16-bit>, and <8 x 8-bit> vector types all fit in
+  // a 64-bit register.
+  llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
+  bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 ||
+                   TargetTriple.getArch() == llvm::Triple::aarch64_be;
+
+  if (IsAArch64)
+    MinVectorRegSizeOption = 64;
+
   MinVecRegSize = MinVectorRegSizeOption;
 
   // Don't vectorize when the attribute NoImplicitFloat is used.
@@ -3499,7 +3511,7 @@
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                           ArrayRef<Value *> BuildVector = None,
-                          bool allowReorder = false);
+                          bool allowReorder = false, unsigned VecRegSize = 128);
 
   /// \brief Try to vectorize a chain that may start at the operands of \p V.
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -3707,12 +3719,19 @@
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  bool SuccessToVectorizeList = false;
+  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+    if (tryToVectorizeList(VL, R, None, true, Size)) {
+      SuccessToVectorizeList = true;
+      break;
+    }
+  }
+  return SuccessToVectorizeList;
 }
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                        ArrayRef<Value *> BuildVector,
-                                       bool allowReorder) {
+                                       bool allowReorder, unsigned VecRegSize) {
   if (VL.size() < 2)
     return false;
 
@@ -3728,7 +3747,7 @@
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3936,7 +3955,8 @@
       MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 unsigned VecRegSize) {
     assert((!Phi ||
             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
            "Thi phi needs to use the binary operator");
@@ -3966,7 +3986,7 @@
     ReducedValueOpcode = 0;
     // FIXME: Register size should be a parameter to this function, so we can
     // try different vectorization factors.
-    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
+    ReduxWidth = VecRegSize / DL.getTypeSizeInBits(Ty);
     ReductionRoot = B;
     ReductionPHI = Phi;
 
@@ -4267,7 +4287,7 @@
     return false;
 
   HorizontalReduction HorRdx(MinRegSize);
-  if (!HorRdx.matchAssociativeReduction(P, BI))
+  if (!HorRdx.matchAssociativeReduction(P, BI, MinRegSize))
     return false;
 
   // If there is a sufficient number of reduction values, reduce
@@ -4318,7 +4338,15 @@
       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, None, false,
+                               Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      if (NumElts > 1 && SuccessToVectorizeList) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
@@ -4354,7 +4382,14 @@
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
+      bool SuccessToMatchHorizontalReduction = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (canMatchHorizontalReduction(P, BI, R, TTI, Size)) {
+          SuccessToMatchHorizontalReduction = true;
+          break;
+        }
+      }
+      if (SuccessToMatchHorizontalReduction) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
@@ -4381,9 +4416,15 @@
     if (StoreInst *SI = dyn_cast<StoreInst>(it))
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
-        if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                        MinVecRegSize) ||
-            tryToVectorize(BinOp, R)) {
+        bool SuccessToMatchHorizontalReduction = false;
+        for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize;
+             Size /= 2) {
+          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, Size)) {
+            SuccessToMatchHorizontalReduction = true;
+            break;
+          }
+        }
+        if (SuccessToMatchHorizontalReduction || tryToVectorize(BinOp, R)) {
           Changed = true;
           it = BB->begin();
           e = BB->end();
@@ -4442,7 +4483,14 @@
       // Vectorize starting with the build vector operands ignoring the
       // BuildVector instructions for the purpose of scheduling and user
       // extraction.
-      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      if (SuccessToVectorizeList) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
@@ -4532,7 +4580,14 @@
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
-      Changed |= tryToVectorizeList(Bundle, R);
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(Bundle, R, None, false, Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      Changed |= SuccessToVectorizeList;
     }
   }
   return Changed;
Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
@@ -0,0 +1,28 @@
+; RUN: opt -slp-vectorizer -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-LABEL: @foo
+; CHECK: load <4 x i16>
+; CHECK: store <4 x i16>
+; CHECK: ret i32
+
+%struct.A = type { i16, i16 }
+%struct.B = type { i16, i16 }
+
+@array_A = global [10 x %struct.A] zeroinitializer, align 2
+@array_B = global [10 x %struct.B] zeroinitializer, align 2
+
+define i32 @foo() {
+entry:
+  %0 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 0), align 2
+  store i16 %0, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 0), align 2
+  %1 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 1), align 2
+  store i16 %1, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 1), align 2
+  %2 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 0), align 2
+  store i16 %2, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 0), align 2
+  %3 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 1), align 2
+  store i16 %3, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 1), align 2
+  ret i32 0
+}
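
Note on the repeated retry loop: the descending register-size loop
(for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2))
is now duplicated at six call sites. A possible follow-up cleanup is to
factor it into one helper. The sketch below is illustrative only: the
helper name tryForEachVecRegSize and its llvm::function_ref-based
signature are assumptions of this note, not part of the patch. It assumes
MinVecRegSize >= 1 so the halving loop terminates.

  #include "llvm/ADT/STLExtras.h" // for llvm::function_ref

  // Hypothetical helper (not in this patch): invoke a vectorization
  // callback at each candidate register width, halving from MaxVecRegSize
  // down to MinVecRegSize, and stop at the first width that succeeds.
  // With MaxVecRegSize = 128 and MinVecRegSize = 64 it tries 128, then 64.
  static bool tryForEachVecRegSize(unsigned MaxVecRegSize,
                                   unsigned MinVecRegSize,
                                   llvm::function_ref<bool(unsigned)> Try) {
    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2)
      if (Try(Size))
        return true;
    return false;
  }

  // Example call site, replacing one of the duplicated loops:
  //   Changed |= tryForEachVecRegSize(MaxVecRegSize, MinVecRegSize,
  //       [&](unsigned Size) {
  //         return tryToVectorizeList(Bundle, R, None, false, Size);
  //       });

This would keep the width-selection policy in one place, so a later change
(for example, trying widths other than powers of two) touches only the helper.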