Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -537,6 +537,9 @@
   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
 
+  /// \return The width of the smallest vector register type.
+  unsigned getMinVectorRegisterBitWidth() const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -840,6 +843,7 @@
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() = 0;
@@ -1076,6 +1080,9 @@
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
+  unsigned getMinVectorRegisterBitWidth() override {
+    return Impl.getMinVectorRegisterBitWidth();
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -311,6 +311,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  unsigned getMinVectorRegisterBitWidth() { return 128; }
+
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
                                      bool &AllowPromotionWithoutCommonHeader) {
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+  return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
@@ -83,6 +83,9 @@
   // NegativeImmediates - transform instructions with negative immediates
   bool NegativeImmediates = true;
 
+  // Enable 64-bit vectorization in SLP.
+  unsigned MinVectorRegisterBitWidth = 64;
+
   bool UseAA = false;
   bool PredictableSelectIsExpensive = false;
   bool BalanceFPOps = false;
@@ -191,6 +194,10 @@
 
   bool isXRaySupported() const override { return true; }
 
+  unsigned getMinVectorRegisterBitWidth() const {
+    return MinVectorRegisterBitWidth;
+  }
+
   bool isX18Reserved() const { return ReserveX18; }
   bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON; }
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,8 @@
   case Falkor:
     MaxInterleaveFactor = 4;
     VectorInsertExtractBaseCost = 2;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case Kryo:
     MaxInterleaveFactor = 4;
@@ -99,6 +101,8 @@
     PrefetchDistance = 740;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case ThunderX2T99:
     CacheLineSize = 64;
@@ -108,6 +112,8 @@
     PrefetchDistance = 128;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case ThunderX:
   case ThunderXT88:
@@ -116,6 +122,8 @@
     CacheLineSize = 128;
     PrefFunctionAlignment = 3;
     PrefLoopAlignment = 2;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case CortexA35: break;
   case CortexA53: break;
Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -87,6 +87,10 @@
     return 64;
   }
 
+  unsigned getMinVectorRegisterBitWidth() {
+    return ST->getMinVectorRegisterBitWidth();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -316,7 +316,10 @@
     else
       MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
-    MinVecRegSize = MinVectorRegSizeOption;
+    if (MinVectorRegSizeOption.getNumOccurrences())
+      MinVecRegSize = MinVectorRegSizeOption;
+    else
+      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
Index: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; Currently disabled for a few subtargets (e.g. Kryo):
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+  %r0 = getelementptr inbounds float, float* %r, i64 0
+  %r1 = getelementptr inbounds float, float* %r, i64 1
+  %f0 = load float, float* %r0
+  %f1 = load float, float* %r1
+  %add0 = fadd float %f0, %f0
+; CHECK:  fadd <2 x float>
+; NO_SLP: fadd float
+; NO_SLP: fadd float
+  %add1 = fadd float %f1, %f1
+  %w0 = getelementptr inbounds float, float* %w, i64 0
+  %w1 = getelementptr inbounds float, float* %w, i64 1
+  store float %add0, float* %w0
+  store float %add1, float* %w1
+  ret void
+}