diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -164,6 +164,15 @@
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));
 
+// The maximum depth that the look-ahead score heuristic will explore
+// when it probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead but unlike
+// similar limit for operands ordering this is less frequently used, hence
+// impact of higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+    cl::desc("The maximum look-ahead depth for searching best rooting option"));
+
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
@@ -2000,6 +2009,29 @@
 #endif
   };
 
+  /// Evaluate each pair in \p Candidates and return index into \p Candidates
+  /// for a pair which have highest score deemed to have best chance to form
+  /// root of profitable tree to vectorize. Return None if no candidate scored
+  /// above the LookAheadHeuristics::ScoreFail.
+  Optional<int>
+  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
+    LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+                                  RootLookAheadMaxDepth);
+    int BestScore = LookAheadHeuristics::ScoreFail;
+    Optional<int> Index = None;
+    for (int I : seq<int>(0, Candidates.size())) {
+      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
+                                               Candidates[I].second,
+                                               /*U1=*/nullptr, /*U2=*/nullptr,
+                                               /*Level=*/1, None);
+      if (Score > BestScore) {
+        BestScore = Score;
+        Index = I;
+      }
+    }
+    return Index;
+  }
+
   /// Checks if the instruction is marked for deletion.
   bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
 
@@ -9150,7 +9182,8 @@
   if (!I)
     return false;
 
-  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+  if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
+      isa<VectorType>(I->getType()))
     return false;
 
   Value *P = I->getParent();
@@ -9161,32 +9194,40 @@
   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
     return false;
 
-  // Try to vectorize V.
-  if (tryToVectorizePair(Op0, Op1, R))
-    return true;
+  // First collect all possible candidates
+  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+  Candidates.emplace_back(Op0, Op1);
 
   auto *A = dyn_cast<BinaryOperator>(Op0);
   auto *B = dyn_cast<BinaryOperator>(Op1);
   // Try to skip B.
-  if (B && B->hasOneUse()) {
+  if (A && B && B->hasOneUse()) {
     auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
     auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
-      return true;
-    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
-      return true;
+    if (B0 && B0->getParent() == P)
+      Candidates.emplace_back(A, B0);
+    if (B1 && B1->getParent() == P)
+      Candidates.emplace_back(A, B1);
   }
-
   // Try to skip A.
-  if (A && A->hasOneUse()) {
+  if (B && A && A->hasOneUse()) {
     auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
     auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
-      return true;
-    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
-      return true;
+    if (A0 && A0->getParent() == P)
+      Candidates.emplace_back(A0, B);
+    if (A1 && A1->getParent() == P)
+      Candidates.emplace_back(A1, B);
   }
-  return false;
+
+  if (Candidates.size() == 1)
+    return tryToVectorizePair(Op0, Op1, R);
+
+  // We have multiple options. Try to pick the single best.
+  Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+  if (!BestCandidate)
+    return false;
+  return tryToVectorizePair(Candidates[*BestCandidate].first,
+                            Candidates[*BestCandidate].second, R);
 }
 
 namespace {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S -pass-remarks-missed=slp-vectorizer 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 ; This test check that slp vectorizer is not trying to vectorize instructions already vectorized.
-; CHECK: remark: <unknown>:0:0: Cannot SLP vectorize list: type <16 x i8> is unsupported by vectorizer
 
 define void @vector() {
+; CHECK-LABEL: @vector(
+; CHECK-NEXT:    [[LOAD0:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 1)
+; CHECK-NEXT:    [[LOAD1:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 2)
+; CHECK-NEXT:    [[ADD:%.*]] = add <16 x i8> [[LOAD1]], [[LOAD0]]
+; CHECK-NEXT:    tail call void @vector.store(<16 x i8> [[ADD]], <16 x i8>* undef, i32 1)
+; CHECK-NEXT:    ret void
+;
   %load0 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 1)
   %load1 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 2)
   %add = add <16 x i8> %load1, %load0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -157,8 +157,6 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -192,8 +192,6 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -17,27 +17,18 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], <double 7.000000e+00, double 5.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[I11:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT:    [[I09:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT:    [[I10:%.*]] = fsub fast double undef, [[I09]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], <double 3.000000e+00, double undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I11]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = fdiv fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT:    [[TMP20:%.*]] = fdiv fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP20]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x double> [[TMP21]], i32 1
-; CHECK-NEXT:    [[I16:%.*]] = fadd fast double [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I10]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], <double undef, double 1.100000e+01>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], <double 4.000000e+00, double 1.200000e+01>
+; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], <double 1.400000e+00, double 1.400000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; CHECK-NEXT:    [[I07:%.*]] = fadd fast double undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; CHECK-NEXT:    [[I16:%.*]] = fadd fast double [[I07]], [[TMP12]]
 ; CHECK-NEXT:    [[I17:%.*]] = fadd fast double [[I16]], [[C:%.*]]
 ; CHECK-NEXT:    [[I18:%.*]] = fadd fast double [[I17]], [[D:%.*]]
 ; CHECK-NEXT:    ret double [[I18]]