diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -164,14 +164,13 @@
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));

-// The maximum depth that the look-ahead score heuristic will explore
-// when it probing among candidates for vectorization tree roots.
-// The higher this value, the higher the compilation time overhead but unlike
-// similar limit for operands ordering this is less frequently used, hence
-// impact of higher value is less noticeable.
-static cl::opt<int> RootLookAheadMaxDepth(
-    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
-    cl::desc("The maximum look-ahead depth for searching best rooting option"));
+// The maximum tree size to use when probing among candidates for
+// vectorization tree roots. The higher this value, the higher the compilation
+// time overhead, but unlike the similar limit for operand ordering this is
+// used less frequently, so the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxSize(
+    "slp-root-look-ahead-max-size", cl::init(5), cl::Hidden,
+    cl::desc("The maximum tree size for searching best rooting option"));

 static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -839,6 +838,15 @@ class BoUpSLP {
   struct TreeEntry;
   struct ScheduleData;

+  /// Limit the size of the SLP tree to this many nodes.
+  Optional<int> MaxTreeSize;
+  /// Disables scheduling. This is to limit the compilation time used by
+  /// buildTree(), usually in combination with \p MaxTreeSize for quickly
+  /// building approximate trees that can be used for estimating which roots
+  /// are to be preferred.
+  /// WARNING: If this is enabled, the tree is not guaranteed to contain valid
+  /// instruction bundles that can actually get codegened.
+  bool ForceDisableScheduling = false;
+
 public:
   using ValueList = SmallVector<Value *, 8>;
@@ -897,6 +905,15 @@
   void buildTree(ArrayRef<Value *> Roots,
                  ArrayRef<Value *> UserIgnoreLst = None);

+  /// Builds a tree starting from \p Roots with up to \p MaxSize nodes. For
+  /// faster compilation, scheduling is disabled by default; it can be
+  /// re-enabled with \p DisableScheduling = false.
+  /// This function is to be used for a look-ahead style evaluation of root
+  /// nodes, estimating which ones are worth building a full tree for.
+  InstructionCost buildTreeFastAndGetCost(ArrayRef<Value *> Roots,
+                                          Optional<int> MaxSize = 5,
+                                          bool DisableScheduling = true);
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -906,6 +923,8 @@
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    ForceDisableScheduling = false;
+    MaxTreeSize = None;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -2018,17 +2037,16 @@
   /// above the LookAheadHeuristics::ScoreFail.
   Optional<int>
   findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
-    LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
-                                  RootLookAheadMaxDepth);
-    int BestScore = LookAheadHeuristics::ScoreFail;
+    InstructionCost BestCost = InstructionCost::getMax();
     Optional<int> Index = None;
+
     for (int I : seq<int>(0, Candidates.size())) {
-      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
-                                               Candidates[I].second,
-                                               /*U1=*/nullptr, /*U2=*/nullptr,
-                                               /*Level=*/1, None);
-      if (Score > BestScore) {
-        BestScore = Score;
+      SmallVector<Value *> Roots(
+          {Candidates[I].first, Candidates[I].second});
+      InstructionCost Cost =
+          buildTreeFastAndGetCost(Roots, RootLookAheadMaxSize.getValue());
+      if (Cost < BestCost) {
+        BestCost = Cost;
         Index = I;
       }
     }
@@ -2527,22 +2545,24 @@
        assert(!getTreeEntry(V) && "Scalar already in tree!");
        ScalarToTreeEntry[V] = Last;
      }
-      // Update the scheduler bundle to point to this TreeEntry.
-      ScheduleData *BundleMember = Bundle.getValue();
-      assert((BundleMember || isa<PHINode>(S.MainOp) ||
-              isVectorLikeInstWithConstOps(S.MainOp) ||
-              doesNotNeedToSchedule(VL)) &&
-             "Bundle and VL out of sync");
-      if (BundleMember) {
-        for (Value *V : VL) {
-          if (doesNotNeedToBeScheduled(V))
-            continue;
-          assert(BundleMember && "Unexpected end of bundle.");
-          BundleMember->TE = Last;
-          BundleMember = BundleMember->NextInBundle;
+      if (!ForceDisableScheduling) {
+        // Update the scheduler bundle to point to this TreeEntry.
+        ScheduleData *BundleMember = Bundle.getValue();
+        assert((BundleMember || isa<PHINode>(S.MainOp) ||
+                isVectorLikeInstWithConstOps(S.MainOp) ||
+                doesNotNeedToSchedule(VL)) &&
+               "Bundle and VL out of sync");
+        if (BundleMember) {
+          for (Value *V : VL) {
+            if (doesNotNeedToBeScheduled(V))
+              continue;
+            assert(BundleMember && "Unexpected end of bundle.");
+            BundleMember->TE = Last;
+            BundleMember = BundleMember->NextInBundle;
+          }
         }
+        assert(!BundleMember && "Bundle and VL out of sync");
       }
-      assert(!BundleMember && "Bundle and VL out of sync");
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -2920,6 +2940,7 @@
     /// instructions into the ready-list.
     template <typename ReadyListType>
     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+      assert(!DisableScheduling && "Trying to schedule when disabled");
       SD->IsScheduled = true;
       LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
@@ -3079,6 +3100,14 @@
     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                       const InstructionsState &S);

+    /// \Returns an uninitialized ScheduleData bundle. This is needed when
+    /// the scheduler is disabled, because a non-null bundle is used to
+    /// determine whether a TreeEntry is marked as vectorizable.
+    static ScheduleData *getDummyBundle() {
+      static ScheduleData SD;
+      return &SD;
+    }
+
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -3157,6 +3186,9 @@
     /// Make sure that the initial SchedulingRegionID is greater than the
     /// initial SchedulingRegionID in ScheduleData (which is 0).
     int SchedulingRegionID = 1;
+
+    /// This is set to true if we need to skip scheduling for this block.
+    bool DisableScheduling = false;
   };

   /// Attaches the BlockScheduling structures to basic blocks.
@@ -4087,6 +4119,26 @@
   buildTree_rec(Roots, 0, EdgeInfo());
 }

+InstructionCost BoUpSLP::buildTreeFastAndGetCost(ArrayRef<Value *> Roots,
+                                                 Optional<int> MaxSize,
+                                                 bool DisableScheduling) {
+  deleteTree();
+  MaxTreeSize = MaxSize;
+  ForceDisableScheduling = DisableScheduling;
+  if (!allSameType(Roots))
+    return InstructionCost::getMax();
+
+  buildTree_rec(Roots, 0, EdgeInfo());
+
+  if (isTreeTinyAndNotFullyVectorizable())
+    return InstructionCost::getMax();
+  reorderTopToBottom();
+  reorderBottomToTop(!isa<InsertElementInst>(Roots.front()));
+  buildExternalUses();
+  computeMinimumValueSizes();
+  return getTreeCost();
+}
+
 namespace {
 /// Tracks the state we can represent the loads in the given sequence.
 enum class LoadsState { Gather, Vectorize, ScatterVectorize };
@@ -4249,6 +4301,13 @@
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
+  // If we are building a fast approximate tree, return early once we reach
+  // the tree size limit.
+  if (MaxTreeSize && VectorizableTree.size() >= *MaxTreeSize) {
+    LLVM_DEBUG(dbgs() << "SLP: Reached max tree size " << *MaxTreeSize
+                      << ".\n");
+    return;
+  }
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

   SmallVector<int> ReuseShuffleIndicies;
@@ -4418,13 +4477,16 @@
     BSRef = std::make_unique<BlockScheduling>(BB);

   BlockScheduling &BS = *BSRef;
+  BS.DisableScheduling = ForceDisableScheduling;

-  Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  Optional<ScheduleData *> Bundle = !ForceDisableScheduling
+                                        ? BS.tryScheduleBundle(VL, this, S)
+                                        : BlockScheduling::getDummyBundle();
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle) {
+  if (!ForceDisableScheduling && !Bundle) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
@@ -4433,7 +4495,9 @@
                  ReuseShuffleIndicies);
     return;
   }
-  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+  LLVM_DEBUG(dbgs() << (ForceDisableScheduling
+                            ? "SLP: Scheduling was disabled.\n"
+                            : "SLP: We are able to schedule this bundle.\n"));

   unsigned ShuffleOrOp = S.isAltShuffle() ?
                  (unsigned) Instruction::ShuffleVector : S.getOpcode();
@@ -8075,6 +8139,7 @@
 Optional<BoUpSLP::ScheduleData *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S) {
+  assert(!DisableScheduling && "Trying to schedule when disabled");
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
   if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
@@ -8171,6 +8236,9 @@

 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                 Value *OpValue) {
+  // Early return if scheduling is disabled.
+  if (DisableScheduling)
+    return;
   if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
       doesNotNeedToSchedule(VL))
     return;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -157,6 +157,8 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -192,6 +192,8 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
@@ -76,11 +76,11 @@
 ; AVX-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]])
 ; AVX-NEXT:    [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2
+; AVX-NEXT:    [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3
+; AVX-NEXT:    [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3
+; AVX-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2
 ; AVX-NEXT:    [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2
-; AVX-NEXT:    [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2
 ; AVX-NEXT:    [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4
 ; AVX-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2
@@ -88,36 +88,36 @@
 ; AVX-NEXT:    [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]])
 ; AVX-NEXT:    [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2
-; AVX-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2
+; AVX-NEXT:    [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7
+; AVX-NEXT:    [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7
+; AVX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2
+; AVX-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2
 ; AVX-NEXT:    [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6
-; AVX-NEXT:    [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6
-; AVX-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2
-; AVX-NEXT:    [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2
+; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
+; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48
+; AVX-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_1]], align 2
+; AVX-NEXT:    [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_1]], align 2
 ; AVX-NEXT:    [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]])
 ; AVX-NEXT:    [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64>
-; AVX-NEXT:    [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]],
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1
-; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]]
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
+; AVX-NEXT:    [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]],
+; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1
+; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[TMP17]]
+; AVX-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0
+; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[TMP18]]
 ; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64
 ; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]]
 ; AVX-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
-; AVX-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2
-; AVX-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2
+; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
+; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48
+; AVX-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_5]], align 2
+; AVX-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_5]], align 2
 ; AVX-NEXT:    [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]])
 ; AVX-NEXT:    [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64>
-; AVX-NEXT:    [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]],
-; AVX-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
-; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]]
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
+; AVX-NEXT:    [[TMP23:%.*]] = shl nuw nsw <2 x i64> [[TMP22]],
+; AVX-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
+; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[TMP24]]
+; AVX-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
+; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[TMP25]]
 ; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64
 ; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]]
 ; AVX-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -61,18 +61,19 @@
 define void @root_steering() {
 ; CHECK-LABEL: @root_steering(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[CHAIN2_2:%.*]] = fadd double 4.000000e-01, 5.000000e-01
-; CHECK-NEXT:    [[CHAIN2_1:%.*]] = fmul double 3.000000e-01, [[CHAIN2_2]]
-; CHECK-NEXT:    [[ROOT5:%.*]] = fadd double 2.000000e-01, [[CHAIN2_1]]
 ; CHECK-NEXT:    [[ROOT3:%.*]] = fmul double 3.000000e-01, 2.000000e-01
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[ROOT3]], 1.000000e-01
 ; CHECK-NEXT:    [[CHAINB_3:%.*]] = fadd double 3.000000e-01, 4.000000e-01
 ; CHECK-NEXT:    [[CHAINB_2:%.*]] = fmul double 2.000000e-01, [[CHAINB_3]]
-; CHECK-NEXT:    [[CHAINB_1:%.*]] = fadd double 1.000000e-01, [[CHAINB_2]]
-; CHECK-NEXT:    [[ROOT4:%.*]] = fmul double [[MUL]], [[CHAINB_1]]
-; CHECK-NEXT:    [[ROOT2:%.*]] = fadd double 1.000000e-01, [[ROOT4]]
-; CHECK-NEXT:    [[ROOT1:%.*]] = fmul double [[ROOT3]], [[ROOT5]]
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[ROOT1]], [[ROOT2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> , double [[CHAINB_2]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> , [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> , double [[MUL]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> , [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[ROOT1:%.*]] = fmul double [[ROOT3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[ROOT1]], [[TMP6]]
 ; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[DIV]], 3.000000e-01
 ; CHECK-NEXT:    ret void
 ;
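
For local experimentation, a minimal sketch of how the new knob could be exercised with opt. The file and function below are hypothetical and not part of this patch; only the -slp-root-look-ahead-max-size flag (default 5) is introduced here, and whether this particular input actually vectorizes depends on the target cost model.

; RUN: opt -passes=slp-vectorizer -slp-root-look-ahead-max-size=5 -mtriple=x86_64-unknown-linux -S %s

define double @root_probe(double %a, double %b, double %c, double %d) {
  ; Two parallel fadd/fmul chains feeding a scalar fdiv. When SLP probes
  ; candidate root pairs, it now builds a small throw-away tree (capped at
  ; the size given by the flag, with scheduling disabled) per candidate and
  ; keeps the rooting with the lowest estimated cost.
  %add0 = fadd double %a, %b
  %add1 = fadd double %c, %d
  %mul0 = fmul double %add0, %a
  %mul1 = fmul double %add1, %c
  %div = fdiv double %mul0, %mul1
  ret double %div
}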