diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -45,6 +45,7 @@ class TargetLibraryInfo; class TargetTransformInfo; class Value; +class WeakTrackingVH; /// A private "module" namespace for types and utilities used by this pass. /// These are implementation details and should not be used by clients. @@ -110,20 +111,25 @@ /// collected in GEPs. bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R); - /// Try to find horizontal reduction or otherwise vectorize a chain of binary - /// operators. + /// Try to find horizontal reduction or otherwise, collect instructions + /// for postponed vectorization attempts. + /// \a P if not null designates phi node the reduction is fed into + /// (with reduction operators \a V or one of its operands, in a basic block + /// \a BB). + /// \returns true if a horizontal reduction was matched and reduced. + /// \returns false if \a V is null or not an instruction, + /// or a horizontal reduction was not matched or not possible. + bool + vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, + slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI, + SmallVectorImpl &PostponedInsts); + + /// The convenience method version that also tries to vectorize postponed + /// binary operations. bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI); - /// Try to vectorize trees that start at insertvalue instructions. - bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, - slpvectorizer::BoUpSLP &R); - - /// Try to vectorize trees that start at insertelement instructions. 
- bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, - slpvectorizer::BoUpSLP &R); - /// Tries to vectorize constructs started from CmpInst, InsertValueInst or /// InsertElementInst instructions. bool vectorizeSimpleInstructions(InstSetVector &Instructions, BasicBlock *BB, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11559,6 +11559,8 @@ SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts, unsigned OperandOffset) { + assert((isa(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); do { Value *InsertedOperand = LastInsertInst->getOperand(1); Optional OperandIndex = @@ -11597,13 +11599,10 @@ TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts) { - - assert((isa(LastInsertInst) || - isa(LastInsertInst)) && - "Expected insertelement or insertvalue instruction!"); - assert((BuildVectorOpds.empty() && InsertElts.empty()) && "Expected empty result vectors!"); + if (!isa(LastInsertInst)) + return false; Optional AggregateSize = getAggregateSize(LastInsertInst); if (!AggregateSize) @@ -11689,28 +11688,19 @@ return false; } -/// Attempt to reduce a horizontal reduction. -/// If it is legal to match a horizontal reduction feeding the phi node \a P -/// with reduction operators \a Root (or one of its operands) in a basic block -/// \a BB, then check if it can be done. If horizontal reduction is not found -/// and root instruction is a binary operation, vectorization of the operands is -/// attempted. -/// \returns true if a horizontal reduction was matched and reduced or operands -/// of one of the binary instruction were vectorized. -/// \returns false if a horizontal reduction was not matched (or not possible) -/// or no vectorization of any binary operation feeding \a Root instruction was -/// performed. 
-static bool tryToVectorizeHorReductionOrInstOperands( - PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - const function_ref Vectorize) { +bool SLPVectorizerPass::vectorizeRootInstruction( + PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + SmallVectorImpl &PostponedInsts) { if (!ShouldVectorizeHor) return false; + auto *Root = dyn_cast_or_null(V); if (!Root) return false; + if (!isa(Root)) + P = nullptr; + if (Root->getParent() != BB || isa(Root)) return false; // Start analysis starting from Root instruction. If horizontal reduction is @@ -11722,24 +11712,21 @@ // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts. Compiler implements postanalysis of the - // CmpInsts so we can skip extra attempts in - // tryToVectorizeHorReductionOrInstOperands and save compile time. + // If a horizontal reduction was not matched or vectorized we collect + // instructions for possible later attempts for vectorization. 
std::queue> Stack; Stack.emplace(Root, 0); SmallPtrSet VisitedInstrs; - SmallVector PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, - Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) + if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -11781,9 +11768,8 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; - // Do not try to vectorize CmpInst operands, this is done separately. - // Final attempt for binop args vectorization should happen after the loop - // to try to find reductions. + // Do not collect CmpInst or InsertElementInst/InsertValueInst as their + // analysis is done separately. if (!isa(Inst)) PostponedInsts.push_back(Inst); } @@ -11801,61 +11787,20 @@ !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } - // Try to vectorized binops where reductions were not found. - for (Value *V : PostponedInsts) - if (auto *Inst = dyn_cast(V)) - if (!R.isDeleted(Inst)) - Res |= Vectorize(Inst, R); return Res; } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - auto *I = dyn_cast_or_null(V); - if (!I) - return false; - - if (!isa(I)) - P = nullptr; - // Try to match and vectorize a horizontal reduction. 
- auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { - return tryToVectorize(I, R); - }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, - *TLI, ExtraVectorization); -} - -bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, - BasicBlock *BB, BoUpSLP &R) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (!R.canMapToVector(IVI->getType(), DL)) - return false; - - SmallVector BuildVectorOpds; - SmallVector BuildVectorInsts; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) - return false; - - LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); - // Aggregate value is unlikely to be processed in vector register. - return tryToVectorizeList(BuildVectorOpds, R); -} - -bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, - BasicBlock *BB, BoUpSLP &R) { - SmallVector BuildVectorInsts; - SmallVector BuildVectorOpds; - SmallVector Mask; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of( - BuildVectorOpds, - [](Value *V) { return isa(V); }) && - isFixedVectorShuffle(BuildVectorOpds, Mask))) - return false; - - LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R); + SmallVector PostponedInsts; + bool Res = vectorizeRootInstruction(P, V, BB, R, TTI, PostponedInsts); + // Try to vectorize binops where reductions were not found. 
+ for (Value *Op : PostponedInsts) + if (auto *Inst = dyn_cast(Op)) + if (!R.isDeleted(Inst)) + Res |= tryToVectorize(Inst, R); + return Res; } template @@ -11991,16 +11936,48 @@ for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (auto *LastInsertValue = dyn_cast(I)) { - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - } else if (auto *LastInsertElem = dyn_cast(I)) { - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - } else if (isa(I)) { + if (isa(I)) { PostponedCmps.push_back(I); continue; } - // Try to find reductions in buildvector sequnces. - OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); + SmallVector BuildVectorOpds; + SmallVector BuildVectorInsts; + if (!findBuildAggregate(I, TTI, BuildVectorOpds, BuildVectorInsts)) + continue; + + // Try to find reductions in buildvector sequences. + SmallVector PostponedInsts; + for (Value *Op : BuildVectorOpds) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI, + PostponedInsts); + + if (isa(I)) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (!R.canMapToVector(I->getType(), DL)) + continue; + + LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *I << "\n"); + // Aggregate value is unlikely to be processed in vector register. + OpsChanged |= tryToVectorizeList(BuildVectorOpds, R); + + } + else if (isa(I)) { + SmallVector Mask; + if (all_of(BuildVectorOpds, + [](Value *V) { + return isa(V); + }) && + isFixedVectorShuffle(BuildVectorOpds, Mask)) + continue; + + LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *I << "\n"); + OpsChanged |= tryToVectorizeList(BuildVectorInsts, R); + } + // Try to vectorize postponed binops where reductions were not found. + for (Value *V : PostponedInsts) + if (auto *Inst = dyn_cast(V)) + if (!R.isDeleted(Inst)) + OpsChanged |= tryToVectorize(Inst, R); } if (AtTerminator) { // Try to find reductions first. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,102 +10,23 @@ define void @test(double* nocapture readonly %arg, double* nocapture readonly %arg1, double* nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1 -; CHECK-NEXT: [[LD1_0:%.*]] = load double, double* [[GEP1_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double*> poison, double* [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double*> [[TMP0]], <8 x double*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x double*> [[SHUFFLE]], <8 x i64> ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, double* [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 3 -; CHECK-NEXT: [[LD1_1:%.*]] = load double, double* [[GEP1_1]], align 8 -; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 1 -; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 17 -; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 5 -; CHECK-NEXT: [[LD1_2:%.*]] = load double, double* [[GEP1_2]], align 8 -; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 2 -; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 18 -; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 7 -; CHECK-NEXT: [[LD1_3:%.*]] = load double, double* [[GEP1_3]], align 8 -; CHECK-NEXT: [[GEP0_3:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 3 -; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr inbounds double, double* 
[[ARG1]], i64 19 -; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 9 -; CHECK-NEXT: [[LD1_4:%.*]] = load double, double* [[GEP1_4]], align 8 -; CHECK-NEXT: [[GEP0_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 4 -; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 20 -; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 11 -; CHECK-NEXT: [[LD1_5:%.*]] = load double, double* [[GEP1_5]], align 8 -; CHECK-NEXT: [[GEP0_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 5 -; CHECK-NEXT: [[GEP2_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 21 -; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 13 -; CHECK-NEXT: [[LD1_6:%.*]] = load double, double* [[GEP1_6]], align 8 -; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 6 -; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 22 -; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 15 -; CHECK-NEXT: [[LD1_7:%.*]] = load double, double* [[GEP1_7]], align 8 -; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 7 -; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 23 -; CHECK-NEXT: [[LD0_0:%.*]] = load double, double* [[ARG1]], align 8 -; CHECK-NEXT: [[LD2_0:%.*]] = load double, double* [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD0_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD2_0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD1_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[LD0_1:%.*]] = load double, double* [[GEP0_1]], align 8 
-; CHECK-NEXT: [[LD2_1:%.*]] = load double, double* [[GEP2_1]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD0_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[LD1_1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[LD0_2:%.*]] = load double, double* [[GEP0_2]], align 8 -; CHECK-NEXT: [[LD2_2:%.*]] = load double, double* [[GEP2_2]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[LD0_2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[LD2_2]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[LD1_2]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP15]] -; CHECK-NEXT: [[LD0_3:%.*]] = load double, double* [[GEP0_3]], align 8 -; CHECK-NEXT: [[LD2_3:%.*]] = load double, double* [[GEP2_3]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD0_3]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[LD2_3]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[LD1_3]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x double> [[TMP18]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x double> [[TMP16]], [[TMP21]] -; CHECK-NEXT: [[LD0_4:%.*]] = load double, double* [[GEP0_4]], align 8 -; CHECK-NEXT: [[LD2_4:%.*]] = 
load double, double* [[GEP2_4]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> poison, double [[LD0_4]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> [[TMP23]], double [[LD2_4]], i32 1 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[LD1_4]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <2 x double> [[TMP24]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP22]], [[TMP27]] -; CHECK-NEXT: [[LD0_5:%.*]] = load double, double* [[GEP0_5]], align 8 -; CHECK-NEXT: [[LD2_5:%.*]] = load double, double* [[GEP2_5]], align 8 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> poison, double [[LD0_5]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[TMP29]], double [[LD2_5]], i32 1 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[LD1_5]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <2 x double> [[TMP28]], [[TMP33]] -; CHECK-NEXT: [[LD0_6:%.*]] = load double, double* [[GEP0_6]], align 8 -; CHECK-NEXT: [[LD2_6:%.*]] = load double, double* [[GEP2_6]], align 8 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x double> poison, double [[LD0_6]], i32 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x double> [[TMP35]], double [[LD2_6]], i32 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[LD1_6]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <2 x double> [[TMP34]], [[TMP39]] -; CHECK-NEXT: [[LD0_7:%.*]] = load double, double* [[GEP0_7]], align 8 -; CHECK-NEXT: [[LD2_7:%.*]] = load double, 
double* [[GEP2_7]], align 8 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x double> poison, double [[LD0_7]], i32 0 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[LD2_7]], i32 1 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[LD1_7]], i32 1 -; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <2 x double> [[TMP42]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP40]], [[TMP45]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP1]], i32 8, <8 x i1> , <8 x double> undef) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARG1]] to <8 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, <8 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[GEP2_0]] to <8 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x double>, <8 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <8 x double> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP9]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, double* [[ARG2:%.*]], <2 x i64> -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[TMP46]], <2 x double*> [[P]], i32 8, <2 x i1> ) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[I143]], <2 x double*> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void ; entry: