diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4051,8 +4051,9 @@
     for (Value *V : VL)
       CommonAlignment =
           commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
-    if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
-                                CommonAlignment))
+    auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+    if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
+        !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
       return LoadsState::ScatterVectorize;
   }
 
@@ -5563,70 +5564,96 @@
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+  auto &&GetCostDiff =
+      [this, E, VL, CommonCost,
+       VL0](function_ref<InstructionCost(unsigned)> ScalarEltCost,
+            function_ref<InstructionCost(InstructionCost)> VectorCost) {
+        // Calculate the cost of this instruction.
+        InstructionCost ScalarCost = 0;
+        if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
+          // For some of the instructions no need to calculate cost for each
+          // particular instruction, we can use the cost of the single
+          // instruction x total number of scalar instructions.
+          ScalarCost = VL.size() *
+                       ScalarEltCost(std::distance(VL.begin(), find(VL, VL0)));
+        } else {
+          for (unsigned I = 0, Sz = VL.size(); I < Sz; ++I)
+            ScalarCost += ScalarEltCost(I);
+        }
+
+        InstructionCost VecCost = VectorCost(CommonCost);
+        LLVM_DEBUG(
+            dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
+        // Disable warnings for `this` and `E` are unused. Required for
+        // `dumpTreeCosts`.
+        (void)this;
+        (void)E;
+        return VecCost - ScalarCost;
+      };
   switch (ShuffleOrOp) {
-    case Instruction::PHI:
-      return 0;
+  case Instruction::PHI: {
+    // Count reused scalars.
+    InstructionCost ScalarCost = 0;
+    SmallPtrSet<const TreeEntry *, 4> CountedOps;
+    for (Value *V : VL) {
+      auto *PHI = dyn_cast<PHINode>(V);
+      if (!PHI)
+        continue;
 
-    case Instruction::ExtractValue:
-    case Instruction::ExtractElement: {
-      // The common cost of removal ExtractElement/ExtractValue instructions +
-      // the cost of shuffles, if required to resuffle the original vector.
-      if (NeedToShuffleReuses) {
-        unsigned Idx = 0;
-        for (unsigned I : E->ReuseShuffleIndices) {
-          if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *EE = cast<ExtractElementInst>(VL[I]);
-            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
-          } else {
-            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
-            ++Idx;
-          }
-        }
-        Idx = EntryVF;
-        for (Value *V : VL) {
-          if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *EE = cast<ExtractElementInst>(V);
-            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
-          } else {
-            --Idx;
-            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
-          }
-        }
-      }
-      if (ShuffleOrOp == Instruction::ExtractValue) {
-        for (unsigned I = 0, E = VL.size(); I < E; ++I) {
-          auto *EI = cast<Instruction>(VL[I]);
-          // Take credit for instruction that will become dead.
-          if (EI->hasOneUse()) {
-            Instruction *Ext = EI->user_back();
-            if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
-                all_of(Ext->users(),
-                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
-              // Use getExtractWithExtendCost() to calculate the cost of
-              // extractelement/ext pair.
-              CommonCost -= TTI->getExtractWithExtendCost(
-                  Ext->getOpcode(), Ext->getType(), VecTy, I);
-              // Add back the cost of s|zext which is subtracted separately.
-              CommonCost += TTI->getCastInstrCost(
-                  Ext->getOpcode(), Ext->getType(), EI->getType(),
-                  TTI::getCastContextHint(Ext), CostKind, Ext);
-              continue;
-            }
-          }
-          CommonCost -=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
-        }
-      } else {
-        AdjustExtractsCost(CommonCost);
+      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
+      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
+        Value *Op = PHI->getIncomingValue(I);
+        Operands[I] = Op;
       }
-      return CommonCost;
+      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
+        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
+          if (!OpTE->ReuseShuffleIndices.empty())
+            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
+                                            OpTE->Scalars.size());
     }
+
+    return CommonCost - ScalarCost;
+  }
+    case Instruction::ExtractValue:
+    case Instruction::ExtractElement:
+      return GetCostDiff(
+          [this, ShuffleOrOp, VL, ScalarTy, CostKind](unsigned Idx) {
+            auto *I = cast<Instruction>(VL[Idx]);
+            VectorType *SrcVecTy;
+            if (ShuffleOrOp == Instruction::ExtractElement) {
+              auto *EE = cast<ExtractElementInst>(I);
+              SrcVecTy = EE->getVectorOperandType();
+            } else {
+              auto *EV = cast<ExtractValueInst>(I);
+              Type *AggregateTy = EV->getAggregateOperand()->getType();
+              unsigned NumElts;
+              if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
+                NumElts = ATy->getNumElements();
+              else
+                NumElts = AggregateTy->getStructNumElements();
+              SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
+            }
+            if (I->hasOneUse()) {
+              Instruction *Ext = I->user_back();
+              if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+                  all_of(Ext->users(),
+                         [](User *U) { return isa<GetElementPtrInst>(U); })) {
+                // Use getExtractWithExtendCost() to calculate the cost of
+                // extractelement/ext pair.
+                InstructionCost Cost = TTI->getExtractWithExtendCost(
+                    Ext->getOpcode(), Ext->getType(), SrcVecTy,
+                    *getExtractIndex(I));
+                // Subtract the cost of s|zext which is subtracted separately.
+                Cost -= TTI->getCastInstrCost(
+                    Ext->getOpcode(), Ext->getType(), I->getType(),
+                    TTI::getCastContextHint(Ext), CostKind, Ext);
+                return Cost;
+              }
+            }
+            return TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                           SrcVecTy, *getExtractIndex(I));
+          },
+          [](InstructionCost CommonCost) { return CommonCost; });
     case Instruction::InsertElement: {
       assert(E->ReuseShuffleIndices.empty() &&
              "Unique insertelements only are expected.");
@@ -5699,79 +5726,76 @@
     case Instruction::UIToFP:
     case Instruction::Trunc:
     case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      Type *SrcTy = VL0->getOperand(0)->getType();
-      InstructionCost ScalarEltCost =
-          TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
-                                TTI::getCastContextHint(VL0), CostKind, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-
-      // Calculate the cost of this instruction.
-      InstructionCost ScalarCost = VL.size() * ScalarEltCost;
-
-      auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
-      InstructionCost VecCost = 0;
-      // Check if the values are candidates to demote.
-      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost = CommonCost + TTI->getCastInstrCost(
-                                   E->getOpcode(), VecTy, SrcVecTy,
-                                   TTI::getCastContextHint(VL0), CostKind, VL0);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return VecCost - ScalarCost;
-    }
+    case Instruction::BitCast:
+      return GetCostDiff(
+          [this, E, ScalarTy, VL, CostKind](unsigned Idx) {
+            auto *VI = cast<Instruction>(VL[Idx]);
+            return TTI->getCastInstrCost(
+                E->getOpcode(), ScalarTy, VI->getOperand(0)->getType(),
+                TTI::getCastContextHint(VI), CostKind, VI);
+          },
+          [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) {
+            Type *SrcTy = VL0->getOperand(0)->getType();
+            auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+            InstructionCost VecCost = 0;
+            // Check if the values are candidates to demote.
+            if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+              VecCost = CommonCost +
+                        TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+                                              TTI::getCastContextHint(VL0),
+                                              CostKind, VL0);
+            }
+            return VecCost;
+          });
     case Instruction::FCmp:
     case Instruction::ICmp:
     case Instruction::Select: {
-      // Calculate the cost of this instruction.
-      InstructionCost ScalarEltCost =
-          TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
-                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-
-      // Check if all entries in VL are either compares or selects with compares
-      // as condition that have the same predicates.
-      CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
-      bool First = true;
-      for (auto *V : VL) {
-        CmpInst::Predicate CurrentPred;
-        auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
-        if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
-             !match(V, MatchCmp)) ||
-            (!First && VecPred != CurrentPred)) {
-          VecPred = CmpInst::BAD_ICMP_PREDICATE;
-          break;
-        }
-        First = false;
-        VecPred = CurrentPred;
-      }
-
-      InstructionCost VecCost = TTI->getCmpSelInstrCost(
-          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
-      // Check if it is possible and profitable to use min/max for selects in
-      // VL.
-      //
-      auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
-      if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
-        IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
-                                          {VecTy, VecTy});
-        InstructionCost IntrinsicCost =
-            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
-        // If the selects are the only uses of the compares, they will be dead
-        // and we can adjust the cost by removing their cost.
-        if (IntrinsicAndUse.second)
-          IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
-                                                   MaskTy, VecPred, CostKind);
-        VecCost = std::min(VecCost, IntrinsicCost);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
+      CmpInst::Predicate VecPred, SwappedVecPred;
+      auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
+      if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
+          match(VL0, MatchCmp))
+        SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
+      else
+        SwappedVecPred = VecPred = CmpInst::BAD_ICMP_PREDICATE;
+      return GetCostDiff(
+          [this, E, ScalarTy, VL, CostKind, &VecPred,
+           &SwappedVecPred](unsigned Idx) {
+            auto *VI = cast<Instruction>(VL[Idx]);
+            CmpInst::Predicate CurrentPred;
+            auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+            if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
+                 !match(VI, MatchCmp)) ||
+                (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
+              VecPred = SwappedVecPred = CmpInst::BAD_ICMP_PREDICATE;
+
+            return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+                                           Builder.getInt1Ty(), CurrentPred,
+                                           CostKind, VI);
+          },
+          [this, VL, VL0, VecTy, E, CostKind,
+           &VecPred](InstructionCost CommonCost) {
+            auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+
+            InstructionCost VecCost = TTI->getCmpSelInstrCost(
+                E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
+            // Check if it is possible and profitable to use min/max for selects
+            // in VL.
+            //
+            auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
+            if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
+              IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
+                                                {VecTy, VecTy});
+              InstructionCost IntrinsicCost =
+                  TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+              // If the selects are the only uses of the compares, they will be
+              // dead and we can adjust the cost by removing their cost.
+              if (IntrinsicAndUse.second)
+                IntrinsicCost -= TTI->getCmpSelInstrCost(
+                    Instruction::ICmp, VecTy, MaskTy, VecPred, CostKind);
+              VecCost = std::min(VecCost, IntrinsicCost);
+            }
+            return VecCost + CommonCost;
+          });
     }
     case Instruction::FNeg:
     case Instruction::Add:
@@ -5791,137 +5815,125 @@
     case Instruction::AShr:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
+    case Instruction::GetElementPtr: {
+      unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr
+                            ? Instruction::Add
+                            : ShuffleOrOp;
       // Certain instructions can be cheaper to vectorize if they have a
       // constant second vector operand.
-      TargetTransformInfo::OperandValueKind Op1VK =
-          TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueKind Op2VK =
           TargetTransformInfo::OK_UniformConstantValue;
-      TargetTransformInfo::OperandValueProperties Op1VP =
-          TargetTransformInfo::OP_None;
       TargetTransformInfo::OperandValueProperties Op2VP =
           TargetTransformInfo::OP_PowerOf2;
-
-      // If all operands are exactly the same ConstantInt then set the
-      // operand kind to OK_UniformConstantValue.
-      // If instead not all operands are constants, then set the operand kind
-      // to OK_AnyValue. If all operands are constants but not the same,
-      // then set the operand kind to OK_NonUniformConstantValue.
       ConstantInt *CInt0 = nullptr;
-      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-        const Instruction *I = cast<Instruction>(VL[i]);
-        unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
-        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
-        if (!CInt) {
-          Op2VK = TargetTransformInfo::OK_AnyValue;
-          Op2VP = TargetTransformInfo::OP_None;
-          break;
-        }
-        if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
-            !CInt->getValue().isPowerOf2())
-          Op2VP = TargetTransformInfo::OP_None;
-        if (i == 0) {
-          CInt0 = CInt;
-          continue;
-        }
-        if (CInt0 != CInt)
-          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      }
-
-      SmallVector<const Value *, 4> Operands(VL0->operand_values());
-      InstructionCost ScalarEltCost =
-          TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
-                                      Op2VK, Op1VP, Op2VP, Operands, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecCost =
-          TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
-                                      Op2VK, Op1VP, Op2VP, Operands, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
-    }
-    case Instruction::GetElementPtr: {
-      TargetTransformInfo::OperandValueKind Op1VK =
-          TargetTransformInfo::OK_AnyValue;
-      TargetTransformInfo::OperandValueKind Op2VK =
-          TargetTransformInfo::OK_UniformConstantValue;
-
-      InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
-          Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecCost = TTI->getArithmeticInstrCost(
-          Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
-    }
-    case Instruction::Load: {
-      // Cost of wide load - cost of scalar loads.
-      Align Alignment = cast<LoadInst>(VL0)->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecLdCost;
-      if (E->State == TreeEntry::Vectorize) {
-        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
-                                         CostKind, VL0);
-      } else {
-        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
-        Align CommonAlignment = Alignment;
-        for (Value *V : VL)
-          CommonAlignment =
-              commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
-        VecLdCost = TTI->getGatherScatterOpCost(
-            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
-      return CommonCost + VecLdCost - ScalarLdCost;
-    }
-    case Instruction::Store: {
-      // We know that we can merge the stores. Calculate the cost.
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI =
-          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
-      Align Alignment = SI->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
-      InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecStCost = TTI->getMemoryOpCost(
-          Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
-      return CommonCost + VecStCost - ScalarStCost;
+      return GetCostDiff(
+          [this, ScalarTy, VL, CostKind, &Op2VK, &Op2VP, &CInt0,
+           Opcode](unsigned Idx) {
+            auto *VI = cast<Instruction>(VL[Idx]);
+            unsigned OpIdx = isa<BinaryOperator, GetElementPtrInst>(VI) ? 1 : 0;
+            TargetTransformInfo::OperandValueKind ScalarOp2VK =
+                TargetTransformInfo::OK_UniformConstantValue;
+            TargetTransformInfo::OperandValueProperties ScalarOp2VP =
+                TargetTransformInfo::OP_PowerOf2;
+            ConstantInt *CInt = dyn_cast<ConstantInt>(VI->getOperand(OpIdx));
+            if (!CInt) {
+              ScalarOp2VK = Op2VK = TargetTransformInfo::OK_AnyValue;
+              ScalarOp2VP = Op2VP = TargetTransformInfo::OP_None;
+            } else {
+              if (!CInt->getValue().isPowerOf2())
+                ScalarOp2VP = Op2VP = TargetTransformInfo::OP_None;
+              if (Op2VK == TargetTransformInfo::OK_UniformConstantValue) {
+                if (!CInt0)
+                  CInt0 = CInt;
+                else if (CInt0 != CInt)
+                  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+              }
+            }
+            SmallVector<const Value *> Operands(VI->operand_values());
+            return TTI->getArithmeticInstrCost(
+                Opcode, ScalarTy, CostKind, TargetTransformInfo::OK_AnyValue,
+                ScalarOp2VK, TargetTransformInfo::OP_None, ScalarOp2VP,
+                Operands, VI);
+          },
+          [this, VecTy, CostKind, E, VL0, &Op2VK, &Op2VP,
+           Opcode](InstructionCost CommonCost) {
+            if (Op2VK == TargetTransformInfo::OK_AnyValue &&
+                isSplat(E->getOperand(
+                    isa<BinaryOperator, GetElementPtrInst>(VL0) ? 1 : 0)))
+              Op2VK = TargetTransformInfo::OK_UniformValue;
+            return TTI->getArithmeticInstrCost(
+                       Opcode, VecTy, CostKind,
+                       TargetTransformInfo::OK_AnyValue, Op2VK,
+                       TargetTransformInfo::OP_None, Op2VP) +
+                   CommonCost;
+          });
     }
+    case Instruction::Load:
+      return GetCostDiff(
+          [this, ScalarTy, VL, CostKind](unsigned Idx) {
+            auto *VI = cast<LoadInst>(VL[Idx]);
+            return TTI->getMemoryOpCost(
+                Instruction::Load, ScalarTy, VI->getAlign(),
+                VI->getPointerAddressSpace(), CostKind, VI);
+          },
+          [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) {
+            auto *LI0 = cast<LoadInst>(VL0);
+            InstructionCost VecLdCost;
+            if (E->State == TreeEntry::Vectorize) {
+              VecLdCost = TTI->getMemoryOpCost(
+                  Instruction::Load, VecTy, LI0->getAlign(),
+                  LI0->getPointerAddressSpace(), CostKind);
+            } else {
+              assert(E->State == TreeEntry::ScatterVectorize &&
+                     "Unknown EntryState");
+              Align CommonAlignment = LI0->getAlign();
+              for (Value *V : VL)
+                CommonAlignment = commonAlignment(
+                    CommonAlignment, cast<LoadInst>(V)->getAlign());
+              VecLdCost = TTI->getGatherScatterOpCost(
+                  Instruction::Load, VecTy, LI0->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind);
+            }
+            return VecLdCost + CommonCost;
+          });
+    case Instruction::Store:
+      return GetCostDiff(
+          [this, ScalarTy, VL, CostKind](unsigned Idx) {
+            auto *VI = cast<StoreInst>(VL[Idx]);
+            return TTI->getMemoryOpCost(
+                Instruction::Store, ScalarTy, VI->getAlign(),
+                VI->getPointerAddressSpace(), CostKind, VI);
+          },
+          [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) {
+            // We know that we can merge the stores. Calculate the cost.
+            bool IsReorder = !E->ReorderIndices.empty();
+            auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()]
+                                                 : VL0);
+            return TTI->getMemoryOpCost(
+                       Instruction::Store, VecTy, SI->getAlign(),
+                       SI->getPointerAddressSpace(), CostKind) +
+                   CommonCost;
+          });
     case Instruction::Call: {
-      CallInst *CI = cast<CallInst>(VL0);
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-      // Calculate the cost of the scalar and vector calls.
-      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
-      InstructionCost ScalarEltCost =
-          TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
-
-      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
-      InstructionCost VecCallCost =
-          std::min(VecCallCosts.first, VecCallCosts.second);
-
-      LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
-                        << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-                        << " for " << *CI << "\n");
-
-      return CommonCost + VecCallCost - ScalarCallCost;
+      return GetCostDiff(
+          [this, VL, CostKind](unsigned Idx) {
+            auto *CI = cast<CallInst>(VL[Idx]);
+            Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+            if (ID != Intrinsic::not_intrinsic) {
+              IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+              return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+            }
+            return TTI->getCallInstrCost(
+                CI->getCalledFunction(), CI->getFunctionType()->getReturnType(),
+                CI->getFunctionType()->params(), CostKind);
+          },
+          [this, VL0, VecTy](InstructionCost CommonCost) {
+            auto *CI = cast<CallInst>(VL0);
+            auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+            return std::min(VecCallCosts.first, VecCallCosts.second) +
+                   CommonCost;
+          });
     }
     case Instruction::ShuffleVector: {
       assert(E->isAltShuffle() &&
@@ -5931,25 +5943,6 @@
                Instruction::isCast(E->getAltOpcode())) ||
               (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
              "Invalid Shuffle Vector Operand");
-      InstructionCost ScalarCost = 0;
-      if (NeedToShuffleReuses) {
-        for (unsigned Idx : E->ReuseShuffleIndices) {
-          Instruction *I = cast<Instruction>(VL[Idx]);
-          CommonCost -= TTI->getInstructionCost(I, CostKind);
-        }
-        for (Value *V : VL) {
-          Instruction *I = cast<Instruction>(V);
-          CommonCost += TTI->getInstructionCost(I, CostKind);
-        }
-      }
-      for (Value *V : VL) {
-        Instruction *I = cast<Instruction>(V);
-        assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
-        ScalarCost += TTI->getInstructionCost(I, CostKind);
-      }
-      // VecCost is equal to sum of the cost of creating 2 vectors
-      // and the cost of creating shuffle.
-      InstructionCost VecCost = 0;
       // Try to find the previous shuffle node with the same operands and same
       // main/alternate ops.
       auto &&TryFindNodeWithEqualOperands = [this, E]() {
@@ -5966,53 +5959,71 @@
         }
         return false;
       };
-      if (TryFindNodeWithEqualOperands()) {
-        LLVM_DEBUG({
-          dbgs() << "SLP: diamond match for alternate node found.\n";
-          E->dump();
-        });
-        // No need to add new vector costs here since we're going to reuse
-        // same main/alternate vector ops, just do different shuffling.
-      } else if (Instruction::isBinaryOp(E->getOpcode())) {
-        VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
-        VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
-                                               CostKind);
-      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
-        VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
-                                          Builder.getInt1Ty(),
-                                          CI0->getPredicate(), CostKind, VL0);
-        VecCost += TTI->getCmpSelInstrCost(
-            E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
-            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
-            E->getAltOp());
-      } else {
-        Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
-        Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
-        auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
-        auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
-        VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+      // Need to clear CommonCost since the final shuffle cost is included into
+      // vector cost.
+      return GetCostDiff(
+          [this, VL, CostKind, E](unsigned Idx) {
+            auto *VI = cast<Instruction>(VL[Idx]);
+            assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
+            (void)E;
+            return TTI->getInstructionCost(VI, CostKind);
+          },
+          [this, VecTy, E, CostKind, VL, FinalVecTy,
+           &TryFindNodeWithEqualOperands, VL0, ScalarTy](InstructionCost) {
+            // VecCost is equal to sum of the cost of creating 2 vectors
+            // and the cost of creating shuffle.
+            InstructionCost VecCost = 0;
+            if (TryFindNodeWithEqualOperands()) {
+              LLVM_DEBUG({
+                dbgs() << "SLP: diamond match for alternate node found.\n";
+                E->dump();
+              });
+              // No need to add new vector costs here since we're going to reuse
+              // same main/alternate vector ops, just do different shuffling.
+            } else if (Instruction::isBinaryOp(E->getOpcode())) {
+              VecCost =
+                  TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+              VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+                                                     CostKind);
+            } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+              VecCost = TTI->getCmpSelInstrCost(
+                  E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+                  CI0->getPredicate(), CostKind, VL0);
+              VecCost += TTI->getCmpSelInstrCost(
+                  E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+                  cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+                  E->getAltOp());
+            } else {
+              Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+              Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+              auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+              auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+              VecCost =
+                  TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
                                         TTI::CastContextHint::None, CostKind);
-        VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
-                                         TTI::CastContextHint::None, CostKind);
-      }
-
-      if (E->ReuseShuffleIndices.empty()) {
-        CommonCost =
-            TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
-      } else {
-        SmallVector<int> Mask;
-        buildShuffleEntryMask(
-            E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
-            [E](Instruction *I) {
-              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
-              return I->getOpcode() == E->getAltOpcode();
-            },
-            Mask);
-        CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                         FinalVecTy, Mask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
+              VecCost +=
+                  TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+                                        TTI::CastContextHint::None, CostKind);
+            }
+            SmallVector<int> Mask;
+            if (E->ReuseShuffleIndices.empty()) {
+              VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select,
+                                               FinalVecTy);
+            } else {
+              SmallVector<int> Mask;
+              buildShuffleEntryMask(
+                  E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+                  [E](Instruction *I) {
+                    assert(E->isOpcodeOrAlt(I) &&
+                           "Unexpected main/alternate opcode");
+                    return I->getOpcode() == E->getAltOpcode();
+                  },
+                  Mask);
+              VecCost += TTI->getShuffleCost(
+                  TargetTransformInfo::SK_PermuteTwoSrc, FinalVecTy, Mask);
+            }
+            return VecCost;
+          });
     }
     default:
       llvm_unreachable("Unknown instruction");
@@ -6445,11 +6456,6 @@
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;
 
-    // Already counted the cost for external uses when tried to adjust the cost
-    // for extractelements, no need to add it again.
-    if (isa<ExtractElementInst>(EU.Scalar))
-      continue;
-
     // If found user is an insertelement, do not calculate extract cost but try
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
@@ -107,25 +107,22 @@
 define void @select_ule_ugt_mix_4xi32(i32* %ptr, i32 %x) {
 ; CHECK-LABEL: @select_ule_ugt_mix_4xi32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ult i32 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 16383
-; CHECK-NEXT:    store i32 [[S_0]], i32* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ugt i32 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 16383
-; CHECK-NEXT:    store i32 [[S_1]], i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ult i32 [[L_2]], 16383
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 16383
-; CHECK-NEXT:    store i32 [[S_2]], i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ugt i32 [[L_3]], 16383
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 16383
-; CHECK-NEXT:    store i32 [[S_3]], i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[PTR:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 16383>, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> <i32 16383, i32 16383, i32 16383, i32 poison>, i32 [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt <4 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> [[TMP11]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP1]], <4 x i32> <i32 16383, i32 16383, i32 16383, i32 16383>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -53,13 +53,17 @@
 
 define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_extract(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3
-; CHECK-NEXT:    store i64 [[TMP4]], i64* [[LD]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT:    [[T0:%.*]] = load i64, i64* [[LD]], align 8
+; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT:    store i64 [[T0]], i64* [[ST]], align 8
+; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    store i64 [[T1]], i64* [[LD]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -33,8 +33,11 @@
 ; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @foo(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
-; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX512-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
+; AVX512-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
+; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ; AVX512-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
 ; AVX512-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -104,24 +104,20 @@
 ;
 ; AVX512F-LABEL: @gather_load_2(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512F-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX512F-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX512F-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX512F-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX512F-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512F-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
@@ -258,44 +254,36 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512F-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX512F-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX512F-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX512F-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX512F-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX512F-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX512F-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512F-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX512F-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512F-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX512F-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512F-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX512F-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512F-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX512F-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512F-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0
+; AVX512F-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1
+; AVX512F-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2
+; AVX512F-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
+; AVX512F-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
@@ -469,44 +457,36 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512F-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX512F-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX512F-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX512F-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX512F-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
 ; AVX512F-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX512F-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX512F-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512F-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX512F-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512F-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX512F-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX512F-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
+; AVX512F-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
+; AVX512F-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512F-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX512F-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX512F-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX512F-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX512F-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512F-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512F-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512F-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
+; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3
+; AVX512F-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -104,24 +104,20 @@
 ;
 ; AVX512F-LABEL: @gather_load_2(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512F-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX512F-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX512F-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX512F-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX512F-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512F-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
@@ -258,44 +254,36 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512F-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX512F-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX512F-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX512F-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX512F-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX512F-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX512F-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512F-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX512F-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512F-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX512F-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512F-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX512F-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512F-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX512F-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512F-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0
+; AVX512F-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1
+; AVX512F-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2
+; AVX512F-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
+; AVX512F-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
@@ -469,44 +457,36 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512F-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX512F-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX512F-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX512F-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX512F-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
 ; AVX512F-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX512F-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX512F-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512F-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX512F-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512F-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX512F-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX512F-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
+; AVX512F-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
+; AVX512F-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512F-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX512F-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX512F-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX512F-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX512F-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512F-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512F-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512F-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
+; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3
+; AVX512F-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
+; AVX512F-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
@@ -8,21 +8,22 @@
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP1]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP1]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP1]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP1]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+; CHECK-NEXT:    [[TTMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TTMP4]], i32 [[TMP4]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP4]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 [[TMP8]], [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = mul i32 [[OP_RDX1]], [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP65]]
 ;
 bb1:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -4,20 +4,32 @@
 define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
-; CHECK-NEXT:    ret i32 [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]]
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]]
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]]
+; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]]
+; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4
+; CHECK-NEXT:    [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
+; CHECK-NEXT:    ret i32 [[COND44]]
 ;
 entry:
   %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1