diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2195,14 +2195,6 @@ return false; } - if (Hints.getInterleave() > 1) { - // TODO: Interleave support is future work. - LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " - "outer loops.\n"); - Hints.emitRemarkWithHints(); - return false; - } - return true; } @@ -4019,16 +4011,30 @@ auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { - VPWidenPHIRecipe *VPPhi = dyn_cast(&P); - if (!VPPhi) - continue; - PHINode *NewPhi = cast(State.get(VPPhi, 0)); - // Make sure the builder has a valid insert point. - Builder.SetInsertPoint(NewPhi); - for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { - VPValue *Inc = VPPhi->getIncomingValue(i); - VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + if (auto *VPPhi = dyn_cast(&P)) { + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + PHINode *NewPhi = cast(State.get(VPPhi, Part)); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, Part), + State.CFG.VPBB2IRBB[VPBB]); + } + } + } + + if (auto *VPPhi = dyn_cast(&P)) { + PHINode *NewPhi = cast(State.get(VPPhi, VPIteration(0, 0))); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, VPIteration(0, 0)), + State.CFG.VPBB2IRBB[VPBB]); + } } } } @@ -7342,16 +7348,23 @@ // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment // doesn't have a cost model that can choose which plan to execute if // more than one is generated. -static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, - LoopVectorizationCostModel &CM) { +static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, + LoopVectorizationCostModel &CM) { unsigned WidestType; std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); - return WidestVectorRegBits / WidestType; + + TargetTransformInfo::RegisterKind RegKind = + TTI.enableScalableVectorization() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + return ElementCount::get(N, RegSize.isScalable()); } VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -7361,10 +7374,7 @@ // If the user doesn't provide a vectorization factor, determine a // reasonable one. 
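A note on the planner changes in this hunk and the next: determineVPlanVF now returns an ElementCount so that scalable targets get a `vscale x N` factor, and a user-requested scalable VF is rejected when the target cannot honour it. A minimal sketch of both rules, using only the standard ElementCount/TypeSize APIs; the helper names are illustrative and not part of the patch, and the planner code continues right after this note.

#include "llvm/Support/TypeSize.h"

// Divide the known-minimum register width by the widest scalar type width and
// carry over the scalable flag of the chosen register kind.
static llvm::ElementCount computeVF(llvm::TypeSize RegSize,
                                    unsigned WidestTypeBits) {
  unsigned MinLanes = RegSize.getKnownMinValue() / WidestTypeBits;
  return llvm::ElementCount::get(MinLanes, RegSize.isScalable());
}

// A scalable user VF is only usable when the target supports scalable vectors
// (or the force-support flag used for testing is set); otherwise the planner
// emits a remark and returns VectorizationFactor::Disabled().
static bool canUseUserVF(llvm::ElementCount UserVF, bool TargetHasScalable,
                         bool ForceScalable) {
  return !UserVF.isScalable() || TargetHasScalable || ForceScalable;
}

For example, a scalable 128-bit register with i32 as the widest type gives `vscale x 4`, while a fixed 256-bit register gives a fixed VF of 8.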
if (UserVF.isZero()) { - VF = ElementCount::getFixed(determineVPlanVF( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(), - CM)); + VF = determineVPlanVF(TTI, CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7373,6 +7383,19 @@ << "overriding computed VF.\n"); VF = ElementCount::getFixed(4); } + } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors) { + + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + OrigLoop->getStartLoc(), + OrigLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " cannot be used for outer-loop vectorization because the" + << " target does not support scalable vectors."; + }); + return VectorizationFactor::Disabled(); } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF.getKnownMinValue()) && @@ -8977,6 +9000,10 @@ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), CM.getTailFoldingStyle()); + VPlanTransforms::findAndReplaceUniformRecipes(*Plan, OrigLoop, *PSE.getSE(), + *LI); + VPlanTransforms::optimize(*Plan, *PSE.getSE()); + LLVM_DEBUG(Plan->dump()); return Plan; } @@ -9603,6 +9630,12 @@ CM.collectElementTypesForWidening(); + // The VPlan-native path does not have a cost model, so the only way to get + // a unroll factor is to query the loop vectorization hints. + unsigned UF = Hints.getInterleave(); + if (!UF) + UF = 1; + // Plan how to best vectorize, return the best VF and its cost. const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9618,10 +9651,10 @@ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + VF.Width, UF, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.executePlan(VF.Width, UF, BestPlan, LB, DT, false); } reportVectorization(ORE, L, VF, 1); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1116,6 +1116,7 @@ case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: return true; }; @@ -1514,6 +1515,50 @@ VPValue *getIncomingValue(unsigned I) { return getOperand(I); } }; +// A recipe for handling header phis that stay scalar in the vector loop. +// Only to be used in the VPlan native path, and only in inner loops, +// never the top-level loop of a VPlan. +class VPScalarPHIRecipe : public VPHeaderPHIRecipe { + /// List of incoming blocks. + SmallVector IncomingBlocks; + +public: + /// Create a new VPScalarPHIRecipe for \p Phi with start value \p Start. + VPScalarPHIRecipe(PHINode *Phi) + : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, Phi) {} + + ~VPScalarPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPScalarPHISC) + + /// Generate the phi nodes. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. + void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { + addOperand(IncomingV); + IncomingBlocks.push_back(IncomingBlock); + } + + /// Returns the \p I th incoming VPBasicBlock. + VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } + + /// Returns the \p I th incoming VPValue. + VPValue *getIncomingValue(unsigned I) { return getOperand(I); } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// A recipe for handling first-order recurrence phis. The start value is the /// first operand of the recipe and the incoming value from the backedge is the /// second operand. @@ -1975,6 +2020,9 @@ // Return whether the loaded-from / stored-to addresses are consecutive. bool isConsecutive() const { return Consecutive; } + // Mark the memory access of this recipe as beeing consecutive. + void makeConsecutive() { Consecutive = true; } + // Return whether the consecutive loaded/stored addresses are in reverse // order. bool isReverse() const { return Reverse; } @@ -3004,6 +3052,8 @@ return Rep->isUniform(); if (auto *GEP = dyn_cast(Def)) return all_of(GEP->operands(), isUniformAfterVectorization); + if (isa(VPV)) + return true; return false; } } // end namespace vputils diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -795,7 +795,7 @@ VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. - if (isa(&R)) + if (isa(&R) || isa(&R)) continue; if (isa(&R) || diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -119,7 +119,7 @@ // Get or create a region for the loop containing BB. 
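The VPlanHCFGBuilder guard just below only creates a VPRegionBlock for loops whose depth is at least that of the loop being vectorized; blocks belonging to loops that enclose it stay in the parent region. A small sketch of the predicate (illustrative helper, not patch code), before the builder code continues:

#include "llvm/Analysis/LoopInfo.h"

// Loops shallower than TheLoop (i.e. loops enclosing the vectorized loop)
// do not get a region of their own.
static bool needsOwnRegion(const llvm::Loop *LoopOfBB,
                           const llvm::Loop *TheLoop) {
  return LoopOfBB && LoopOfBB->getLoopDepth() >= TheLoop->getLoopDepth();
}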
Loop *CurrentLoop = LI->getLoopFor(BB); VPRegionBlock *ParentR = nullptr; - if (CurrentLoop) { + if (CurrentLoop && CurrentLoop->getLoopDepth() >= TheLoop->getLoopDepth()) { auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); if (Iter.second) Iter.first->second = new VPRegionBlock( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -61,6 +61,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenSC: case VPWidenSelectSC: { const Instruction *I = @@ -95,6 +96,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenSC: case VPWidenSelectSC: { const Instruction *I = @@ -136,6 +138,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenPointerInductionSC: case VPWidenSC: case VPWidenSelectSC: { @@ -1643,10 +1646,12 @@ StartIdx = I; } } - Value *Op0 = State.get(getOperand(StartIdx), 0); - Type *VecTy = Op0->getType(); - Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); - State.set(this, VecPhi, 0); + + Type *VecTy = State.get(getOperand(StartIdx), 0)->getType(); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); + State.set(this, VecPhi, Part); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1670,6 +1675,46 @@ } #endif +void VPScalarPHIRecipe::execute(VPTransformState &State) { + assert(EnableVPlanNativePath && + "Non-native vplans are not expected to have VPScalarPHIRecipes."); + + // This recipe is used in outer-loop vectorization for the PHIs in + // the headers of inner loops. Only unifom loop nests are supported, + // control flow is always uniform. + + // Create a phi with no operands - the phi operands will be + // set at the end of vector code generation. + VPBasicBlock *Parent = getParent(); + VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); + unsigned StartIdx = 0; + // For phis in header blocks of loop regions, use the index of the value + // coming from the preheader to get the type. + if (LoopRegion->getEntryBasicBlock() == Parent) { + for (unsigned I = 0; I < getNumOperands(); ++I) { + if (getIncomingBlock(I) == + LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) + StartIdx = I; + } + } + + Type *Ty = State.get(getOperand(StartIdx), VPIteration(0, 0))->getType(); + Value *NewPhi = State.Builder.CreatePHI(Ty, 2, "scalar.phi"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, NewPhi, VPIteration(Part, 0)); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "SCALAR-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + // TODO: It would be good to use the existing VPWidenPHIRecipe instead and // remove VPActiveLaneMaskPHIRecipe. 
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ class PHINode; class ScalarEvolution; class Loop; +class LoopInfo; class PredicatedScalarEvolution; class TargetLibraryInfo; class VPBuilder; @@ -37,6 +38,14 @@ GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI); + /// Replace widening recipes where all users only use the first lane by + /// uniform VPReplicateRecipes. Also, check if memory accesses can be + /// marked as uniform or consecutive. This transformation is only usefull to + /// the VPlan-native path. + static void findAndReplaceUniformRecipes(VPlan &Plan, const Loop *TheLoop, + ScalarEvolution &SE, + const LoopInfo &LI); + /// Sink users of fixed-order recurrences after the recipe defining their /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions /// to combine the value from the recurrence phis and previous values. The diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -13,15 +13,19 @@ #include "VPlanTransforms.h" #include "VPRecipeBuilder.h" +#include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#define DEBUG_TYPE "loop-vectorize" + using namespace llvm; using namespace llvm::PatternMatch; @@ -96,6 +100,297 @@ } } +// Knowing that all recipes in ScalarUse have at least one user that only uses +// the first lane, find recipes that are widening but that can be replaced to +// only calculate the scalar value of the first lane. +static void +collectScalarizeableRecipes(SetVector &ScalarUse, + SetVector &Scalarizeable) { + // Return true if the value V is only used by recipes that only require + // the first lane or by VPWidenPHINodes, and false otherwise. + auto CheckUses = + [&](VPValue *V, + SmallSetVector &NonScalarPHIUses) -> bool { + for (VPUser *U : V->users()) { + if (U->onlyFirstLaneUsed(V)) + continue; + + if (auto *R = dyn_cast(U); R && Scalarizeable.contains(R)) + continue; + + if (auto *Phi = dyn_cast(U)) { + NonScalarPHIUses.insert(Phi); + continue; + } + + return false; + } + return true; + }; + + // If the Phi has a single non-scalar user that is a VPWidenPHIRecipe, + // return that Phi. Otherwise, return nullptr. + auto GetOnlyNonScalarUseOfPhi = + [&](VPWidenPHIRecipe *Phi) -> VPWidenRecipe * { + VPWidenRecipe *SingleNonScalarUse = nullptr; + for (VPUser *U : Phi->users()) { + if (U->onlyFirstLaneUsed(Phi) || + (isa(U) && + Scalarizeable.contains(cast(U)))) + continue; + + if (SingleNonScalarUse || !isa(U)) + return nullptr; + + SingleNonScalarUse = cast(U); + } + return SingleNonScalarUse; + }; + + SmallSetVector NonScalarPHIUses; + + // Start the worklist with all recipes that have at least one scalar use. 
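Before the implementation continues below, here is a self-contained sketch of the fixed-point worklist idea behind collectScalarizeableRecipes: seed with recipes that have a scalar use, mark a recipe scalarizable once every user either needs only lane 0 or is itself already scalarizable, then revisit its operands. The VPWidenPHIRecipe cycle-breaking in the real code is omitted, and the types are simplified stand-ins rather than VPlan classes.

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Users;
  std::vector<Node *> Operands;
  bool UsesOnlyLane0 = false; // this node only reads lane 0 of its operands
};

static std::set<Node *> collectScalarizable(const std::vector<Node *> &Seeds) {
  std::set<Node *> Scalarizable;
  std::vector<Node *> Worklist(Seeds.begin(), Seeds.end());
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (Scalarizable.count(N))
      continue;
    // N may stay scalar only if every user needs just lane 0 or is itself
    // known to be scalarizable.
    bool AllUsersScalar = true;
    for (Node *U : N->Users)
      if (!U->UsesOnlyLane0 && !Scalarizable.count(U))
        AllUsersScalar = false;
    if (!AllUsersScalar)
      continue;
    Scalarizable.insert(N);
    // Operands of a newly scalar node may now also become scalar; revisit them.
    for (Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return Scalarizable;
}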
+ SetVector Worklist(ScalarUse); + while (!Worklist.empty()) { + VPRecipeBase *R = Worklist.pop_back_val(); + VPValue *V = R->getVPSingleValue(); + if (!V || Scalarizeable.contains(R) || + !(isa(R) || isa(R) || + isa(R) || isa(R) || + isa(R))) + continue; + + LLVM_DEBUG(dbgs() << "LV: Poped worklist item: "; V->dump()); + + // Phi-nodes can create def-use chain cycles, so we look exactly one + // instruction ahead to know if its only non-scalar use could be scalarized + // if the PHI itself is scalar. This allowes scalarization of inner-loop + // induction variables. + if (auto *Phi = dyn_cast(R)) { + VPWidenRecipe *User = GetOnlyNonScalarUseOfPhi(Phi); + if (User && is_contained(User->users(), Phi) && + all_of(User->users(), [&](VPUser *U) -> bool { + return U == Phi || U->onlyFirstLaneUsed(User) || + (isa(U) && + Scalarizeable.contains(cast(U))); + })) { + + // The PHI can be scalarized! + LLVM_DEBUG(dbgs() << "LV: Scalarize: "; V->dump()); + Scalarizeable.insert(Phi); + for (VPValue *Op : R->operands()) + if (auto *OpR = Op->getDefiningRecipe()) + Worklist.insert(OpR); + } + } + + NonScalarPHIUses.clear(); + if (!CheckUses(V, NonScalarPHIUses)) + continue; + + // If absolutely all uses are scalar, add the recipe to the set of + // scalarizeable recipes and add everything it uses itself to the + // worklist (if that is a recipe that is not already in the set). + if (NonScalarPHIUses.empty()) { + LLVM_DEBUG(dbgs() << "LV: Scalarize: "; V->dump()); + Scalarizeable.insert(R); + for (VPValue *Op : R->operands()) + if (auto *OpR = Op->getDefiningRecipe()) + Worklist.insert(OpR); + continue; + } + + // Make sure all PHI + for (VPWidenPHIRecipe *UsingPhi : NonScalarPHIUses) { + // Add all users of the PHI to the worklist, except the current recipe. + for (VPUser *U : UsingPhi->users()) + if (auto *UR = dyn_cast(U); UR && UR != R) + Worklist.insert(UR); + + // Now add the PHI itself to the worklist. + Worklist.insert(UsingPhi); + } + } +} + +enum class MemAccessKind { Unknown, Uniform, Consecutive }; + +// Helper function for the VPlan-native path that returns what kind +// of memory access the pointer represents: Unknown, Uniform or Consecutive. +static MemAccessKind +checkMemoryAccessesForVPlanNativePath(VPValue *Ptr, Type *AccessTy, + ScalarEvolution &SE, const LoopInfo &LI, + const Loop *TheLoop) { + Value *V = Ptr->getUnderlyingValue(); + if (!V || !V->getType()->isPointerTy()) + return MemAccessKind::Unknown; + + const SCEV *PtrScev = SE.getSCEV(V); + if (isa(PtrScev)) + return MemAccessKind::Unknown; + + // Peel of recurrences around inner loops of TheLoop. + const SCEV *S = PtrScev; + while (true) { + if (auto *AR = dyn_cast(S)) { + // Stop when a recurrence around TheLoop was found, or when we hit a outer + // loop of TheLoop. + if (AR->getLoop() == TheLoop || AR->getLoop()->contains(TheLoop)) + break; + + // The step of a inner loop can be whatever it wants, as long as it + // does not depend on the current iteration of TheLoop. + const SCEV *Step = AR->getStepRecurrence(SE); + if (!SE.isLoopInvariant(Step, TheLoop)) + return MemAccessKind::Unknown; + + S = AR->getStart(); + continue; + } + + // Add's can be ignored if the value that is added is loop invariant. 
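The address classification below can be summarised compactly. This is a condensed sketch of the same logic as the surrounding function, minus the SCEVAddExpr peeling and the wrapping/inbounds checks, using only standard ScalarEvolution APIs; the full implementation follows.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"

enum class Kind { Unknown, Uniform, Consecutive };

static Kind classifyAddress(const llvm::SCEV *S, const llvm::Loop *TheLoop,
                            llvm::ScalarEvolution &SE, uint64_t EltSize) {
  using namespace llvm;
  // Peel recurrences that belong to loops nested inside TheLoop; their step
  // must not depend on the current iteration of TheLoop.
  while (auto *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    if (AR->getLoop() == TheLoop || AR->getLoop()->contains(TheLoop))
      break;
    if (!SE.isLoopInvariant(AR->getStepRecurrence(SE), TheLoop))
      return Kind::Unknown;
    S = AR->getStart();
  }
  // A recurrence around TheLoop with a constant step equal to the element
  // size is a consecutive access (inbounds/wrapping checks omitted here).
  if (auto *AR = dyn_cast<SCEVAddRecExpr>(S); AR && AR->getLoop() == TheLoop) {
    auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE));
    return Step && Step->getAPInt() == EltSize ? Kind::Consecutive
                                               : Kind::Unknown;
  }
  // Anything invariant in TheLoop is uniform across all lanes.
  return SE.isLoopInvariant(S, TheLoop) ? Kind::Uniform : Kind::Unknown;
}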
+ if (auto *Add = dyn_cast(S)) { + for (unsigned I = 1, N = Add->getNumOperands(); I < N; ++I) + if (!SE.isLoopInvariant(Add->getOperand(I), TheLoop)) + return MemAccessKind::Unknown; + + S = Add->getOperand(0); + continue; + } + + break; + } + + // If the unpeeled SCEV for the pointer is a recurrence around TheLoop, + // this memory access could be consecutive. + auto *AR = dyn_cast(S); + if (AR && AR->getLoop() == TheLoop) { + const auto *Step = dyn_cast(AR->getStepRecurrence(SE)); + if (!Step) + return MemAccessKind::Unknown; + + // Check if the step if equal to the size of the accessed elements. + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + TypeSize AllocSize = DL.getTypeAllocSize(AccessTy); + int64_t Size = AllocSize.getFixedValue(); + if (Step->getAPInt() != Size) + return MemAccessKind::Unknown; + + // The address calculation is not allowed to wrap. + if (auto *GEP = dyn_cast(V); GEP && GEP->isInBounds()) + return MemAccessKind::Consecutive; + + // Even if the address calculation is not explicitly marked as not wrapping, + // we can assume that it does not if the null pointer is undefined. + if (!NullPointerIsDefined(TheLoop->getHeader()->getParent(), + V->getType()->getPointerAddressSpace())) + return MemAccessKind::Consecutive; + + return MemAccessKind::Unknown; + } + + // If the unpeeled SCEV for the pointer is invariant to the vectorized loop, + // the access will be uniform accross all lanes. + return SE.isLoopInvariant(S, TheLoop) ? MemAccessKind::Uniform + : MemAccessKind::Unknown; +} + +void VPlanTransforms::findAndReplaceUniformRecipes(VPlan &Plan, + const Loop *TheLoop, + ScalarEvolution &SE, + const LoopInfo &LI) { + ReversePostOrderTraversal> RPOT( + Plan.getEntry()); + + // Helper function to replace a recipe by another one. + auto ReplaceRecipe = [](VPRecipeBase *OldRep, VPRecipeBase *NewRep) { + assert(NewRep->getNumDefinedValues() <= 1 && + OldRep->getNumDefinedValues() <= 1 && "unexpected number of values"); + NewRep->insertBefore(OldRep); + if (OldRep->getNumDefinedValues() == 1) + OldRep->getVPSingleValue()->replaceAllUsesWith( + NewRep->getVPSingleValue()); + OldRep->eraseFromParent(); + }; + + SetVector HasScalarUse; + + // Recipes are visited in reverse order because that minimizes the amount + // of work in collectScalarizeableRecipes() in the common cases. + for (VPBasicBlock *VPBB : + reverse(VPBlockUtils::blocksOnly(RPOT))) { + // The branch-on-cond terminator recipe only uses the first lane value. + if (auto *Br = dyn_cast_or_null(VPBB->getTerminator())) { + if (Br->getOpcode() == VPInstruction::BranchOnCond) + if (auto *R = Br->getOperand(0)->getDefiningRecipe()) + HasScalarUse.insert(R); + } + + for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + if (auto *MemRecipe = dyn_cast(&R)) { + VPValue *Ptr = MemRecipe->getAddr(); + Type *ETy = getLoadStoreType(&MemRecipe->getIngredient()); + MemAccessKind MemAccess = + checkMemoryAccessesForVPlanNativePath(Ptr, ETy, SE, LI, TheLoop); + + // Replace uniform loads by a replicating load, and check if the + // recipes used for the address calculation can be scalarized. 
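For reference, a uniform VPReplicateRecipe ultimately emits one scalar load per part, with a broadcast only where a vector user still needs the value; this is the load/insertelement/shufflevector sequence the updated tests below check for. A hedged sketch of that lowering with plain IRBuilder calls, independent of the VPlan plumbing, before the replacement code continues:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/TypeSize.h"

// One lane-0 load, then a splat for any remaining vector users.
static llvm::Value *emitUniformLoadThenSplat(llvm::IRBuilder<> &B,
                                             llvm::Type *EltTy,
                                             llvm::Value *ScalarPtr,
                                             llvm::ElementCount VF) {
  llvm::Value *Scalar = B.CreateLoad(EltTy, ScalarPtr);
  return B.CreateVectorSplat(VF, Scalar);
}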
+ if (MemAccess == MemAccessKind::Uniform && !MemRecipe->isStore()) { + LLVM_DEBUG(dbgs() << "LV: Uniform memory access: "; + MemRecipe->dump()); + assert(MemRecipe->getMask() == nullptr); + auto *UniformLoad = new VPReplicateRecipe( + &MemRecipe->getIngredient(), MemRecipe->operands(), true); + ReplaceRecipe(MemRecipe, UniformLoad); + if (auto *R = Ptr->getDefiningRecipe()) + HasScalarUse.insert(R); + continue; + } + + // Mark consecutive loads or stores as such, and check if the address + // calculation recipes can be scalarized. + if (MemAccess == MemAccessKind::Consecutive) { + LLVM_DEBUG(dbgs() << "LV: Consecutive memory access: "; + MemRecipe->dump()); + MemRecipe->makeConsecutive(); + if (auto *R = Ptr->getDefiningRecipe()) + HasScalarUse.insert(R); + continue; + } + + LLVM_DEBUG(dbgs() << "LV: Non-consecutive non-uniform memory access: "; + MemRecipe->dump()); + } + } + } + + // A set of recipes where only the value of lane zero is needed. + SetVector ScalarizeableRecipes; + collectScalarizeableRecipes(HasScalarUse, ScalarizeableRecipes); + + // Replace all the recipes that compute vectors by ones that + // only compute the fist lane. + for (VPRecipeBase *R : ScalarizeableRecipes) { + Instruction *I = R->getUnderlyingInstr(); + + // Handle PHIs: + if (auto *WidenPhi = dyn_cast(R)) { + auto *ScalarPhi = new VPScalarPHIRecipe(cast(I)); + for (unsigned I = 0, E = WidenPhi->getNumOperands(); I != E; I++) + ScalarPhi->addIncoming(WidenPhi->getIncomingValue(I), + WidenPhi->getIncomingBlock(I)); + ReplaceRecipe(R, ScalarPhi); + continue; + } + + // All other widening recipes can be replaced by VPReplicateRecipe + // instances that are marked as uniform. + assert(isa(R) || isa(R) || + isa(R) || isa(R)); + ReplaceRecipe(R, new VPReplicateRecipe(I, R->operands(), true)); + } +} + static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); bool Changed = false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -359,6 +359,7 @@ VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, + VPScalarPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, VPReductionPHISC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -16,35 +16,42 @@ ; } ; -; CHECK-LABEL: @foo_i32( -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] -; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> 
%[[VecIndTr2]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4 -; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body +; CHECK-LABEL: define void @foo_i32 +; CHECK-SAME: (i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %vector.ph + +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body + +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] + +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, %vector.body ], [ [[TMP8:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_INC]], label %[[FOR_INNER]] + +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %vector.body @arr2 = external global [8 x i32], align 16 @arr = external global [8 x [8 x i32]], align 16 @@ -83,33 +90,40 @@ ret void } -; CHECK-LABEL: @foo_i64( -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> poison, i64 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> poison, <2 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ , %vector.ph ], [ 
%[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], ptr @arrX, i64 0, <2 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %[[VecInd]], <2 x ptr> %[[AAddr]], i32 4, <2 x i1> ) -; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], ptr @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %[[StoreVal]], <2 x ptr> %[[AAddr2]], i32 4, <2 x i1> -; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 2 -; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body +; CHECK-LABEL: define void @foo_i64 +; CHECK-SAME: (i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label %vector.ph + +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body + +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i64], ptr @arrX, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] + +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, %vector.body ], [ [[TMP6:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x [8 x i64]], ptr @arrY, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], 8 +; CHECK-NEXT: br i1 [[TMP7]], label %[[FOR_INC]], label %[[FOR_INNER]] + +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %vector.body ; Function Attrs: norecurse nounwind uwtable define void @foo_i64(i64 %n) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll --- 
a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX @@ -18,68 +19,130 @@ ; } ; -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] -; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4 -; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body - -; AVX-LABEL: vector.ph: -; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> poison, <8 x i32> zeroinitializer - -; AVX-LABEL: vector.body: -; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; AVX: %[[VecInd:.*]] = phi <8 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <8 x i64> %[[VecInd]] -; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> -; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[VecIndTr]], <8 x ptr> %[[AAddr]], i32 4, <8 x i1> ) -; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> -; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]] -; AVX: br label %[[InnerLoop:.+]] - -; AVX: [[InnerLoop]]: -; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x 
i64> %[[VecInd]] -; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[StoreVal]], <8 x ptr> %[[AAddr2]], i32 4, <8 x i1> %[[InnerPhi]], -; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], -; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0 -; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; AVX: [[ForInc]]: -; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], -; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8 -; AVX: br i1 true, label %middle.block, label %vector.body - @arr2 = external global [8 x i32], align 16 @arr = external global [8 x [8 x i32]], align 16 ; Function Attrs: norecurse nounwind uwtable define void @foo(i32 %n) { +; CHECK-LABEL: define void @foo +; CHECK-SAME: (i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label [[FOR_BODY31:%.*]] +; CHECK: for.body31: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP8:%.*]], [[FOR_BODY31]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; CHECK-NEXT: br i1 [[TMP9]], label [[FOR_INC82]], label [[FOR_BODY31]] +; CHECK: for.inc82: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[INDVARS_IV21]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = 
trunc i64 [[INDVARS_IV21]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[N]] +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]] +; CHECK: for.inc8: +; CHECK-NEXT: [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1 +; CHECK-NEXT: [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8 +; CHECK-NEXT: br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end10: +; CHECK-NEXT: ret void +; +; AVX-LABEL: define void @foo +; AVX-SAME: (i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: entry: +; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[N]], i64 0 +; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ] +; AVX-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ] +; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; AVX-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AVX-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; AVX-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; AVX-NEXT: br label [[FOR_BODY31:%.*]] +; AVX: for.body31: +; AVX-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP8:%.*]], [[FOR_BODY31]] ] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; AVX-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; AVX-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; AVX-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; AVX-NEXT: br i1 [[TMP9]], label [[FOR_INC82]], label [[FOR_BODY31]] +; AVX: for.inc82: +; AVX-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], +; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX: middle.block: +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 +; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]] +; AVX: scalar.ph: +; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 
[[INDVARS_IV21]] +; AVX-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; AVX-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4 +; AVX-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; AVX-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[N]] +; AVX-NEXT: br label [[FOR_BODY3:%.*]] +; AVX: for.body3: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] +; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]] +; AVX-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4 +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]] +; AVX: for.inc8: +; AVX-NEXT: [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1 +; AVX-NEXT: [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8 +; AVX-NEXT: br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX: for.end10: +; AVX-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll --- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -117,13 +117,11 @@ } ; Case 3: Annotated outer loop WITH vector width and interleave information -; doesn't have to be collected. +; has to be collected. ; CHECK-LABEL: case3 -; CHECK-NOT: LV: Loop hints: force=enabled -; CHECK-NOT: LV: We can vectorize this outer loop! -; CHECK: LV: Loop hints: force=? -; CHECK: LV: Found a loop: inner.body +; CHECK: LV: Loop hints: force=enabled width=4 interleave=2 +; CHECK: LV: We can vectorize this outer loop! 
define void @case3(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { entry: diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll --- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll @@ -15,38 +15,33 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH5:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH5]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH4:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH4]] ] ; CHECK-NEXT: br label [[LOOP_2_HEADER1:%.*]] ; CHECK: loop.2.header1: -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH4:%.*]] ] +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH3:%.*]] ] ; CHECK-NEXT: br label [[LOOP_32:%.*]] ; CHECK: loop.32: -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], ptr [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], ptr [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], ; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> ) -; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI3]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH4]], label [[LOOP_32]] -; CHECK: loop.2.latch4: -; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_1_LATCH5]], label [[LOOP_2_HEADER1]] -; CHECK: loop.1.latch5: -; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], [[N]] +; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH3]], label [[LOOP_32]] +; CHECK: 
loop.2.latch3: +; CHECK-NEXT: [[TMP5]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], [[N]] +; CHECK-NEXT: br i1 [[TMP6]], label [[LOOP_1_LATCH4]], label [[LOOP_2_HEADER1]] +; CHECK: loop.1.latch4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -75,7 +70,7 @@ ; CHECK: loop.1.latch: ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll @@ -21,20 +21,22 @@ ; CHECK-LABEL: vector.body: ; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] ; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] +; CHECK: %[[IndAdd:.*]] = add i64 %[[Ind]], 0 +; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 %[[IndAdd]] ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[AAddrCpy:.*]] = getelementptr inbounds i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[VecIndTr]], ptr %[[AAddrCpy]], align 4 ; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] ; CHECK: br label %[[InnerLoop:.+]] ; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 +; CHECK: %[[InnerPhi:.*]] = phi i64 [ 0, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] +; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 %[[InnerPhi]], i64 %[[IndAdd]] +; CHECK: %[[AAddr2Cpy:.*]] = getelementptr inbounds i32, ptr %[[AAddr2]], i32 0 +; CHECK: store <4 x i32> %[[StoreVal]], ptr %[[AAddr2Cpy]], align 4 +; CHECK: %[[InnerPhiNext]] = add nuw nsw i64 %[[InnerPhi]], 1 +; CHECK: %[[InnerCond:.*]] = icmp eq i64 %[[InnerPhiNext]], 8 ; CHECK: br i1 
%[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] ; CHECK: [[ForInc]]: diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll @@ -22,30 +22,36 @@ ; CHECK-LABEL: vector.body: ; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] ; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @A, i64 0, <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[CSplat]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[tmp1:.*]] = add i64 %[[Ind]], 0 +; CHECK: %[[AAddr:.*]] = getelementptr [1024 x i32], ptr @A, i64 0, i64 %[[tmp1]] +; CHECK: %[[tmp2:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[CSplat]], ptr %[[tmp2]], align 4 ; CHECK: br i1 %[[ZeroTripChk]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]] ; CHECK: [[InnerForPh]]: -; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[AAddr]], i32 4, <4 x i1> , <4 x i32> poison) +; CHECK: %[[tmp4:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: %[[WideAVal:.*]] = load <4 x i32>, ptr %[[tmp4]], align 4 ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: br label %[[InnerForBody:.*]] ; CHECK: [[InnerForBody]]: -; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ zeroinitializer, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ] +; CHECK: %[[InnerInd:.*]] = phi i64 [ 0, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ] ; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ] -; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @B, i64 0, <4 x i64> %[[InnerInd]] -; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[BAddr]], i32 4, <4 x i1> , <4 x i32> poison) +; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %[[InnerInd]] +; CHECK: %[[tmp3:.*]] = load i32, ptr %[[BAddr]], align 4 +; CHECK: %[[tmp4:.*]] = insertelement <4 x i32> poison, i32 %[[tmp3]], i64 0 +; CHECK: %[[WideBVal:.*]] = shufflevector <4 x i32> %[[tmp4]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]] ; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]] -; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], -; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}} -; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0 + +; CHECK: %[[InnerIndNext]] = add nuw nsw i64 %[[InnerInd]], 1 +; CHECK: %[[InnerCond:.*]] = icmp eq i64 %[[InnerIndNext]], {{.*}} ; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]] ; CHECK: [[InnerCrit]]: ; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StorePhi]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[tmp5:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[StorePhi]], ptr %[[tmp5]], align 4 ; CHECK: br label %[[ForInc]] ; CHECK: [[ForInc]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll 
b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -20,30 +20,29 @@ ; CHECK: vector.body: ; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ] -; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ , %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ] -; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> , <4 x double> poison) -; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> , <4 x double> poison) +; CHECK-NEXT: %[[TMP1:.*]] = add i64 %[[FOR1_INDEX]], 0 +; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP2:.*]] = getelementptr inbounds double, ptr %[[A_PTR]], i32 0 +; CHECK-NEXT: %[[WIDE_LOAD1:.*]] = load <4 x double>, ptr %[[TMP2]], align 8 +; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP3:.*]] = getelementptr inbounds double, ptr %[[B_PTR]], i32 0 +; CHECK-NEXT: %[[WIDE_LOAD2:.*]] = load <4 x double>, ptr %[[TMP3]], align 8 ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]] ; CHECK: [[FOR2_HEADER]]: -; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]] -; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], -; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], -; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0 +; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi i32 [ 0, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ] +; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[WIDE_LOAD1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ] +; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[WIDE_LOAD2]], %[[REDUCTION]] +; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw i32 %[[FOR2_INDEX]], 1 +; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i32 %[[FOR2_INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}} ; CHECK: [[FOR1_LATCH]]: ; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> ) -; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], -; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], +; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP4:.*]] = getelementptr inbounds double, ptr %[[C_PTR]], i32 0 +; CHECK-NEXT: store <4 x double> %[[REDUCTION]], ptr %[[TMP4]], align 8 ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 
-; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]],
 ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
@@ -7,35 +7,35 @@
 define void @widen_call_instruction(ptr noalias nocapture readonly %a.in, ptr noalias nocapture readonly %b.in, ptr noalias nocapture %c.out) {
 ; CHECK-LABEL: @widen_call_instruction(
-; CHECK: vector.body:
-; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
-; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ , %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
-; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> , <4 x double> poison)
-; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> , <4 x double> poison)
-; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]])
-; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
-; CHECK: [[FOR2_HEADER]]:
-; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]]
-; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]],
-; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]],
-; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
-; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR1_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr %a.in, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr %b.in, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[WIDE_LOAD1]])
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
-; CHECK: [[FOR1_LATCH]]:
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> )
-; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]],
-; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]],
-; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
-; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]],
-; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i32 [ 0, %vector.body ], [ [[TMP7:%.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[WIDE_LOAD]], %vector.body ], [ [[TMP6:%.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[TMP6]] = fadd <4 x double> [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP7]] = add nuw nsw i32 [[SCALAR_PHI]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 10000
+; CHECK-NEXT: br i1 [[TMP8]], label %[[FOR1_LATCH]], label %[[FOR2_HEADER]]
+
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x double> [ [[TMP6]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr %c.out, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i32 0
+; CHECK-NEXT: store <4 x double> [[VEC_PHI4]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label %vector.body
 entry:
   br label %for1.header
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
@@ -10,24 +10,26 @@
 ; variables.
 define void @loop_invariant_select(ptr noalias nocapture %out, i1 %select, double %a, double %b) {
-; CHECK-LABEL: @loop_invariant_select(
+; CHECK-LABEL: define void @loop_invariant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], i1 [[SELECT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SELECT:%.*]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP1]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[SELECT]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[TMP3]], align 8
 entry:
   br label %for1.header
@@ -55,25 +57,28 @@
 }
 define void @outer_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @outer_loop_dependant_select(
+; CHECK-LABEL: define void @outer_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP4]], align 8
 entry:
   br label %for1.header
@@ -102,25 +107,27 @@
 }
 define void @inner_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @inner_loop_dependant_select(
+; CHECK-LABEL: define void @inner_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP4]], align 8
 entry:
   br label %for1.header
@@ -149,26 +156,29 @@
 }
 define void @outer_and_inner_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @outer_and_inner_loop_dependant_select(
+; CHECK-LABEL: define void @outer_and_inner_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i1>
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP3]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP5]], align 8
 entry:
   br label %for1.header