diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -514,7 +514,7 @@
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
-                           Value *StartV, VPValue *Def,
+                           VPValue *StartV, VPValue *Def,
                            VPTransformState &State);
 
   /// A helper function to scalarize a single Instruction in the innermost loop.
@@ -4517,16 +4517,9 @@
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
   for (PHINode *OrigPhi : OrigPHIsToFix) {
-    PHINode *NewPhi =
-        cast<PHINode>(State.get(State.Plan->getVPValue(OrigPhi), 0));
-    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
-
-    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
-        predecessors(OrigPhi->getParent()));
-    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
-        predecessors(NewPhi->getParent()));
-    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
-           "Scalar and Vector BB should have the same number of predecessors");
+    VPWidenPHIRecipe *VPPhi =
+        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
+    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
 
     // The insertion point in Builder may be invalidated by the time we get
     // here. Force the Builder insertion point to something valid so that we do
@@ -4536,17 +4529,10 @@
 
     // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
-    for (unsigned i = 0; i < NumIncomingValues; ++i) {
-      BasicBlock *NewPredBB = VectorBBPredecessors[i];
-
-      // When looking up the new scalar/vector values to fix up, use incoming
-      // values from original phi.
-      Value *ScIncV =
-          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
-
-      // Scalar incoming value may need a broadcast
-      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
-      NewPhi->addIncoming(NewIncV, NewPredBB);
+    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+      VPValue *Inc = VPPhi->getIncomingValue(i);
+      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
     }
   }
 }
@@ -4624,7 +4610,7 @@
 
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                               RecurrenceDescriptor *RdxDesc,
-                                              Value *StartV, VPValue *Def,
+                                              VPValue *StartVPV, VPValue *Def,
                                               VPTransformState &State) {
   assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
   PHINode *P = cast<PHINode>(PN);
@@ -4646,6 +4632,7 @@
   assert(PN->getParent() == OrigLoop->getHeader() &&
          "Non-header phis should have been handled elsewhere");
 
+  Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
   // In order to support recurrences we need to be able to vectorize Phi nodes.
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -9049,10 +9036,8 @@
 }
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
-  Value *StartV =
-      getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
-                                 StartV, this, State);
+                                 getStartValue(), this, State);
 }
 
 void VPBlendRecipe::execute(VPTransformState &State) {
@@ -9316,8 +9301,11 @@
   if (hasVectorValue(Def, Part))
     return Data.PerPartOutput[Def][Part];
 
-  if (!hasScalarValue(Def, {Part, 0}))
-    return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+  if (!hasScalarValue(Def, {Part, 0})) {
+    Value *V =
+        Def->getLiveInIRValue() ? Def->getLiveInIRValue() : VPValue2Value[Def];
+    return Callback.getOrCreateVectorValues(V, Part);
+  }
 
   Value *ScalarValue = get(Def, {Part, 0});
   // If we aren't vectorizing, we can just copy the scalar map values over
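Note: the fixNonInductionPHIs() change above no longer pairs scalar and vector predecessors by position; it asks the recipe for explicit (incoming value, incoming block) pairs and translates each pair through State.get() and State.CFG.VPBB2IRBB. The following is a minimal, self-contained model of that translation step, using std::map and std::string as stand-ins for the VPlan and IR types (illustrative only, not LLVM API):

  // Illustrative model only: std::string/std::map stand in for VPValue,
  // VPBasicBlock, Value and BasicBlock; none of this is LLVM API.
  #include <cassert>
  #include <map>
  #include <string>
  #include <utility>
  #include <vector>

  struct WidenedPhiModel {
    // Pairs of (incoming VPlan value, incoming VPlan block), kept in sync.
    std::vector<std::pair<std::string, std::string>> Incomings;
  };

  // Maps playing the role of State.get(...) and State.CFG.VPBB2IRBB.
  using ValueMap = std::map<std::string, std::string>;
  using BlockMap = std::map<std::string, std::string>;

  // Rebuild the IR-level phi from the recipe's explicit pairs instead of
  // walking the predecessors of the generated block.
  std::vector<std::pair<std::string, std::string>>
  fixPhi(const WidenedPhiModel &Phi, const ValueMap &VPV2IR,
         const BlockMap &VPBB2IRBB) {
    std::vector<std::pair<std::string, std::string>> IRIncomings;
    for (const auto &[Inc, VPBB] : Phi.Incomings)
      IRIncomings.emplace_back(VPV2IR.at(Inc), VPBB2IRBB.at(VPBB));
    return IRIncomings;
  }

  int main() {
    WidenedPhiModel Phi{{{"%start", "vp.preheader"}, {"%next", "vp.latch"}}};
    ValueMap Values{{"%start", "zeroinitializer"}, {"%next", "%vec.next"}};
    BlockMap Blocks{{"vp.preheader", "vector.body"}, {"vp.latch", "inner.latch"}};
    auto IR = fixPhi(Phi, Values, Blocks);
    assert(IR[0].second == "vector.body" && IR[1].second == "inner.latch");
    return 0;
  }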
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1040,12 +1040,17 @@
 /// A recipe for handling all phi nodes except for integer and FP inductions.
 /// For reduction PHIs, RdxDesc must point to the corresponding recurrence
 /// descriptor and the start value is the first operand of the recipe.
+/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
+/// managed in the recipe directly.
 class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
   PHINode *Phi;
 
   /// Descriptor for a reduction PHI.
   RecurrenceDescriptor *RdxDesc = nullptr;
 
+  /// List of incoming blocks. Only used in the VPlan native path.
+  SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+
 public:
   /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
@@ -1065,6 +1070,9 @@
   static inline bool classof(const VPDef *D) {
     return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
   }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVWidenPHISC;
+  }
 
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
@@ -1077,6 +1085,18 @@
   VPValue *getStartValue() {
     return getNumOperands() == 0 ? nullptr : getOperand(0);
   }
+
+  /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
+  void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
+    addOperand(IncomingV);
+    IncomingBlocks.push_back(IncomingBlock);
+  }
+
+  /// Returns the \p I th incoming VPValue.
+  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
+
+  /// Returns the \p I th incoming VPBasicBlock.
+  VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
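The new VPWidenPHIRecipe members above store incoming values in the recipe's operand list and incoming blocks in a parallel SmallVector, with addIncoming() keeping the two in lockstep so index I always names a matched pair. Below is a self-contained sketch of that storage scheme, with std::vector standing in for the LLVM containers; the names mirror the patch but nothing here is the actual implementation:

  #include <cassert>
  #include <cstddef>
  #include <vector>

  struct VPValueModel {};  // stand-in for VPValue
  struct VPBlockModel {};  // stand-in for VPBasicBlock

  // Parallel-array phi recipe: operand i pairs with incoming block i.
  class WidenPHIRecipeModel {
    std::vector<VPValueModel *> Operands;       // models the VPUser operand list
    std::vector<VPBlockModel *> IncomingBlocks; // models the new member

  public:
    void addIncoming(VPValueModel *V, VPBlockModel *BB) {
      Operands.push_back(V);        // models addOperand(IncomingV)
      IncomingBlocks.push_back(BB); // models IncomingBlocks.push_back(...)
    }
    VPValueModel *getIncomingValue(size_t I) const { return Operands[I]; }
    VPBlockModel *getIncomingBlock(size_t I) const { return IncomingBlocks[I]; }
    size_t getNumIncoming() const { return Operands.size(); }
  };

  int main() {
    VPValueModel Start, Next;
    VPBlockModel Preheader, Latch;
    WidenPHIRecipeModel Phi;
    Phi.addIncoming(&Start, &Preheader);
    Phi.addIncoming(&Next, &Latch);
    assert(Phi.getNumIncoming() == 2);
    assert(Phi.getIncomingValue(1) == &Next && Phi.getIncomingBlock(1) == &Latch);
    return 0;
  }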
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -94,13 +94,15 @@
 
   for (auto *Phi : PhisToFix) {
     assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
     VPValue *VPVal = IRDef2VPValue[Phi];
-    assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
-    auto *VPPhi = cast<VPInstruction>(VPVal);
+    assert(isa<VPWidenPHIRecipe>(VPVal) &&
+           "Expected WidenPHIRecipe for phi node.");
+    auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal);
     assert(VPPhi->getNumOperands() == 0 &&
            "Expected VPInstruction with no operands.");
 
-    for (Value *Op : Phi->operands())
-      VPPhi->addOperand(getOrCreateVPOperand(Op));
+    for (unsigned I = 0; I != Phi->getNumOperands(); ++I)
+      VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)),
+                         BB2VPBB[Phi->getIncomingBlock(I)]);
   }
 }
@@ -210,13 +212,13 @@
       continue;
     }
 
-    VPInstruction *NewVPInst;
+    VPValue *NewVPV;
     if (auto *Phi = dyn_cast<PHINode>(Inst)) {
       // Phi node's operands may have not been visited at this point. We create
       // an empty VPInstruction that we will fix once the whole plain CFG has
       // been built.
-      NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
-          Inst->getOpcode(), {} /*No operands*/, Inst));
+      NewVPV = new VPWidenPHIRecipe(Phi);
+      VPBB->appendRecipe(cast<VPWidenPHIRecipe>(NewVPV));
       PhisToFix.push_back(Phi);
     } else {
       // Translate LLVM-IR operands into VPValue operands and set them in the
@@ -227,11 +229,11 @@
 
       // Build VPInstruction for any arbitraty Instruction without specific
      // representation in VPlan.
-      NewVPInst = cast<VPInstruction>(
+      NewVPV = cast<VPInstruction>(
           VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
     }
 
-    IRDef2VPValue[Inst] = NewVPInst;
+    IRDef2VPValue[Inst] = NewVPV;
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -33,44 +33,65 @@
     // Introduce each ingredient into VPlan.
     for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
       VPRecipeBase *Ingredient = &*I++;
+      VPValue *VPV = Ingredient->getVPValue();
       // Can only handle VPInstructions.
-      VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
-      Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
       if (DeadInstructions.count(Inst)) {
         VPValue DummyValue;
-        VPInst->replaceAllUsesWith(&DummyValue);
+        VPV->replaceAllUsesWith(&DummyValue);
         Ingredient->eraseFromParent();
         continue;
       }
 
       VPRecipeBase *NewRecipe = nullptr;
-      // Create VPWidenMemoryInstructionRecipe for loads and stores.
-      if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
-        NewRecipe = new VPWidenMemoryInstructionRecipe(
-            *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
-            nullptr /*Mask*/);
-      else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
-        NewRecipe = new VPWidenMemoryInstructionRecipe(
-            *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
-            Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
-      else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Ingredient)) {
+        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
         InductionDescriptor II = Inductions.lookup(Phi);
         if (II.getKind() == InductionDescriptor::IK_IntInduction ||
             II.getKind() == InductionDescriptor::IK_FpInduction) {
           VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
           NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
-        } else
+        } else {
           NewRecipe = new VPWidenPHIRecipe(Phi);
-      } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-        NewRecipe = new VPWidenGEPRecipe(
-            GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
-      } else
-        NewRecipe =
-            new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+          for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+            VPValue *Inc = VPPhi->getIncomingValue(i);
+            VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+            cast<VPWidenPHIRecipe>(NewRecipe)->addIncoming(Inc, VPBB);
+          }
+        }
+      } else {
+        assert(isa<VPInstruction>(Ingredient) &&
+               "only VPInstructions expected here");
+        assert(!isa<PHINode>(Inst) && "phis should be handled above");
+        // Create VPWidenMemoryInstructionRecipe for loads and stores.
+        if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+          NewRecipe = new VPWidenMemoryInstructionRecipe(
+              *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+              nullptr /*Mask*/);
+        else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+          NewRecipe = new VPWidenMemoryInstructionRecipe(
+              *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+              Plan->getOrAddVPValue(Store->getValueOperand()),
+              nullptr /*Mask*/);
+        else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+          InductionDescriptor II = Inductions.lookup(Phi);
+          if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+              II.getKind() == InductionDescriptor::IK_FpInduction) {
+            VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+            NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
+          } else
+            NewRecipe = new VPWidenPHIRecipe(Phi);
+        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+          NewRecipe = new VPWidenGEPRecipe(
+              GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+        } else
+          NewRecipe =
+              new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+      }
 
       NewRecipe->insertBefore(Ingredient);
       if (NewRecipe->getNumDefinedValues() == 1)
-        VPInst->replaceAllUsesWith(NewRecipe->getVPValue());
+        VPV->replaceAllUsesWith(NewRecipe->getVPValue());
       else
         assert(NewRecipe->getNumDefinedValues() == 0 &&
                "Only recpies with zero or one defined values expected");
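The test updates below cover the behavioural point of the change: a widened phi's incoming values must stay attached to their incoming blocks even when the scalar IR's use-list order (and therefore the order produced by predecessors()) differs from the order of the generated vector blocks, which is the PR45958 scenario. As a rough, self-contained illustration of why positional matching is fragile while explicit pairs are not (plain C++ with hypothetical names, no LLVM types):

  #include <cassert>
  #include <string>
  #include <utility>
  #include <vector>

  using Incoming = std::pair<std::string /*value*/, std::string /*block*/>;

  // Fragile scheme: assume the i-th predecessor of the new block corresponds
  // to the i-th predecessor of the old block. Breaks if either is permuted.
  std::vector<Incoming>
  rebuildByPosition(const std::vector<Incoming> &OldPhi,
                    const std::vector<std::string> &NewPreds) {
    std::vector<Incoming> Out;
    for (size_t I = 0; I < OldPhi.size(); ++I)
      Out.push_back({OldPhi[I].first, NewPreds[I]});
    return Out;
  }

  // Robust scheme: carry the (value, block) pair explicitly and only translate
  // the block name, so a permuted predecessor order cannot mismatch them.
  std::vector<Incoming> rebuildFromPairs(const std::vector<Incoming> &OldPhi) {
    std::vector<Incoming> Out;
    for (const Incoming &Inc : OldPhi)
      Out.push_back({Inc.first, "vec." + Inc.second});
    return Out;
  }

  int main() {
    std::vector<Incoming> Phi = {{"0", "preheader"}, {"%next", "latch"}};
    // Predecessor order of the new block happens to be reversed.
    std::vector<std::string> NewPreds = {"vec.latch", "vec.preheader"};
    auto Bad = rebuildByPosition(Phi, NewPreds);
    auto Good = rebuildFromPairs(Phi);
    assert(Bad[0].second == "vec.latch");      // "0" wrongly tied to the latch
    assert(Good[0].second == "vec.preheader"); // pair preserved
    return 0;
  }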
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -32,7 +32,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
@@ -97,7 +97,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> <i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -33,7 +33,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
@@ -62,7 +62,7 @@
 ; AVX: br label %[[InnerLoop:.+]]
 
 ; AVX: [[InnerLoop]]:
-; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
 ; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -enable-vplan-native-path -loop-vectorize -S %s | FileCheck %s
+
+; Make sure phi nodes are generated correctly, even if the use list order of
+; the predecessors in the scalar code does not match the order in the generated
+; vector blocks.
+
+; Test from PR45958.
+
+define void @test([2000 x i32]* %src, i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH5:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH5]] ]
+; CHECK-NEXT: br label [[LOOP_2_HEADER1:%.*]]
+; CHECK: loop.2.header1:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH4:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_32:%.*]]
+; CHECK: loop.32:
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], [2000 x i32]* [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[TMP1]], <4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI3]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH4]], label [[LOOP_32]]
+; CHECK: loop.2.latch4:
+; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_1_LATCH5]], label [[LOOP_2_HEADER1]]
+; CHECK: loop.1.latch5:
+; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]]
+; CHECK: loop.1.header:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_2_HEADER:%.*]]
+; CHECK: loop.2.header:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ 0, [[LOOP_1_HEADER]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_3:%.*]]
+; CHECK: loop.3:
+; CHECK-NEXT: [[IV_3:%.*]] = phi i64 [ 0, [[LOOP_2_HEADER]] ], [ [[IV_3_NEXT:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds [2000 x i32], [2000 x i32]* [[SRC]], i64 [[IV_1]], i64 [[IV_3]]
+; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[L1]], 10
+; CHECK-NEXT: store i32 [[MUL]], i32* [[GEP_SRC]], align 4
+; CHECK-NEXT: [[IV_3_NEXT]] = add nuw nsw i64 [[IV_3]], 1
+; CHECK-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_3_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_3]], label [[LOOP_2_LATCH]], label [[LOOP_3]]
+; CHECK: loop.2.latch:
+; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 1
+; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i64 [[IV_2_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP_1_LATCH]], label [[LOOP_2_HEADER]]
+; CHECK: loop.1.latch:
+; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
+; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], [[LOOP2:!llvm.loop !.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop.1.header
+
+loop.1.header:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop.1.latch ]
+  br label %loop.2.header
+
+loop.2.header:
+  %iv.2 = phi i64 [ 0, %loop.1.header ], [ %iv.2.next, %loop.2.latch ]
+  br label %loop.3
+
+loop.3:
+  %iv.3 = phi i64 [ 0, %loop.2.header ], [ %iv.3.next, %loop.3 ]
+  %gep.src = getelementptr inbounds [2000 x i32], [2000 x i32]* %src, i64 %iv.1, i64 %iv.3
+  %l1 = load i32, i32* %gep.src, align 4
+  %mul = mul nsw i32 %l1, 10
+  store i32 %mul, i32* %gep.src, align 4
+  %iv.3.next = add nuw nsw i64 %iv.3, 1
+  %ec.3 = icmp eq i64 %iv.3.next, %n
+  br i1 %ec.3, label %loop.2.latch, label %loop.3
+
+loop.2.latch:
+  %iv.2.next = add nuw nsw i64 %iv.2, 1
+  %ec.2 = icmp eq i64 %iv.2.next, %n
+  br i1 %ec.2, label %loop.1.latch, label %loop.2.header
+
+loop.1.latch:
+  %iv.1.next = add nuw nsw i64 %iv.1, 1
+  %ec.1 = icmp eq i64 %iv.1.next, %n
+  br i1 %ec.1, label %exit, label %loop.1.header, !llvm.loop !0
+
+exit:                                             ; preds = %loop.1.latch
+  ret void
+
+; uselistorder directives
+  uselistorder label %loop.3, { 1, 0 }
+  uselistorder label %loop.2.header, { 1, 0 }
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -29,7 +29,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -35,8 +35,8 @@
 ; CHECK: br label %[[InnerForBody:.*]]
 
 ; CHECK: [[InnerForBody]]:
-; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
-; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ zeroinitializer, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ]
 ; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
 ; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
@@ -28,8 +28,8 @@
 ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
 
 ; CHECK: [[FOR2_HEADER]]:
-; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ], [ zeroinitializer, %vector.body ]
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ], [ %[[MASKED_GATHER1]], %vector.body ]
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
 ; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]]
 ; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]],
 ; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]],