Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4052,8 +4052,11 @@
     // getOrCreateVectorValue calls below.
     Builder.SetInsertPoint(NewPhi);
 
-    // The predecessor order is preserved and we can rely on mapping between
-    // scalar and vector block predecessors.
+    // The order of predecessors can differ between the OrigPhi BB and the
+    // NewPhi BB. Check the dominance of each incoming value and pair incoming
+    // values with predecessor BBs according to that dominance.
+    SmallVector<Value *, 2> NewIncomingValues;
+    bool NeedSwap = false;
     for (unsigned i = 0; i < NumIncomingValues; ++i) {
       BasicBlock *NewPredBB = VectorBBPredecessors[i];
 
@@ -4064,6 +4067,23 @@
 
       // Scalar incoming value may need a broadcast
       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+
+      // Check dominance of NewIncV with NewPredBB.
+      if (Instruction *NewIncI = dyn_cast<Instruction>(NewIncV)) {
+        if (!DT->dominates(NewIncI, NewPredBB)) {
+          NeedSwap = true;
+        }
+      }
+      NewIncomingValues.push_back(NewIncV);
+    }
+
+    if (NeedSwap) {
+      std::swap(NewIncomingValues[0], NewIncomingValues[1]);
+    }
+
+    for (unsigned i = 0; i < NumIncomingValues; ++i) {
+      BasicBlock *NewPredBB = VectorBBPredecessors[i];
+      Value *NewIncV = NewIncomingValues[i];
       NewPhi->addIncoming(NewIncV, NewPredBB);
     }
   }
Index: llvm/test/Transforms/LoopVectorize/vplan-wrong-vec-phi.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/vplan-wrong-vec-phi.ll
@@ -0,0 +1,88 @@
+; RUN: opt < %s -jump-threading -loop-vectorize -enable-vplan-native-path -S
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @matrixMulScalar([4 x i32]* nocapture readonly %src1, [4 x i32]* nocapture readonly %src2, [4 x i32]* nocapture %dest, i32 %n) local_unnamed_addr #0 { ; three nested loops: %indvar (outer), %indvars.iv60 (middle), %indvars.iv (inner)
+entry:
+  %cmp57 = icmp eq i32 %n, 0 ; skip the whole loop nest when %n is zero
+  br i1 %cmp57, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %wide.trip.count65 = zext i32 %n to i64 ; all three loop exit compares use this bound
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %indvar = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvar.next, %for.cond.cleanup3 ]
+  br i1 true, label %for.body4.lr.ph, label %for.cond1.preheader.for.cond.cleanup3_crit_edge ; constant-true condition: the crit_edge successor is never taken
+
+for.cond1.preheader.for.cond.cleanup3_crit_edge:  ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup3
+
+for.body4.lr.ph:                                  ; preds = %for.cond1.preheader
+  br label %for.body4
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.cond.cleanup3.loopexit:                       ; preds = %for.cond.cleanup9
+  br label %for.cond.cleanup3
+
+for.cond.cleanup3:                                ; preds = %for.cond1.preheader.for.cond.cleanup3_crit_edge, %for.cond.cleanup3.loopexit
+  %indvar.next = add nuw nsw i64 %indvar, 1
+  %exitcond66 = icmp eq i64 %indvar.next, %wide.trip.count65
+  br i1 %exitcond66, label %for.cond.cleanup.loopexit, label %for.cond1.preheader, !llvm.loop !2 ; outer-loop latch; !2/!3 attach llvm.loop.vectorize.enable to this loop
+
+for.body4:                                        ; preds = %for.cond.cleanup9, %for.body4.lr.ph
+  %indvars.iv60 = phi i64 [ 0, %for.body4.lr.ph ], [ %indvars.iv.next61, %for.cond.cleanup9 ]
+  %arrayidx6 = getelementptr inbounds [4 x i32], [4 x i32]* %dest, i64 %indvar, i64 %indvars.iv60
+  store i32 0, i32* %arrayidx6, align 4, !tbaa !4 ; zero dest[%indvar][%indvars.iv60] before the inner loop stores sums into it
+  br i1 true, label %for.body10.preheader, label %for.body4.for.cond.cleanup9_crit_edge ; constant-true condition: the crit_edge successor is never taken
+
+for.body4.for.cond.cleanup9_crit_edge:            ; preds = %for.body4
+  br label %for.cond.cleanup9
+
+for.body10.preheader:                             ; preds = %for.body4
+  br label %for.body10
+
+for.cond.cleanup9.loopexit:                       ; preds = %for.body10
+  br label %for.cond.cleanup9
+
+for.cond.cleanup9:                                ; preds = %for.body4.for.cond.cleanup9_crit_edge, %for.cond.cleanup9.loopexit
+  %indvars.iv.next61 = add nuw nsw i64 %indvars.iv60, 1
+  %exitcond63 = icmp eq i64 %indvars.iv.next61, %wide.trip.count65
+  br i1 %exitcond63, label %for.cond.cleanup3.loopexit, label %for.body4
+
+for.body10:                                       ; preds = %for.body10, %for.body10.preheader
+  %0 = phi i32 [ 0, %for.body10.preheader ], [ %add, %for.body10 ] ; running sum carried across inner-loop iterations
+  %indvars.iv = phi i64 [ 0, %for.body10.preheader ], [ %indvars.iv.next, %for.body10 ]
+  %arrayidx18 = getelementptr inbounds [4 x i32], [4 x i32]* %src1, i64 %indvar, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx18, align 4, !tbaa !4
+  %arrayidx22 = getelementptr inbounds [4 x i32], [4 x i32]* %src2, i64 %indvars.iv, i64 %indvars.iv60
+  %2 = load i32, i32* %arrayidx22, align 4, !tbaa !4
+  %mul = mul nsw i32 %2, %1
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx6, align 4, !tbaa !4 ; the updated sum is written back to %arrayidx6 on every iteration
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count65
+  br i1 %exitcond, label %for.cond.cleanup9.loopexit, label %for.body10
+}
+
+attributes #0 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 37309fb02f60557f18971dc575904c0fc56c91ab)"}
+!2 = distinct !{!2, !3} ; loop ID referenced by the !llvm.loop attachment in @matrixMulScalar
+!3 = !{!"llvm.loop.vectorize.enable", i1 true} ; vectorization hint carried by loop ID !2
+!4 = !{!5, !5, i64 0} ; TBAA access tag used by the i32 loads and stores
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+