diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2193,14 +2193,6 @@
     return false;
   }
 
-  if (Hints.getInterleave() > 1) {
-    // TODO: Interleave support is future work.
-    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
-                         "outer loops.\n");
-    Hints.emitRemarkWithHints();
-    return false;
-  }
-
   return true;
 }
 
@@ -4159,13 +4151,15 @@
       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
       if (!VPPhi)
        continue;
-      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
-      // Make sure the builder has a valid insert point.
-      Builder.SetInsertPoint(NewPhi);
-      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
-        VPValue *Inc = VPPhi->getIncomingValue(i);
-        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
-        NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+
+      for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+        PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, Part));
+        Builder.SetInsertPoint(NewPhi);
+        for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+          VPValue *Inc = VPPhi->getIncomingValue(i);
+          VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+          NewPhi->addIncoming(State.get(Inc, Part), State.CFG.VPBB2IRBB[VPBB]);
+        }
       }
     }
   }
 
@@ -9843,6 +9837,12 @@
 
   CM.collectElementTypesForWidening();
 
+  // The VPlan-native path does not have a cost model, so the only way to get
+  // an unroll factor is to query the loop vectorization hints.
+  unsigned UF = Hints.getInterleave();
+  if (!UF)
+    UF = 1;
+
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
 
@@ -9858,10 +9858,10 @@
     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                              F->getParent()->getDataLayout());
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
-                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+                           VF.Width, UF, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                       << L->getHeader()->getParent()->getName() << "\"\n");
-    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
+    LVP.executePlan(VF.Width, UF, BestPlan, LB, DT, false);
   }
 
   // Mark the loop as already vectorized to avoid vectorizing again.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1332,10 +1332,12 @@
       StartIdx = I;
     }
   }
-  Value *Op0 = State.get(getOperand(StartIdx), 0);
-  Type *VecTy = Op0->getType();
-  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
-  State.set(this, VecPhi, 0);
+
+  Type *VecTy = State.get(getOperand(StartIdx), 0)->getType();
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
+    State.set(this, VecPhi, Part);
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
--- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -117,13 +117,11 @@
 }
 
 ; Case 3: Annotated outer loop WITH vector width and interleave information
-; doesn't have to be collected.
+; has to be collected.
 
 ; CHECK-LABEL: case3
-; CHECK-NOT: LV: Loop hints: force=enabled
-; CHECK-NOT: LV: We can vectorize this outer loop!
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Found a loop: inner.body
+; CHECK: LV: Loop hints: force=enabled width=4 interleave=2
+; CHECK: LV: We can vectorize this outer loop!
 
 define void @case3(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -force-vector-width=4 -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+
+@A = external local_unnamed_addr global [1024 x float], align 4
+@B = external local_unnamed_addr global [512 x float], align 4
+
+; Test that the VPlan-native path unrolls/interleaves the outer loop when requested via hints.
+define void @foo() {
+; CHECK-LABEL: define void @foo() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[OUTER_LOOP_LATCH9:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[OUTER_LOOP_LATCH9]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <4 x i64> [[STEP_ADD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    br label [[INNER_LOOP3:%.*]]
+; CHECK:       inner_loop3:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP7:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[WIDE_MASKED_GATHER2]], [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <4 x i64> [[VEC_PHI4]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[TMP4]] = fmul <4 x float> [[VEC_PHI5]], [[WIDE_MASKED_GATHER7]]
+; CHECK-NEXT:    [[TMP5]] = fmul <4 x float> [[VEC_PHI6]], [[WIDE_MASKED_GATHER8]]
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7]] = add nuw nsw <4 x i64> [[VEC_PHI4]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 512, i64 512, i64 512, i64 512>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 512, i64 512, i64 512, i64 512>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP10]], label [[OUTER_LOOP_LATCH9]], label [[INNER_LOOP3]]
+; CHECK:       outer_loop_latch9:
+; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP4]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP5]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[VEC_PHI10]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[VEC_PHI11]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i64> [[TMP11]], <i64 1024, i64 1024, i64 1024, i64 1024>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq <4 x i64> [[TMP12]], <i64 1024, i64 1024, i64 1024, i64 1024>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECK:       outer_loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[OUTER_LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 [[I]]
+; CHECK-NEXT:    [[X_START:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECK:       inner_loop:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ 0, [[OUTER_LOOP]] ], [ [[J_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    [[X:%.*]] = phi float [ [[X_START]], [[OUTER_LOOP]] ], [ [[X_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, i64 [[J]]
+; CHECK-NEXT:    [[B:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[X_NEXT]] = fmul float [[X]], [[B]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], 512
+; CHECK-NEXT:    br i1 [[INNER_EXITCOND]], label [[OUTER_LOOP_LATCH]], label [[INNER_LOOP]]
+; CHECK:       outer_loop_latch:
+; CHECK-NEXT:    [[X_NEXT_LCSSA:%.*]] = phi float [ [[X_NEXT]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    store float [[X_NEXT_LCSSA]], ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[OUTER_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[OUTER_EXITCOND]], label [[EXIT]], label [[OUTER_LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
+  %arrayidx1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %i
+  %x.start = load float, ptr %arrayidx1, align 4
+  br label %inner_loop
+
+inner_loop:
+  %j = phi i64 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
+  %x = phi float [ %x.start, %outer_loop ], [ %x.next, %inner_loop ]
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr @B, i64 0, i64 %j
+  %b = load float, ptr %arrayidx2, align 4
+  %x.next = fmul float %x, %b
+  %j.next = add nuw nsw i64 %j, 1
+  %inner_exitcond = icmp eq i64 %j.next, 512
+  br i1 %inner_exitcond, label %outer_loop_latch, label %inner_loop
+
+outer_loop_latch:
+  store float %x.next, ptr %arrayidx1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %outer_exitcond = icmp eq i64 %i.next, 1024
+  br i1 %outer_exitcond, label %exit, label %outer_loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
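
Reviewer note (not part of the patch): the !llvm.loop metadata at the end of outer_loop_unroll.ll is the form clang emits for its loop pragmas. Below is a minimal C sketch of the same loop nest, assuming clang; the pragma options and the -enable-vplan-native-path flag are real, but the exact driver invocation and function name are illustrative only.

/* Compiling roughly as
 *   clang -O2 -mllvm -enable-vplan-native-path foo.c
 * attaches llvm.loop.vectorize.enable, llvm.loop.vectorize.width and
 * llvm.loop.interleave.count metadata equivalent to !2, !3 and !4 above. */
extern float A[1024];
extern float B[512];

void foo(void) {
#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (long i = 0; i < 1024; ++i) {
    float x = A[i];               /* x.start */
    for (long j = 0; j < 512; ++j)
      x *= B[j];                  /* inner-loop reduction over B */
    A[i] = x;                     /* one store per outer iteration */
  }
}

With interleave_count(2) the vector body in the checks above handles eight outer iterations per trip (two <4 x i64> parts), which is why [[INDEX_NEXT]] advances by 8.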