diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -299,6 +299,8 @@
 
   /// State passed from peelKernel to peelPrologAndEpilogs().
   std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
+  /// Illegal phis that need to be deleted once we re-link stages.
+  SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
 
 public:
   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
@@ -321,6 +323,13 @@
   /// Peels one iteration of the rewritten kernel (BB) in the specified
   /// direction.
   MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
+  // Delete instructions whose stage is less than MinStage in the given basic
+  // block.
+  void filterInstructions(MachineBasicBlock *MB, int MinStage);
+  // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
+  // instructions to keep a valid IR.
+  void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
+                              MachineBasicBlock *SourceBB, unsigned Stage);
   /// Peel the kernel forwards and backwards to produce prologs and epilogs,
   /// and stitch them together.
   void peelPrologAndEpilogs();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -1582,6 +1582,99 @@
   return NewBB;
 }
 
+void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
+                                                       int MinStage) {
+  for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
+       I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
+    MachineInstr *MI = &*I++;
+    int Stage = getStage(MI);
+    if (Stage == -1 || Stage >= MinStage)
+      continue;
+
+    for (MachineOperand &DefMO : MI->defs()) {
+      SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+      for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+        // Only PHIs can use values from this block by construction.
+        // Match with the equivalent PHI in B.
+        assert(UseMI.isPHI());
+        Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+                                               MI->getParent());
+        Subs.emplace_back(&UseMI, Reg);
+      }
+      for (auto &Sub : Subs)
+        Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+                                      *MRI.getTargetRegisterInfo());
+    }
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  }
+}
+
+void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
+    MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
+  auto InsertPt = DestBB->getFirstNonPHI();
+  DenseMap<Register, Register> Remaps;
+  for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) {
+    MachineInstr *MI = &*I++;
+    if (MI->isPHI()) {
+      // This is an illegal PHI. If we move any instructions using an illegal
+      // PHI, we need to create a legal Phi
+      Register PhiR = MI->getOperand(0).getReg();
+      auto RC = MRI.getRegClass(PhiR);
+      Register NR = MRI.createVirtualRegister(RC);
+      MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
+                                 TII->get(TargetOpcode::PHI), NR)
+                             .addReg(PhiR)
+                             .addMBB(SourceBB);
+      BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
+      CanonicalMIs[NI] = CanonicalMIs[MI];
+      Remaps[PhiR] = NR;
+      continue;
+    }
+    if (getStage(MI) != Stage)
+      continue;
+    MI->removeFromParent();
+    DestBB->insert(InsertPt, MI);
+    auto *KernelMI = CanonicalMIs[MI];
+    BlockMIs[{DestBB, KernelMI}] = MI;
+    BlockMIs.erase({SourceBB, KernelMI});
+  }
+  SmallVector<MachineInstr *, 4> PhiToDelete;
+  for (MachineInstr &MI : DestBB->phis()) {
+    assert(MI.getNumOperands() == 3);
+    MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
+    // If the instruction referenced by the phi is moved inside the block
+    // we don't need the phi anymore.
+    if (getStage(Def) == Stage) {
+      Register PhiReg = MI.getOperand(0).getReg();
+      MRI.replaceRegWith(MI.getOperand(0).getReg(),
+                         Def->getOperand(0).getReg());
+      MI.getOperand(0).setReg(PhiReg);
+      PhiToDelete.push_back(&MI);
+    }
+  }
+  for (auto *P : PhiToDelete)
+    P->eraseFromParent();
+  InsertPt = DestBB->getFirstNonPHI();
+  for (MachineInstr &MI : SourceBB->phis()) {
+    MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+    DestBB->insert(InsertPt, NewMI);
+    Register OrigR = MI.getOperand(0).getReg();
+    Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+    NewMI->getOperand(0).setReg(R);
+    NewMI->getOperand(1).setReg(OrigR);
+    NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
+    Remaps[OrigR] = R;
+    CanonicalMIs[NewMI] = CanonicalMIs[&MI];
+    BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI;
+  }
+  for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I)
+    for (MachineOperand &MO : I->uses())
+      if (MO.isReg() && Remaps.count(MO.getReg()))
+        MO.setReg(Remaps[MO.getReg()]);
+}
+
 void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
   BitVector LS(Schedule.getNumStages(), true);
   BitVector AS(Schedule.getNumStages(), true);
@@ -1607,23 +1700,36 @@
 
   // Push out the epilogs, again in reverse order.
   // We can't assume anything about the minumum loop trip count at this point,
-  // so emit a fairly complex epilog:
-  //  K[0, 1, 2]     // Kernel runs stages 0, 1, 2
-  //  E0[2] <- P1    // Epilog runs stage 2 only, so the state after is [0].
-  //  E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
-  //
-  // This creates a single-successor single-predecessor sequence of blocks for
-  // each epilog, which are kept this way for simplicity at this stage and
-  // cleaned up by the optimizer later.
+  // so emit a fairly complex epilog.
+
+  // We first peel number of stages minus one epilogue. Then we remove dead
+  // stages and reorder instructions based on their stage. If we have 3 stages
+  // we generate first:
+  // E0[3, 2, 1]
+  // E1[3', 2']
+  // E2[3'']
+  // And then we move instructions based on their stages to have:
+  // E0[3]
+  // E1[2, 3']
+  // E2[1, 2', 3'']
+  // The transformation is legal because we only move instructions past
+  // instructions of a previous loop iteration.
   for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
-    Epilogs.push_back(nullptr);
-    for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
-      LS.reset();
-      LS[J] = 1;
-      Epilogs.back() = peelKernel(LPD_Back);
-      LiveStages[Epilogs.back()] = LS;
-      AvailableStages[Epilogs.back()] = AS;
+    Epilogs.push_back(peelKernel(LPD_Back));
+    filterInstructions(Epilogs.back(), Schedule.getNumStages() - I);
+  }
+  for (size_t I = 0; I < Epilogs.size(); I++) {
+    LS.reset();
+    for (size_t J = I; J < Epilogs.size(); J++) {
+      int Iteration = J;
+      unsigned Stage = Schedule.getNumStages() - 1 + I - J;
+      // Move stage one block at a time so that Phi nodes are updated correctly.
+      for (size_t K = Iteration; K > I; K--)
+        moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
+      LS[Stage] = 1;
     }
+    LiveStages[Epilogs[I]] = LS;
+    AvailableStages[Epilogs[I]] = AS;
   }
 
   // Now we've defined all the prolog and epilog blocks as a fallthrough
@@ -1659,6 +1765,13 @@
       rewriteUsesOf(MI);
     }
   }
+  for (auto *MI : IllegalPhisToDelete) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  }
+  IllegalPhisToDelete.clear();
+
   // Now all remapping has been done, we're free to optimize the generated code.
   for (MachineBasicBlock *B : reverse(Blocks))
     EliminateDeadPhis(B, MRI, LIS);
@@ -1727,9 +1840,10 @@
       R = MI->getOperand(1).getReg();
     MRI.setRegClass(R, MRI.getRegClass(PhiR));
     MRI.replaceRegWith(PhiR, R);
-    if (LIS)
-      LIS->RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
+    // Postpone deleting the Phi as it may be referenced by BlockMIs and used
+    // later to figure out how to remap registers.
+    MI->getOperand(0).setReg(PhiR);
+    IllegalPhisToDelete.push_back(MI);
     return;
   }
 
diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,6 +1,4 @@
 ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
-; XFAIL: *
-; LSR changes required.
 
 ; This version of the conv3x3 test has both loops. This test checks that the
 ; inner loop has 13 packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true < %s | FileCheck %s
+
+; Test epilogue generation when reading loop-carried dependency from a previous
+; stage. The first epilogue should read value from iteration N-1 of the kernel.
+
+; CHECK: loop0
+; CHECK: r{{[0-9]+}} = add([[REG0:r([0-9]+)]],#8)
+; CHECK: [[REG0:r([0-9]+)]] = [[REG1:r([0-9]+)]]
+; CHECK: endloop0
+; CHECK: = add([[REG1]],#8)
+
+; Function Attrs: nounwind
+define i32* @f0(i16* nocapture readonly %a0, i32 %a1) #0 {
+b0:
+  %v0 = alloca [129 x i32], align 8
+  br i1 undef, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ]
+  %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ]
+  %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ]
+  %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ]
+  %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
+  %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ]
+  %v7 = load i16, i16* %v2, align 2
+  %v8 = sext i16 %v7 to i32
+  %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2
+  %v10 = load i16, i16* %v6, align 2
+  %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2
+  store i32 %v11, i32* %v4, align 4
+  %v12 = getelementptr inbounds i16, i16* %v6, i32 -1
+  %v13 = add i32 %v5, 1
+  %v14 = getelementptr inbounds i32, i32* %v3, i32 2
+  %v15 = getelementptr inbounds i16, i16* %v1, i32 2
+  %v16 = icmp slt i32 %v13, %a1
+  br i1 %v16, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
+  %out = phi i32* [ null, %b0 ], [ %v14, %b2 ]
+  ret i32* %out
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
--- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
@@ -5,7 +5,6 @@
 
 ; CHECK: = and
 ; CHECK: = and
-; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
 ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
 ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
 ; CHECK: loop0(.LBB0_[[LOOP:.]],