Index: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
+++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
@@ -102,6 +102,7 @@
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -171,6 +172,12 @@
                                      cl::ReallyHidden, cl::init(false),
                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
 
+// A command line option to enable the CopyToPhi DAG mutation.
+static cl::opt<bool>
+    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                       cl::init(true), cl::ZeroOrMore,
+                       cl::desc("Enable CopyToPhi DAG Mutation"));
+
 namespace {
 
 class NodeSet;
@@ -307,12 +314,18 @@
     void unblock(int U);
   };
 
+  struct CopyToPhiMutation : public ScheduleDAGMutation {
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
                     const RegisterClassInfo &rci)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
+    if (SwpEnableCopyToPhi)
+      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
   }
 
   void schedule() override;
@@ -391,6 +404,8 @@
     Mutations.push_back(std::move(Mutation));
   }
 
+  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
 private:
   void addLoopCarriedDependences(AliasAnalysis *AA);
   void updatePhiDependences();
@@ -893,8 +908,8 @@
   addLoopCarriedDependences(AA);
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
-  postprocessDAG();
   changeDependences();
+  postprocessDAG();
   LLVM_DEBUG(dump());
 
   NodeSetType NodeSets;
@@ -1624,6 +1639,85 @@
   swapAntiDependences(SUnits);
 }
 
+// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
+// is loop-carried to the USE in next iteration. This will help pipeliner avoid
+// additional copies that are needed across iterations. An artificial dependence
+// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
+
+// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
+// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
+// PHI-------True-Dep------> USEOfPhi
+
+// The mutation creates
+// USEOfPHI -------Artificial-Dep---> SRCOfCopy
+
+// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
+// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
+// late  to avoid additional copies across iterations. The possible scheduling
+// order would be
+// USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
+
+void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    // Find the COPY/REG_SEQUENCE instruction.
+    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
+      continue;
+
+    // Record the loop carried PHIs.
+    SmallVector<SUnit *, 4> PHISUs;
+    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
+    SmallVector<SUnit *, 4> SrcSUs;
+
+    for (auto &Dep : SU.Preds) {
+      SUnit *TmpSU = Dep.getSUnit();
+      MachineInstr *TmpMI = TmpSU->getInstr();
+      SDep::Kind DepKind = Dep.getKind();
+      // Save the loop carried PHI.
+      if (DepKind == SDep::Anti && TmpMI->isPHI())
+        PHISUs.push_back(TmpSU);
+      // Save the source of COPY/REG_SEQUENCE.
+      // If the source has no pre-decessors, we will end up creating cycles.
+      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
+        SrcSUs.push_back(TmpSU);
+    }
+
+    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
+      continue;
+
+    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
+    // SUnit to the container.
+    SmallVector<SUnit *, 8> UseSUs;
+    for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
+      for (auto &Dep : (*I)->Succs) {
+        if (Dep.getKind() != SDep::Data)
+          continue;
+
+        SUnit *TmpSU = Dep.getSUnit();
+        MachineInstr *TmpMI = TmpSU->getInstr();
+        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
+          PHISUs.push_back(TmpSU);
+          continue;
+        }
+        UseSUs.push_back(TmpSU);
+      }
+    }
+
+    if (UseSUs.size() == 0)
+      continue;
+
+    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
+    // Add the artificial dependencies if it does not form a cycle.
+    for (auto I : UseSUs) {
+      for (auto Src : SrcSUs) {
+        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
+          Src->addPred(SDep(I, SDep::Artificial));
+          SDAG->Topo.AddPred(Src, I);
+        }
+      }
+    }
+  }
+}
+
 /// Return true for DAG nodes that we ignore when computing the cost functions.
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
Index: llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll
===================================================================
--- llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll
+++ llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
+; RUN: 2>&1 | FileCheck %s
+
+; Test that the artificial dependence is created as a result of
+; CopyToPhi DAG mutation.
+; CHECK: Ord  Latency=0 Artificial
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @foo(i64* nocapture readonly %r64, i16 zeroext %n, i16 zeroext %s, i64* nocapture %p64) #0 {
+entry:
+  %conv = zext i16 %n to i32
+  %cmp = icmp eq i16 %n, 0
+  br i1 %cmp, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = load i64, i64* %r64, align 8
+  %v.sroa.0.0.extract.trunc = trunc i64 %tmp to i16
+  %v.sroa.4.0.extract.shift = lshr i64 %tmp, 16
+  %v.sroa.4.0.extract.trunc = trunc i64 %v.sroa.4.0.extract.shift to i16
+  %v.sroa.5.0.extract.shift = lshr i64 %tmp, 32
+  %v.sroa.5.0.extract.trunc = trunc i64 %v.sroa.5.0.extract.shift to i16
+  %v.sroa.6.0.extract.shift = lshr i64 %tmp, 48
+  %v.sroa.6.0.extract.trunc = trunc i64 %v.sroa.6.0.extract.shift to i16
+  %tmp1 = bitcast i64* %p64 to i16*
+  %conv2 = zext i16 %s to i32
+  %add.ptr = getelementptr inbounds i16, i16* %tmp1, i32 %conv2
+  %add.ptr.sum = add nuw nsw i32 %conv2, 1
+  %add.ptr3 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum
+  %add.ptr.sum50 = add nuw nsw i32 %conv2, 2
+  %add.ptr4 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum50
+  %add.ptr.sum51 = add nuw nsw i32 %conv2, 3
+  %add.ptr5 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum51
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %add.ptr11.phi = phi i16* [ %add.ptr11.inc, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %add.ptr16.phi = phi i16* [ %add.ptr16.inc, %for.body ], [ %add.ptr3, %for.body.preheader ]
+  %add.ptr21.phi = phi i16* [ %add.ptr21.inc, %for.body ], [ %add.ptr4, %for.body.preheader ]
+  %add.ptr26.phi = phi i16* [ %add.ptr26.inc, %for.body ], [ %add.ptr5, %for.body.preheader ]
+  %i.058.pmt = phi i32 [ %inc.pmt, %for.body ], [ 0, %for.body.preheader ]
+  %v.sroa.0.157 = phi i16 [ %v.sroa.0.0.extract.trunc34, %for.body ], [ %v.sroa.0.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.4.156 = phi i16 [ %v.sroa.4.0.extract.trunc36, %for.body ], [ %v.sroa.4.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.5.155 = phi i16 [ %v.sroa.5.0.extract.trunc38, %for.body ], [ %v.sroa.5.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.6.154 = phi i16 [ %v.sroa.6.0.extract.trunc40, %for.body ], [ %v.sroa.6.0.extract.trunc, %for.body.preheader ]
+  %q64.153.pn = phi i64* [ %q64.153, %for.body ], [ %r64, %for.body.preheader ]
+  %q64.153 = getelementptr inbounds i64, i64* %q64.153.pn, i32 1
+  store i16 %v.sroa.0.157, i16* %add.ptr11.phi, align 2
+  store i16 %v.sroa.4.156, i16* %add.ptr16.phi, align 2
+  store i16 %v.sroa.5.155, i16* %add.ptr21.phi, align 2
+  store i16 %v.sroa.6.154, i16* %add.ptr26.phi, align 2
+  %tmp2 = load i64, i64* %q64.153, align 8
+  %v.sroa.0.0.extract.trunc34 = trunc i64 %tmp2 to i16
+  %v.sroa.4.0.extract.shift35 = lshr i64 %tmp2, 16
+  %v.sroa.4.0.extract.trunc36 = trunc i64 %v.sroa.4.0.extract.shift35 to i16
+  %v.sroa.5.0.extract.shift37 = lshr i64 %tmp2, 32
+  %v.sroa.5.0.extract.trunc38 = trunc i64 %v.sroa.5.0.extract.shift37 to i16
+  %v.sroa.6.0.extract.shift39 = lshr i64 %tmp2, 48
+  %v.sroa.6.0.extract.trunc40 = trunc i64 %v.sroa.6.0.extract.shift39 to i16
+  %inc.pmt = add i32 %i.058.pmt, 1
+  %cmp8 = icmp slt i32 %inc.pmt, %conv
+  %add.ptr11.inc = getelementptr i16, i16* %add.ptr11.phi, i32 4
+  %add.ptr16.inc = getelementptr i16, i16* %add.ptr16.phi, i32 4
+  %add.ptr21.inc = getelementptr i16, i16* %add.ptr21.phi, i32 4
+  %add.ptr26.inc = getelementptr i16, i16* %add.ptr26.phi, i32 4
+  br i1 %cmp8, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv65" }