Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -138,6 +138,7 @@
   ScheduleDAGMILive *DAG =
       new ScheduleDAGMILive(C,
                             llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(AMDGPU::createPhysRegCSEDAGMutation());
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
Index: lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.h
+++ lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -49,6 +49,11 @@
   SUnit *pickNode(bool &IsTopNode) override;
 };
 
+namespace AMDGPU {
+/// Factory for the PhysRegCSE DAG mutation defined in GCNSchedStrategy.cpp.
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation();
+
+} // End namespace AMDGPU
 } // End namespace llvm
 
 #endif // GCNSCHEDSTRATEGY_H
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -311,3 +311,100 @@
   DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
   return SU;
 }
+
+namespace {
+
+/// Decide whether \p MI qualifies for PhysRegCSE: physical-register defs,
+/// immediate-only uses, no memory access and no unmodeled side effects.
+static bool isCSECandidate(const MachineInstr &MI) {
+  // Every operand in defs() must be a physical register.
+  for (const MachineOperand &Def : MI.defs())
+    if (!TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
+      return false;
+
+  // Memory accesses and unmodeled side effects disqualify the instruction.
+  if (MI.hasUnmodeledSideEffects() || MI.mayLoadOrStore())
+    return false;
+
+  // Every operand in uses() must be an immediate.
+  for (const MachineOperand &Use : MI.uses())
+    if (!Use.isImm())
+      return false;
+
+  return true;
+}
+
+static void collectIdenticalSuccs(SUnit *SU, SmallVectorImpl<SUnit*> &Worklist) {
+  // Appends every successor identical to SU (transitively) to Worklist.
+  for (const SDep &Edge : SU->Succs) {
+    SUnit *S = Edge.getSUnit(); // Boundary nodes may have no MachineInstr.
+    if (!S->isBoundaryNode() && SU->getInstr()->isIdenticalTo(*S->getInstr())) {
+      Worklist.push_back(S);
+      collectIdenticalSuccs(S, Worklist);
+    }
+  }
+}
+
+/// Scheduling DAG mutation; see apply() below for what it rewrites.
+class PhysRegCSE : public ScheduleDAGMutation {
+public:
+  PhysRegCSE() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void PhysRegCSE::apply(ScheduleDAGInstrs *DAGInstrs) {
+  // Detach duplicates of each CSE candidate and reroute their users to it.
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+  for (SUnit &Candidate : DAG->SUnits) {
+    SUnit *SU = &Candidate;
+    if (!isCSECandidate(*SU->getInstr()))
+      continue;
+
+    // Scan successors for identical instructions.
+    SmallVector<SUnit*, 4> Worklist;
+    collectIdenticalSuccs(SU, Worklist);
+
+    for (SUnit *IdenticalSU : Worklist) {
+      // Move every outgoing edge of the duplicate over to SU. The edge is
+      // copied whole rather than rebuilt from getKind()/getReg(): getReg()
+      // is only valid on register dependencies, and a rebuilt edge would
+      // not carry the latency of the original.
+      while (!IdenticalSU->Succs.empty()) {
+        const SDep OldEdge = IdenticalSU->Succs.front();
+        SUnit *User = OldEdge.getSUnit();
+        SDep FromSU = OldEdge;
+        FromSU.setSUnit(SU);
+        User->addPred(FromSU);
+
+        // Drop exactly the edge that was just copied.
+        SDep FromDuplicate = OldEdge;
+        FromDuplicate.setSUnit(IdenticalSU);
+        User->removePred(FromDuplicate);
+      }
+
+      // Remove all predecessors; each edge is copied first so removePred()
+      // is not handed a reference into the list it edits.
+      while (!IdenticalSU->Preds.empty()) {
+        const SDep Pred = IdenticalSU->Preds.front();
+        IdenticalSU->removePred(Pred);
+      }
+
+      // Try to schedule the two CSE candidates together so MachineCSE can
+      // more easily remove them.
+      IdenticalSU->addPred(SDep(SU, SDep::Cluster));
+    }
+  }
+}
+
+} // End anonymous namespace
+
+namespace llvm {
+namespace AMDGPU {
+
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation() {
+  return llvm::make_unique<PhysRegCSE>();
+}
+
+} // End namespace AMDGPU
+} // End namespace llvm
Index: test/CodeGen/AMDGPU/init-m0-sched-deps.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/init-m0-sched-deps.mir
@@ -0,0 +1,201 @@
+# RUN: llc -march=amdgcn -run-pass machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @reinit_m0_neg1(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_multi_value(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_neg1_multi_use_same_def(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1, i32 addrspace(3)* %lds2) {
+    ret void
+  }
+
+  define void @reinit_m0_add_impdef_scc(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_brev(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_diff_opcode(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_copy(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+   define void @reinit_m0_other_use(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_neg1
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+name: reinit_m0_neg1
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_MOV_B32 -1
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_multi_value
+# CHECK: %m0 = S_MOV_B32 400
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_MOV_B32 800
+# CHECK: DS_READ_B32
+name: reinit_m0_multi_value
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 400
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_MOV_B32 800
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_neg1_multi_use_same_def
+# CHECK: %m0 = S_MOV_B32 -1
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+
+name: reinit_m0_neg1_multi_use_same_def
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+    %m0 = S_MOV_B32 -1
+    %3 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds2)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_add_impdef_scc
+# CHECK: %m0 = S_ADD_U32 0, 4, implicit-def %scc
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_ADD_U32 0, 4, implicit-def %scc
+# CHECK: DS_READ_B32
+name: reinit_m0_add_impdef_scc
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_ADD_U32 0, 4, implicit-def %scc
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_ADD_U32 0, 4, implicit-def %scc
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_brev
+# CHECK: %m0 = S_BREV_B32 2
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+name: reinit_m0_brev
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_BREV_B32 2
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_BREV_B32 2
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_diff_opcode
+# CHECK: %m0 = S_MOV_B32 400
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_BREV_B32 400
+# CHECK: DS_READ_B32
+name: reinit_m0_diff_opcode
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 400
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_BREV_B32 400
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_copy
+# CHECK: %3 = S_MOV_B32 400
+# CHECK: %m0 = COPY %3
+# CHECK: DS_READ_B32
+# CHECK: %m0 = COPY %3
+# CHECK: DS_READ_B32
+name: reinit_m0_copy
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_32_xm0 }
+body: |
+  bb.0:
+    %3 = S_MOV_B32 400
+    %m0 = COPY %3
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = COPY %3
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# FIXME: Why isn't this combined?
+
+# CHECK-LABEL: name: reinit_m0_other_use
+# CHECK: %m0 = S_MOV_B32 8
+# CHECK: S_ADD_U32 %m0, 4
+# CHECK: %m0 = S_MOV_B32 8
+# CHECK: S_ADD_U32 %m0, 8
+name: reinit_m0_other_use
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sreg_32_xm0 }
+  - { id: 3, class: sreg_32_xm0 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 8
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = S_ADD_U32 %m0, 4, implicit-def %scc
+    %m0 = S_MOV_B32 8
+    %2 = S_ADD_U32 %m0, 8, implicit-def %scc
+...