Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -599,7 +599,7 @@
         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
           unsigned SrcReg = MI.getOperand(1).getReg();
           if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-            TII->moveToVALU(MI);
+            TII->moveToVALU(MI, *MDT);
             break;
           }
 
@@ -614,7 +614,7 @@
             MI.setDesc(TII->get(SMovOp));
             break;
           }
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, *MDT);
         } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
         }
@@ -677,7 +677,7 @@
         SmallSet<unsigned, 8> Visited;
         if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
           LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, *MDT);
         }
         break;
       }
@@ -690,7 +690,7 @@
 
         LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 
-        TII->moveToVALU(MI);
+        TII->moveToVALU(MI, *MDT);
         break;
       case AMDGPU::INSERT_SUBREG: {
         const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
@@ -700,7 +700,7 @@
         if (TRI->isSGPRClass(DstRC) &&
             (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
           LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, *MDT);
         }
         break;
       }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -37,6 +37,7 @@
 namespace llvm {
 
 class APInt;
+class MachineDominatorTree;
 class MachineRegisterInfo;
 class RegScavenger;
 class GCNSubtarget;
@@ -79,8 +80,8 @@
 private:
   void swapOperands(MachineInstr &Inst) const;
 
-  bool moveScalarAddSub(SetVectorType &Worklist,
-                        MachineInstr &Inst) const;
+  bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                        MachineDominatorTree &MDT) const;
 
   void lowerScalarAbs(SetVectorType &Worklist,
                       MachineInstr &Inst) const;
@@ -91,11 +92,12 @@
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
-  void splitScalar64BitAddSub(SetVectorType &Worklist,
-                              MachineInstr &Inst) const;
+  void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree &MDT) const;
 
-  void splitScalar64BitBinaryOp(SetVectorType &Worklist,
-                                MachineInstr &Inst, unsigned Opcode) const;
+  void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+                                unsigned Opcode,
+                                MachineDominatorTree &MDT) const;
 
   void splitScalar64BitBCNT(SetVectorType &Worklist,
                             MachineInstr &Inst) const;
@@ -787,12 +789,12 @@
 
   /// Legalize all operands in this instruction.  This function may
   /// create new instruction and insert them before \p MI.
-  void legalizeOperands(MachineInstr &MI) const;
+  void legalizeOperands(MachineInstr &MI, MachineDominatorTree &MDT) const;
 
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode.  This function will also move the users of \p MI to the
   /// VALU if necessary.
-  void moveToVALU(MachineInstr &MI) const;
+  void moveToVALU(MachineInstr &MI, MachineDominatorTree &MDT) const;
 
   void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
                         int Count) const;
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -17,10 +17,10 @@
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "GCNHazardRecognizer.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -3657,9 +3658,8 @@
 
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
 // with SGPRs by iterating over all unique values across all lanes.
-static void loadSRsrcFromVGPR(const SIInstrInfo &TII,
-                              MachineInstr &MI,
-                              MachineOperand &Rsrc) {
+static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineDominatorTree &MDT,
+                              MachineInstr &MI, MachineOperand &Rsrc) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3699,6 +3699,16 @@
 
   MBB.addSuccessor(LoopBB);
 
+  // Update dominators. We know that MBB immediately dominates LoopBB, that
+  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
+  // dominates all of the successors transferred to it from MBB that MBB used
+  // to dominate.
+  MDT.addNewBlock(LoopBB, &MBB);
+  MDT.addNewBlock(RemainderBB, LoopBB);
+  for (auto &Succ : RemainderBB->successors())
+    if (MDT.dominates(&MBB, Succ))
+      MDT.changeImmediateDominator(Succ, RemainderBB);
+
   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
 
   // Restore the EXEC mask
@@ -3750,7 +3760,8 @@
   return std::tie(RsrcPtr, NewSRsrc);
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+void SIInstrInfo::legalizeOperands(MachineInstr &MI,
+                                   MachineDominatorTree &MDT) const {
   MachineFunction &MF = *MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
@@ -4023,12 +4034,13 @@
     } else {
       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
       // to SGPRs.
-      loadSRsrcFromVGPR(*this, MI, *Rsrc);
+      loadSRsrcFromVGPR(*this, MDT, MI, *Rsrc);
     }
   }
 }
 
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+                             MachineDominatorTree &MDT) const {
   SetVectorType Worklist;
   Worklist.insert(&TopInst);
 
@@ -4046,29 +4058,29 @@
       break;
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_U64_PSEUDO:
-      splitScalar64BitAddSub(Worklist, Inst);
+      splitScalar64BitAddSub(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_SUB_I32:
       // FIXME: The u32 versions currently selected use the carry.
-      if (moveScalarAddSub(Worklist, Inst))
+      if (moveScalarAddSub(Worklist, Inst, MDT))
         continue;
 
       // Default handling
       break;
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
@@ -4155,7 +4167,7 @@
       continue;
 
     case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
@@ -4255,7 +4267,7 @@
 
       // Legalize all operands other than the offset. Notably, convert the srsrc
       // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr);
+      legalizeOperands(*NewInstr, MDT);
       continue;
     }
     }
@@ -4263,7 +4275,7 @@
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
-      legalizeOperands(Inst);
+      legalizeOperands(Inst, MDT);
       continue;
     }
 
@@ -4352,7 +4364,7 @@
     }
 
     // Legalize the operands
-    legalizeOperands(Inst);
+    legalizeOperands(Inst, MDT);
 
     if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
@@ -4360,8 +4372,8 @@
 }
 
 // Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
-                                   MachineInstr &Inst) const {
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                                   MachineDominatorTree &MDT) const {
   if (ST.hasAddNoCarry()) {
     // Assume there is no user of scc since we don't select this in that case.
     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -4385,7 +4397,7 @@
     Inst.setDesc(get(NewOpc));
     Inst.addImplicitDefUseOperands(*MBB.getParent());
     MRI.replaceRegWith(OldDstReg, ResultReg);
-    legalizeOperands(Inst);
+    legalizeOperands(Inst, MDT);
 
     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
     return true;
@@ -4505,8 +4517,9 @@
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitAddSub(
-  SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+                                         MachineInstr &Inst,
+                                         MachineDominatorTree &MDT) const {
   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
 
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -4566,16 +4579,16 @@
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  legalizeOperands(*LoHalf);
-  legalizeOperands(*HiHalf);
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*HiHalf, MDT);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitBinaryOp(
-    SetVectorType &Worklist, MachineInstr &Inst,
-    unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+                                           MachineInstr &Inst, unsigned Opcode,
+                                           MachineDominatorTree &MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
@@ -4633,8 +4646,8 @@
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  legalizeOperands(LoHalf);
-  legalizeOperands(HiHalf);
+  legalizeOperands(LoHalf, MDT);
+  legalizeOperands(HiHalf, MDT);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
Index: test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=CHECK-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 
Index: test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
===================================================================
--- test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,5 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=COMMON,ADDR64
-# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=COMMON,NO-ADDR64
+# RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=COMMON,ADDR64
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=COMMON,NO-ADDR64
 
 # Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 #