Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -218,9 +218,10 @@
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
   CALL,        // Function call based on a single integer
   UMUL,        // 32bit unsigned multiplication
-  RET_FLAG,
   BRANCH_COND,
   // End AMDIL ISD Opcodes
+  ENDPGM,
+  RETURN,
   DWORDADDR,
   FRACT,
   CLAMP,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -652,7 +652,7 @@
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
                                      const SDLoc &DL, SelectionDAG &DAG) const {
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 }
 
 //===---------------------------------------------------------------------===//
@@ -2722,10 +2722,11 @@
   // AMDIL DAG nodes
   NODE_NAME_CASE(CALL);
   NODE_NAME_CASE(UMUL);
-  NODE_NAME_CASE(RET_FLAG);
   NODE_NAME_CASE(BRANCH_COND);
 
   // AMDGPU DAG nodes
+  NODE_NAME_CASE(ENDPGM)
+  NODE_NAME_CASE(RETURN)
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(CLAMP)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -261,5 +261,8 @@
 //===----------------------------------------------------------------------===//
 // Call/Return DAG Nodes
 //===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -17,7 +17,6 @@
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPUTargetMachine.h"
 #include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -107,6 +106,29 @@
       ++I;
     }
   } else {
+    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+    // terminator instructions and should only be printed as comments.
+    if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      if (isVerbose()) {
+        SmallVector BBStr;
+        raw_svector_ostream Str(BBStr);
+
+        const MachineBasicBlock *MBB = MI->getOperand(1).getMBB();
+        const MCSymbolRefExpr *Expr
+          = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+        Expr->print(Str, MAI);
+        OutStreamer->emitRawComment(" mask branch " + BBStr);
+      }
+
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" return");
+      return;
+    }
+
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -448,8 +448,8 @@
 
   addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSILowerControlFlowPass(), false);
-  addPass(createSIDebuggerInsertNopsPass(), false);
+  addPass(createSILowerControlFlowPass());
+  addPass(createSIDebuggerInsertNopsPass());
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
Index: lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- lib/Target/AMDGPU/R600Instructions.td
+++ lib/Target/AMDGPU/R600Instructions.td
@@ -1539,8 +1539,9 @@
 //===---------------------------------------------------------------------===//
 let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
     usesCustomInserter = 1 in {
-  def RETURN : ILFormat<(outs), (ins variable_ops),
-      "RETURN", [(IL_retflag)]>;
+  def RETURN : ILFormat<(outs), (ins variable_ops),
+      "RETURN", [(AMDGPUendpgm)]
+  >;
 }
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1002,7 +1002,8 @@
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
@@ -1463,8 +1464,8 @@
 
   // FIXME: This should really be selected to s_trap, but that requires
   // setting up the trap handler for it to do anything.
-  return DAG.getNode(AMDGPUISD::RET_FLAG, SDLoc(Op), MVT::Other, Op.
-                     getOperand(0));
+  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+                     Op.getOperand(0));
 }
 
 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
+  AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
 
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -426,7 +426,7 @@
 let isTerminator = 1 in {
 
 def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
-  [(IL_retflag)]> {
+  [(AMDGPUendpgm)]> {
   let simm16 = 0;
   let isBarrier = 1;
   let hasCtrlDep = 1;
@@ -1908,7 +1908,7 @@
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0
 
 let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def SGPR_USE : InstSI <(outs), (ins)>;
 }
 
 let usesCustomInserter = 1, SALU = 1 in {
@@ -1919,61 +1919,57 @@
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : InstSI <
+  (outs SReg_64:$dst), (ins brtarget:$target)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let SALU = 1;
+}
+
 let Uses = [EXEC], Defs = [EXEC] in {
 
 let isBranch = 1, isTerminator = 1 in {
 
 def SI_IF: InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, brtarget:$target),
-  "",
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), "",
   [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]
 >;
 
 def SI_ELSE : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src, brtarget:$target),
-  "",
-  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]
-> {
+  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), "",
+  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
   let Constraints = "$src = $dst";
 }
 
 def SI_LOOP : InstSI <
-  (outs),
-  (ins SReg_64:$saved, brtarget:$target),
-  "si_loop $saved, $target",
+  (outs), (ins SReg_64:$saved, brtarget:$target), "",
   [(int_amdgcn_loop i64:$saved, bb:$target)]
 >;
 
 } // End isBranch = 1, isTerminator = 1
 
 def SI_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src),
-  "si_else $dst, $src",
+  (outs SReg_64:$dst), (ins SReg_64:$src), "",
   [(set i64:$dst, (int_amdgcn_break i64:$src))]
 >;
 
 def SI_IF_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, SReg_64:$src),
-  "si_if_break $dst, $vcc, $src",
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), "",
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
 >;
 
 def SI_ELSE_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src0, SReg_64:$src1),
-  "si_else_break $dst, $src0, $src1",
+  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), "",
   [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
 >;
 
 def SI_END_CF : InstSI <
-  (outs),
-  (ins SReg_64:$saved),
-  "si_end_cf $saved",
+  (outs), (ins SReg_64:$saved), "",
   [(int_amdgcn_end_cf i64:$saved)]
 >;
 
@@ -1981,30 +1977,24 @@
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
 
 def SI_KILL : InstSI <
-  (outs),
-  (ins VSrc_32:$src),
-  "si_kill $src",
+  (outs), (ins VSrc_32:$src), "",
   [(int_AMDGPU_kill f32:$src)]
 >;
 
 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
-let SALU = 1 in
 def SI_PS_LIVE : InstSI <
-  (outs SReg_64:$dst),
-  (ins),
-  "si_ps_live $dst",
-  [(set i1:$dst, (int_amdgcn_ps_live))]
->;
+  (outs SReg_64:$dst), (ins), "",
+  [(set i1:$dst, (int_amdgcn_ps_live))]> {
+  let SALU = 1;
+}
 
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
 // fold operands before it runs.
-def SI_INIT_M0 : InstSI <
-  (outs),
-  (ins SSrc_32:$src), "", []> {
+def SI_INIT_M0 : InstSI <(outs), (ins SSrc_32:$src)> {
   let Defs = [M0];
   let usesCustomInserter = 1;
   let isPseudo = 1;
@@ -2014,21 +2004,28 @@
   let isReMaterializable = 1;
 }
 
+def SI_RETURN : InstSI <
+  (outs), (ins variable_ops), "", [(AMDGPUreturn)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 1;
+  let SALU = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
 let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
 
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$dst, SReg_64:$temp),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$off),
-  "si_indirect_src $dst, $temp, $src, $idx, $off",
-  []
+  (ins rc:$src, VSrc_32:$idx, i32imm:$off)
 >;
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$dst, SReg_64:$temp),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
-  "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
-  []
-> {
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> {
   let Constraints = "$src = $dst";
 }
 
@@ -2052,16 +2049,14 @@
   let UseNamedOperandTable = 1, Uses = [EXEC] in {
     def _SAVE : InstSI <
       (outs),
-      (ins sgpr_class:$src, i32imm:$frame_idx),
-      "", []> {
+      (ins sgpr_class:$src, i32imm:$frame_idx)> {
       let mayStore = 1;
       let mayLoad = 0;
     }
 
     def _RESTORE : InstSI <
       (outs sgpr_class:$dst),
-      (ins i32imm:$frame_idx),
-      "", []> {
+      (ins i32imm:$frame_idx)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
@@ -2082,8 +2077,7 @@
    def _SAVE : InstSI <
      (outs),
      (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
-           SReg_32:$scratch_offset, i32imm:$offset),
-      "", []> {
+           SReg_32:$scratch_offset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
    }
@@ -2091,8 +2085,7 @@
    def _RESTORE : InstSI <
      (outs vgpr_class:$dst),
      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
-           i32imm:$offset),
-      "", []> {
+           i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -88,10 +88,14 @@
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
-  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+                              MachineInstr *MovRel,
+                              unsigned SaveReg, unsigned IdxReg, int Offset);
+
+  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
   void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
-  void IndirectSrc(MachineInstr &MI);
-  void IndirectDst(MachineInstr &MI);
+  bool indirectSrc(MachineInstr &MI);
+  bool indirectDst(MachineInstr &MI);
 
 public:
   static char ID;
@@ -104,11 +108,6 @@
   const char *getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
 };
 
 } // End anonymous namespace
@@ -227,6 +226,10 @@
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
+    .addOperand(MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -255,6 +258,10 @@
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
+    .addOperand(MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -331,7 +338,8 @@
 }
 
 void SILowerControlFlow::Branch(MachineInstr &MI) {
-  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+  if (MBB == MI.getParent()->getNextNode())
     MI.eraseFromParent();
 
   // If these aren't equal, this is probably an infinite loop.
@@ -365,75 +373,109 @@
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+                                                DebugLoc DL,
+                                                MachineInstr *MovRel,
+                                                unsigned SaveReg,
+                                                unsigned IdxReg,
+                                                int Offset) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+    .addReg(IdxReg);
+
+  // Move index from VCC into M0
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+    .addReg(AMDGPU::VCC_LO);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+    .addReg(AMDGPU::M0)
+    .addReg(IdxReg);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+    .addReg(AMDGPU::VCC);
+
+  if (Offset) {
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addReg(AMDGPU::M0)
+      .addImm(Offset);
+  }
+
+  // Do the actual move
+  LoopBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::VCC);
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&LoopBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I = MI;
 
-  unsigned Save = MI.getOperand(1).getReg();
   unsigned Idx = MI.getOperand(3).getReg();
 
   if (AMDGPU::SReg_32RegClass.contains(Idx)) {
     if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(Idx)
-              .addImm(Offset);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+        .addReg(Idx)
+        .addImm(Offset);
     } else {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-              .addReg(Idx);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(Idx);
     }
-    MBB.insert(I, MovRel);
-  } else {
-    assert(AMDGPU::SReg_64RegClass.contains(Save));
-    assert(AMDGPU::VGPR_32RegClass.contains(Idx));
 
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return false;
+  }
 
-    // Save the EXEC mask
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-            .addReg(AMDGPU::EXEC);
+  MachineFunction &MF = *MBB.getParent();
+  unsigned Save = MI.getOperand(1).getReg();
 
-    // Read the next variant into VCC (lower 32 bits) <- also loop target
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-            AMDGPU::VCC_LO)
-            .addReg(Idx);
+  // Reading from a VGPR requires looping over all workitems in the wavefront.
+  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+         AMDGPU::VGPR_32RegClass.contains(Idx));
 
-    // Move index from VCC into M0
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-            .addReg(AMDGPU::VCC_LO);
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+    .addReg(AMDGPU::EXEC);
 
-    // Compare the just read M0 value to all possible Idx values
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
-            .addReg(AMDGPU::M0)
-            .addReg(Idx);
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
 
-    // Update EXEC, save the original EXEC value to VCC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-            .addReg(AMDGPU::VCC);
+  MF.insert(MBBI, LoopBB);
+  MF.insert(MBBI, RemainderBB);
 
-    if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(AMDGPU::M0)
-              .addImm(Offset);
-    }
-    // Do the actual move
-    MBB.insert(I, MovRel);
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
 
-    // Update EXEC, switch all done bits to 0 and all todo bits to 1
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-            .addReg(AMDGPU::EXEC)
-            .addReg(AMDGPU::VCC);
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessors(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
 
-    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-            .addImm(-7);
+  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset);
 
-    // Restore EXEC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-            .addReg(Save);
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(Save);
 
-  }
   MI.eraseFromParent();
+  return true;
 }
 
 /// \param @VecReg The register which holds element zero of the vector
@@ -463,8 +505,8 @@
   Reg = RC->getRegister(RegIdx);
 }
 
-void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
@@ -480,11 +522,11 @@
     .addReg(Reg)
     .addReg(Vec, RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
 }
 
-void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
@@ -501,7 +543,7 @@
     .addReg(Val)
     .addReg(Dst, RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -514,11 +556,14 @@
   bool NeedFlat = false;
   unsigned Depth = 0;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
+  MachineFunction::iterator NextBB;
 
-    MachineBasicBlock *EmptyMBBAtEnd = NULL;
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
+
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
     bool ExecModified = false;
 
@@ -591,7 +636,15 @@
       case AMDGPU::SI_INDIRECT_SRC_V4:
      case AMDGPU::SI_INDIRECT_SRC_V8:
      case AMDGPU::SI_INDIRECT_SRC_V16:
-        IndirectSrc(MI);
+        if (indirectSrc(MI)) {
+          // The block was split at this point. We can safely skip the middle
+          // inserted block to the following which contains the rest of this
+          // block's instructions.
+          NextBB = std::next(BI);
+          BE = MF.end();
+          Next = MBB.end();
+        }
+
        break;
 
      case AMDGPU::SI_INDIRECT_DST_V1:
@@ -599,7 +652,15 @@
      case AMDGPU::SI_INDIRECT_DST_V2:
      case AMDGPU::SI_INDIRECT_DST_V4:
      case AMDGPU::SI_INDIRECT_DST_V8:
      case AMDGPU::SI_INDIRECT_DST_V16:
-        IndirectDst(MI);
+        if (indirectDst(MI)) {
+          // The block was split at this point. We can safely skip the middle
+          // inserted block to the following which contains the rest of this
+          // block's instructions.
+          NextBB = std::next(BI);
+          BE = MF.end();
+          Next = MBB.end();
+        }
+
        break;
 
      case AMDGPU::S_ENDPGM: {
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -174,6 +174,213 @@
   ret void
 }
 
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %val0 = extractelement <4 x i32> , i32 %idx0
+  %val1 = extractelement <4 x i32> , i32 %idx1
+  store volatile i32 %val0, i32 addrspace(1)* %out0
+  store volatile i32 %val1, i32 addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0
+  %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  br label %bb7
+
+bb4:
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  br label %bb7
+
+bb7:
+  %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile float %tmp8, float addrspace(1)* undef
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1: ; preds = %bb
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+  br label %bb7
+
+bb4: ; preds = %bb
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+  br label %bb7
+
+bb7: ; preds = %bb4, %bb1
+  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+  %tmp1 = add i32 %arg, -16
+  %tmp2 = insertelement <6 x float> , float 0.000000e+00, i32 %tmp1
+  %tmp3 = add i32 %arg, -16
+  %tmp4 = insertelement <6 x float> , float 0x3FB99999A0000000, i32 %tmp3
+  %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+  %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+  %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+  %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/ret_jump.ll
===================================================================
--- test/CodeGen/AMDGPU/ret_jump.ll
+++ test/CodeGen/AMDGPU/ret_jump.ll
@@ -1,17 +1,22 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-target triple = "amdgcn--"
+; This should end with a no-op sequence of exec mask manipulations.
+; The mask should be in its original state after the unreachable block executes.
 
 ; GCN-LABEL: {{^}}main:
-; GCN: BB0_3:
-; GCN-NEXT: s_branch [[LASTBB:BB[0-9]*_[0-9]*]]
-; GCN-NEXT: BB0_
-; GCN: [[LASTBB]]
-; GCN-NEXT: .Lfunc_end0:
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target triple = "amdgcn--"
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: ; return
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: .Lfunc_end0
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
 main_body:
   %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -122,9 +122,13 @@
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
-;CHECK-NEXT: %ELSE
-;CHECK: store
-;CHECK: %END
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0