Index: include/llvm/CodeGen/MachineInstr.h
===================================================================
--- include/llvm/CodeGen/MachineInstr.h
+++ include/llvm/CodeGen/MachineInstr.h
@@ -70,7 +70,11 @@
     FrameDestroy = 1 << 1,              // Instruction is used as a part of
                                         // function frame destruction code.
     BundledPred  = 1 << 2,              // Instruction has bundled predecessors.
-    BundledSucc  = 1 << 3               // Instruction has bundled successors.
+    BundledSucc  = 1 << 3,              // Instruction has bundled successors.
+    Initiator    = 1 << 4,              // Instruction is used as a part of
+                                        // target-specific basic block prolog.
+    Terminator   = 1 << 5               // Instruction is used as a part of
+                                        // target-specific basic block epilog.
   };
 private:
   const MCInstrDesc *MCID;              // Instruction descriptor.
@@ -445,13 +449,20 @@
     return hasProperty(MCID::Barrier, Type);
   }
 
+  /// Returns true if this instruction is part of the initiator for a basic
+  /// block. This can be used by targets that have non-uniform control flow
+  /// to set up execution masks.
+  bool isInitiator() const {
+    return getFlag(Initiator); // TODO: QueryType?
+  }
+
   /// Returns true if this instruction part of the terminator for a basic block.
   /// Typically this is things like return and branch instructions.
   ///
   /// Various passes use this to insert code into the bottom of a basic block,
   /// but before control flow occurs.
   bool isTerminator(QueryType Type = AnyInBundle) const {
-    return hasProperty(MCID::Terminator, Type);
+    return hasProperty(MCID::Terminator, Type) || getFlag(Terminator); // TODO: QueryType?
   }
 
   /// Returns true if this is a conditional, unconditional, or indirect branch.
Index: lib/CodeGen/MachineInstr.cpp
===================================================================
--- lib/CodeGen/MachineInstr.cpp
+++ lib/CodeGen/MachineInstr.cpp
@@ -1929,19 +1929,25 @@
   }
 
   bool HaveSemi = false;
-  const unsigned PrintableFlags = FrameSetup | FrameDestroy;
+  const unsigned PrintableFlags = FrameSetup | FrameDestroy | Initiator | Terminator;
   if (Flags & PrintableFlags) {
     if (!HaveSemi) {
       OS << ";";
       HaveSemi = true;
     }
-    OS << " flags: ";
+    OS << " flags:";
     if (Flags & FrameSetup)
-      OS << "FrameSetup";
+      OS << " FrameSetup";
 
     if (Flags & FrameDestroy)
-      OS << "FrameDestroy";
+      OS << " FrameDestroy";
+
+    if (Flags & Initiator)
+      OS << " Initiator";
+
+    if (Flags & Terminator)
+      OS << " Terminator";
   }
 
   if (!memoperands_empty()) {
Index: lib/CodeGen/MachineVerifier.cpp
===================================================================
--- lib/CodeGen/MachineVerifier.cpp
+++ lib/CodeGen/MachineVerifier.cpp
@@ -81,6 +81,7 @@
     typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet;
 
     const MachineInstr *FirstTerminator;
+    bool SeenNonInitiator;
     BlockSet FunctionBlocks;
 
     BitVector regsReserved;
@@ -573,6 +574,7 @@
 void
 MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
   FirstTerminator = nullptr;
+  SeenNonInitiator = false;
 
   if (!MF->getProperties().hasProperty(
       MachineFunctionProperties::Property::NoPHIs)) {
@@ -788,6 +790,13 @@
     lastIndex = idx;
   }
 
+  // Ensure initiators don't follow non-initiators.
+  if (!MI->isInitiator()) {
+    SeenNonInitiator = true;
+  } else if (SeenNonInitiator) {
+    report("Initiator instruction after a non-initiator", MI);
+  }
+
   // Ensure non-terminators don't follow terminators.
   // Ignore predicated terminators formed by if conversion.
   // FIXME: If conversion shouldn't need to violate this rule.
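For reference, the intended usage of the two flags from target code, distilled from
the AMDGPU changes further down. This is an illustrative sketch only, not part of the
patch: tagMaskOps, CondReg and SaveReg are placeholders, and the AMDGPU target-internal
headers are assumed to be available.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "SIInstrInfo.h" // AMDGPU target-internal, assumed in-tree.

using namespace llvm;

// Tag the EXEC-mask manipulation around divergent control flow so that
// generic code can tell the block prolog/epilog apart from the block body.
static void tagMaskOps(MachineBasicBlock &Head, MachineBasicBlock &Join,
                       const SIInstrInfo *TII, unsigned CondReg,
                       unsigned SaveReg) {
  DebugLoc DL;
  // Epilog of Head: save EXEC and narrow it to the taken lanes. The flag
  // makes isTerminator() true, so generic passes won't sink code below it.
  BuildMI(Head, Head.end(), DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveReg)
      .addReg(CondReg)
      .setMIFlag(MachineInstr::Terminator);
  // Prolog of Join: restore the saved lanes. The verifier change above
  // requires all Initiator-flagged instructions to lead the block.
  BuildMI(Join, Join.begin(), DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(SaveReg)
      .setMIFlag(MachineInstr::Initiator);
}

Note the asymmetry: isInitiator() is purely flag-driven, while isTerminator() remains
true for MCID-level terminators as well; the TODOs about QueryType track the open
question of bundle handling.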
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -357,12 +357,6 @@
                                 MachineBasicBlock::iterator I,
                                 const Counters &Required) {
 
-  // End of program? No need to wait on anything
-  // A function not returning void needs to wait, because other bytecode will
-  // be appended after it and we don't know what it will be.
-  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
-    return false;
-
   // Figure out if the async instructions execute in order
   bool Ordered[3];
 
@@ -409,11 +403,17 @@
   ExpInstrTypesSeen = 0;
 
   // Build the wait instruction
-  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  MachineInstr *Wait =
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
           .addImm((Counts.Named.VM & 0xF) |
                   ((Counts.Named.EXP & 0x7) << 4) |
                   ((Counts.Named.LGKM & 0xF) << 8));
 
+  if (MachineInstr *Prev = Wait->getPrevNode()) {
+    if (Prev->isTerminator())
+      Wait->setFlag(MachineInstr::Terminator);
+  }
+
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
   return true;
@@ -585,12 +585,13 @@
 
       Counters Required;
 
-      // Wait for everything before a barrier.
+      // Wait for everything before a branch or barrier.
       //
       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
       // but we also want to wait for any other outstanding transfers before
       // signalling other hardware blocks
-      if (I->getOpcode() == AMDGPU::S_BARRIER ||
+      if (I->isBranch() || I->getOpcode() == AMDGPU::SI_MASK_BRANCH ||
+          I->getOpcode() == AMDGPU::S_BARRIER ||
           I->getOpcode() == AMDGPU::S_SENDMSG)
         Required = LastIssued;
       else
@@ -607,8 +608,11 @@
         handleSendMsg(MBB, I);
     }
 
-    // Wait for everything at the end of the MBB
-    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+    // Wait for everything at the end of the MBB, in case there are no
+    // branches. No need to wait at the end of the (void-returning) program,
+    // since the hardware does so automatically.
+    if (!MBB.empty() && MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+      Changes |= insertWait(MBB, MBB.end(), LastIssued);
   }
 
   for (MachineInstr *I : RemoveMI)
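For context on the .addImm() expression above: S_WAITCNT packs all three hardware
counters into a single immediate. A standalone sketch of the encoding as used by
insertWait() (field widths on this hardware generation: 4-bit vmcnt, 3-bit expcnt,
4-bit lgkmcnt); encodeWaitcnt is a hypothetical helper, not part of the patch.

#include <cstdint>

// vmcnt in bits [3:0], expcnt in bits [6:4], lgkmcnt in bits [11:8].
// A counter value N means "stall until at most N operations of that class
// are still outstanding", so the field's maximum value means "no wait".
static uint16_t encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
  return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0xF) << 8);
}

// Example: encodeWaitcnt(0, 0x7, 0xF) == 0xF70 drains all vector-memory
// operations while leaving export and LGKM traffic unconstrained.

Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -125,12 +125,14 @@
   MachineInstr *AndSaveExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
-    .addOperand(Cond);
+    .addOperand(Cond)
+    .setMIFlag(MachineInstr::Terminator);
 
   MachineInstr *Xor =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
     .addReg(AMDGPU::EXEC)
-    .addReg(SaveExecReg);
+    .addReg(SaveExecReg)
+    .setMIFlag(MachineInstr::Terminator);
 
   // Insert a pseudo terminator to help keep the verifier happy. This will also
   // be used later when inserting skips.
@@ -171,7 +173,8 @@
   // else.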
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
-    .addOperand(MI.getOperand(1)); // Saved EXEC
+    .addOperand(MI.getOperand(1))  // Saved EXEC
+    .setMIFlag(MachineInstr::Initiator);
 
   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
   MachineBasicBlock::iterator ElsePt(MI);
@@ -180,7 +183,8 @@
   MachineInstr *And =
     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
     .addReg(AMDGPU::EXEC)
-    .addReg(DstReg);
+    .addReg(DstReg)
+    .setMIFlag(MachineInstr::Terminator);
 
   if (LIS)
     LIS->InsertMachineInstrInMaps(*And);
@@ -189,12 +193,13 @@
   MachineInstr *Xor =
     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addReg(DstReg);
+    .addReg(DstReg)
+    .setMIFlag(MachineInstr::Terminator);
 
-  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
-  // Insert a pseudo terminator to help keep the verifier happy.
+  // Insert an additional pseudo terminator to help keep the verifier happy
+  // and mark the location for skips to be inserted later.
   MachineInstr *Branch =
-    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
     .addMBB(DestBB);
 
   if (!LIS) {
@@ -248,7 +253,8 @@
   MachineInstr *AndN2 =
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addOperand(MI.getOperand(0));
+    .addOperand(MI.getOperand(0))
+    .setMIFlag(MachineInstr::Terminator);
 
   MachineInstr *Branch =
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
@@ -270,7 +276,8 @@
   MachineInstr *NewMI =
     BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addOperand(MI.getOperand(0));
+    .addOperand(MI.getOperand(0))
+    .setMIFlag(MachineInstr::Initiator);
 
   if (LIS)
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -41,7 +41,7 @@
 ; GCN: s_andn2_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
-; GCN: ; BB#4: ; %bb9
+; GCN-NEXT: BB0_4: ; %bb9
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_endpgm
 define void @break_loop(i32 %arg) #0 {
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -167,7 +167,7 @@
 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
-; SI: BB#5
+; SI: BB{{[0-9]+_[0-9]+}}: ; %Flow8
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
 
 ; SI: [[LABEL_EXIT]]:
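A consequence of flagging the mask manipulation as Terminator: passes that must insert
code "at the bottom of a block, but still under the block's original EXEC mask" can now
find that point generically instead of special-casing opcodes. A possible helper, shown
only as a sketch (epilogBegin is hypothetical and not part of this patch):

#include <iterator>
#include "llvm/CodeGen/MachineBasicBlock.h"

using namespace llvm;

// Return the first instruction of the block epilog, i.e. the insertion
// point just above the run of terminators (flagged or MCID-level) that the
// verifier keeps contiguous at the end of the block.
static MachineBasicBlock::iterator epilogBegin(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator I = MBB.end();
  while (I != MBB.begin() && std::prev(I)->isTerminator())
    --I;
  return I;
}

This mirrors what SIInsertWaits now does by hand when it tags an S_WAITCNT whose
predecessor is a terminator.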