diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -18,9 +18,11 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -28,6 +30,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -52,16 +55,16 @@
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   unsigned SkipThreshold = 0;
+  MachineDominatorTree *MDT = nullptr;
 
   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;
 
-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+  bool dominatesAllReachable(MachineBasicBlock &MBB);
+  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  DebugLoc DL);
 
-  void kill(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+  bool kill(MachineInstr &MI);
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
@@ -79,6 +82,8 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -87,8 +92,11 @@
 
 char SIInsertSkips::ID = 0;
 
-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
-                "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+                      "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+                    "SI insert s_cbranch_execz instructions", false, false)
 
 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
 
@@ -146,42 +154,73 @@
   return false;
 }
 
-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
-
-  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(MBB, MBB.getParent()->back()))
-    return false;
-
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock *Other : depth_first(&MBB)) {
+    if (!MDT->dominates(&MBB, Other))
+      return false;
+  }
+  return true;
+}
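
For intuition, here is the kind of CFG this check rejects (a hypothetical IR sketch, not taken from the patch or its tests): the kill sits under a divergent branch, so the block containing it does not dominate the join point. Lanes that took the other path are merely inactive there and must be re-enabled at the join, so an "exec=0 -> end program" inserted after this kill would be wrong:

  declare void @llvm.amdgcn.kill(i1)

  define amdgpu_ps float @kill_in_divergent_branch(float %x, float %v) {
  entry:
    %c = fcmp olt float %x, 0.0
    br i1 %c, label %kill.bb, label %join   ; divergent branch on a VGPR value

  kill.bb:                                  ; reachable block %join is also
    call void @llvm.amdgcn.kill(i1 false)   ; reached straight from %entry, so
    br label %join                          ; %kill.bb does not dominate it

  join:
    ret float %v
  }
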
 
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL) {
+  MachineFunction *MF = MBB.getParent();
+  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+  // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
+  // terminator of a basic block. If this ever changes, we need to optionally
+  // split MBB here.
+  assert(I == MBB.end());
+
+  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+  // basic block that has no further successors (e.g., there was an
+  // `unreachable` there in IR). This can happen when the original source has
+  // the form:
+  //
+  //   if (uniform_condition) {
+  //     write_to_memory();
+  //     discard;
+  //   }
+  //
+  // In this case, we write the "null export; s_endpgm" skip code in the
+  // already-existing basic block.
+  auto NextBBI = std::next(MBB.getIterator());
+  bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+  MachineBasicBlock *SkipBB;
+
+  if (NoSuccessor) {
+    SkipBB = &MBB;
+  } else {
+    // Create a new basic block that will contain the "null export; s_endpgm"
+    // and set up the branching to go around it.
+    SkipBB = MF->CreateMachineBasicBlock();
+    MF->insert(NextBBI, SkipBB);
 
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addImm(1) // vm
-    .addImm(0) // compr
-    .addImm(0); // en
+    BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
+    MBB.addSuccessor(SkipBB);
 
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+    MDT->addNewBlock(SkipBB, &MBB);
+  }
 
-  return true;
+  // Generate "null export; s_endpgm".
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
+      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addImm(1)  // vm
+      .addImm(0)  // compr
+      .addImm(0); // en
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }
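
The NoSuccessor path above matches IR where the kill is followed by `unreachable`, so the block has no successors and the null export can be written into the block itself. A hypothetical reduced example of such input (the buffer store simply stands in for the write_to_memory() of the comment):

  declare void @llvm.amdgcn.kill(i1)
  declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

  define amdgpu_ps void @uniform_discard(i32 inreg %cond, <4 x i32> inreg %rsrc, float %v) {
  entry:
    %c = icmp eq i32 %cond, 0
    br i1 %c, label %end, label %kill.bb   ; uniform branch (SGPR condition)

  kill.bb:                                 ; ends in unreachable: no successors
    call void @llvm.amdgcn.raw.buffer.store.f32(float %v, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
    call void @llvm.amdgcn.kill(i1 false)
    unreachable

  end:
    ret void
  }
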
 
-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
@@ -268,7 +307,7 @@
       I.addImm(0); // omod
     }
 
-    break;
+    return true;
   }
   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     const MachineFunction *MF = MI.getParent()->getParent();
@@ -283,11 +322,13 @@
       int64_t Imm = Op.getImm();
       assert(Imm == 0 || Imm == -1);
 
-      if (Imm == KillVal)
+      if (Imm == KillVal) {
         BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                      : AMDGPU::S_MOV_B64),
                 Exec)
             .addImm(0);
-      break;
+        return true;
+      }
+      return false;
     }
 
     unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
@@ -296,27 +337,13 @@
     BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
         .addReg(Exec)
        .add(Op);
-    break;
+    return true;
   }
   default:
     llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
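
For the SI_KILL_I1_TERMINATOR variant, the non-degenerate path above boils down to a single exec update. Schematically, assuming wave64 and assuming the i1 operand happens to live in vcc (the register choice is illustrative; wave32 would use the *_B32 forms as elided between the hunks):

  s_andn2_b64 exec, exec, vcc   ; KillVal == -1: kill lanes whose condition is set
  s_and_b64   exec, exec, vcc   ; KillVal ==  0: kill lanes whose condition is clear
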
 
-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
-
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, SkipBB);
-  MBB.addSuccessor(SkipBB);
-
-  return SkipBB;
-}
-
 // Returns true if a branch over the block was inserted.
 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                    MachineBasicBlock &SrcMBB) {
@@ -430,47 +457,21 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;
 
-  bool HaveKill = false;
-  bool MadeChange = false;
-
-  // Track depth of exec mask, divergent branches.
-  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
-  MachineFunction::iterator NextBB;
-
   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+  SmallVector<MachineInstr *, 4> KillInstrs;
+  bool MadeChange = false;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; BI = NextBB) {
-    NextBB = std::next(BI);
-    MachineBasicBlock &MBB = *BI;
-    bool HaveSkipBlock = false;
-
-    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
-      // Reached convergence point for last divergent branch.
-      ExecBranchStack.pop_back();
-    }
-
-    if (HaveKill && ExecBranchStack.empty()) {
-      HaveKill = false;
-
-      // TODO: Insert skip if exec is 0?
-    }
-
+  for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator I, Next;
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
-      case AMDGPU::S_CBRANCH_EXECZ:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
-        break;
       case AMDGPU::SI_MASK_BRANCH:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
         MadeChange |= skipMaskBranch(MI, MBB);
         break;
 
@@ -478,32 +479,37 @@
       case AMDGPU::S_BRANCH:
         // Optimize out branches to the next block.
         // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+          assert(&MI == &MBB.back());
           MI.eraseFromParent();
-        } else if (HaveSkipBlock) {
-          // Remove the given unconditional branch when a skip block has been
-          // inserted after the current one and let skip the two instructions
-          // performing the kill if the exec mask is non-zero.
-          MI.eraseFromParent();
+          MadeChange = true;
         }
         break;
 
       case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-      case AMDGPU::SI_KILL_I1_TERMINATOR:
+      case AMDGPU::SI_KILL_I1_TERMINATOR: {
         MadeChange = true;
-        kill(MI);
-
-        if (ExecBranchStack.empty()) {
-          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
-            HaveSkipBlock = true;
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
+        bool CanKill = kill(MI);
+
+        // Check if we can add an early "if exec=0 { end shader }".
+        //
+        // Note that we _always_ do this if it is correct, even if the kill
+        // happens fairly late in the shader, because the null export should
+        // generally still be cheaper than normal export(s).
+        //
+        // TODO: The dominatesAllReachable check is conservative: if the
+        // dominance is only missing due to _uniform_ branches, we could
+        // in fact insert the early-exit as well.
+        if (CanKill &&
+            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          // Mark the instruction for kill-if-dead insertion. We delay this
+          // change because it modifies the CFG.
+          KillInstrs.push_back(&MI);
         } else {
-          HaveKill = true;
+          MI.eraseFromParent();
        }
-
-        MI.eraseFromParent();
         break;
+      }
 
       case AMDGPU::SI_RETURN_TO_EPILOG:
         // FIXME: Should move somewhere else
@@ -511,7 +517,7 @@
         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
         // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+        if (&MBB != &MF.back() || &MI != &MBB.back()) {
           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
           // the end and jump there.
           if (!EmptyMBBAtEnd) {
@@ -520,9 +526,9 @@
           }
 
           MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
             .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
+          MI.eraseFromParent();
         }
         break;
 
@@ -537,5 +543,12 @@
     }
   }
 
+  for (MachineInstr *Kill : KillInstrs) {
+    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+               Kill->getDebugLoc());
+    Kill->eraseFromParent();
+  }
+  KillInstrs.clear();
+
   return MadeChange;
 }
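
For orientation before the test updates: the early exit emitted by skipIfDead has the following shape in the final ISA (labels are illustrative; this is the pattern the new FileCheck lines below look for):

    s_cbranch_execnz BB0_2                ; some lane still live -> skip early exit
  ; %bb.1:
    exp null off, off, off, off done vm   ; null export, then terminate the wave
    s_endpgm
  BB0_2:                                  ; normal shader code continues
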
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -12,7 +12,11 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: s_cbranch_execnz BB1_2
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -23,9 +27,15 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB2_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK: BB2_2:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB2_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB2_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -36,7 +46,10 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB3_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB3_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -48,9 +61,15 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB4_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB4_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -59,12 +78,19 @@
   ret void
 }
 
+; FIXME: Ideally only one early-exit would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB5_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB5_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -119,14 +145,12 @@
 ; CHECK: v_nop_e64
 
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
+
+; CHECK: v_mov_b32_e32 v0, 1.0
+define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %bb, label %exit
@@ -149,7 +173,7 @@
   br label %exit
 
 exit:
-  ret void
+  ret float 1.0
 }
 
 ; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
@@ -171,13 +195,10 @@
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
 
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
 ; CHECK: buffer_store_dword v8
 ; CHECK: v_mov_b32_e64 v9, -2
@@ -435,10 +456,7 @@
 
 ; CHECK-LABEL: {{^}}complex_loop:
 ; CHECK: s_mov_b64 exec, 0
-; The following is an error, since it happens nested inside the loop:
-; CHECK-NEXT: s_cbranch_execnz
-; CHECK-NEXT: ; %bb.{{[0-9]+}}
-; CHECK-NEXT: exp null
+; CHECK-NOT: exp null
 define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 .entry:
   %flaga = icmp sgt i32 %cmpa, 0