diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -68,7 +68,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  bool UpdateExports);
   bool runOnFunction(Function &F) override;
 };
@@ -133,46 +133,94 @@
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
+// Reverse search a block to find a "done" export.
+static IntrinsicInst *findExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  for (auto &I : reverse(*BB)) {
+    if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        if (Intrin->getArgOperand(6) == BoolTrue)
+          return Intrin;
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        if (Intrin->getArgOperand(4) == BoolTrue)
+          return Intrin;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Search a block and its predecessors to see if there is a reachable
+// "done" export.
+// Note: this assumes the IR is already well-formed.
+static bool hasExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  SmallPtrSet<BasicBlock *, 8> Visited;
+  SmallVector<BasicBlock *, 8> Stack;
+
+  Stack.push_back(BB);
+  do {
+    BasicBlock *Top = Stack.pop_back_val();
+    if (findExportDone(Top, BoolTrue))
+      return true;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  } while (!Stack.empty());
+
+  return false;
+}
 
 BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
     Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
-  // Otherwise, we need to insert a new basic block into the function, add a PHI
-  // nodes (if the function returns values), and convert all of the return
-  // instructions into unconditional branches.
-  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+    bool UpdateExports) {
+  const StringRef ExportBlockName = "UnifiedExportBlock";
+  const StringRef ReturnBlockName = "UnifiedReturnBlock";
+
+  // Process exports first so that any new blocks are inserted in order.
+  BasicBlock *ExportBlock = nullptr;
+  SmallPtrSet<BasicBlock *, 8> NeedsExport;
+
+  if (UpdateExports) {
+    // Determine whether return blocks have a reachable "done" export.
+    // Note: this assumes any reachable export is uniform; it is up to
+    // frontends to ensure this is true.
+    for (BasicBlock *BB : ReturningBlocks) {
+      if (!hasExportDone(BB, ConstantInt::getTrue(F.getContext())))
+        NeedsExport.insert(BB);
+    }
+    if (!NeedsExport.empty()) {
+      // Insert an export block containing a null export.
+      // If every return needs an export, this will become the return block.
+      ExportBlock = BasicBlock::Create(
+          F.getContext(),
+          NeedsExport.size() == ReturningBlocks.size() ? ReturnBlockName
+                                                       : ExportBlockName,
+          &F);
+      IRBuilder<> B(ExportBlock);
+      Value *Undef = UndefValue::get(B.getFloatTy());
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp, {B.getFloatTy()},
+                        {
+                            B.getInt32(AMDGPU::Exp::ET_NULL),
+                            B.getInt32(0),              // enabled channels
+                            Undef, Undef, Undef, Undef, // values
+                            B.getTrue(),                // done
+                            B.getTrue(),                // valid mask
+                        });
+    }
+  }
+
+  // We need to insert a new basic block into the function,
+  // add a PHI node (if the function returns values),
+  // and convert all of the return instructions into unconditional branches.
+  BasicBlock *NewRetBlock =
+      NeedsExport.size() == ReturningBlocks.size()
+          ? ExportBlock
+          : BasicBlock::Create(F.getContext(), ReturnBlockName, &F);
   IRBuilder<> B(NewRetBlock);
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(AMDGPU::Exp::ET_NULL),
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
+  if (ExportBlock && ExportBlock != NewRetBlock)
+    BranchInst::Create(NewRetBlock, ExportBlock);
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
@@ -181,14 +229,16 @@
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
+    assert(!UpdateExports);
    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
   // unconditional branch.
   std::vector<DominatorTree::UpdateType> Updates;
-  Updates.reserve(ReturningBlocks.size());
+  Updates.reserve(ReturningBlocks.size() + 1);
+  if (ExportBlock && ExportBlock != NewRetBlock)
+    Updates.push_back({DominatorTree::Insert, ExportBlock, NewRetBlock});
   for (BasicBlock *BB : ReturningBlocks) {
     // Add an incoming element to the PHI node for every return instruction that
     // is merging into this new block...
@@ -197,8 +247,13 @@
 
     // Remove and delete the return inst.
     BB->getTerminator()->eraseFromParent();
-    BranchInst::Create(NewRetBlock, BB);
-    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
+    if (NeedsExport.count(BB)) {
+      BranchInst::Create(ExportBlock, BB);
+      Updates.push_back({DominatorTree::Insert, BB, ExportBlock});
+    } else {
+      BranchInst::Create(NewRetBlock, BB);
+      Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
+    }
   }
 
   if (RequireAndPreserveDomTree)
@@ -239,7 +294,7 @@
 
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
+  bool UpdateExports = false;
 
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
@@ -271,24 +326,14 @@
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
+        // Note: we only insert a null export in the case where an existing
+        // "done" export is not reachable from a return block.
+        // This avoids double "done" exports for well-formed IR generated by
+        // existing frontends (which ensure there is only one such export,
+        // occurring in uniform control flow).
        if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-          InsertExport = true;
+          UpdateExports = true;
        }
 
        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
@@ -382,7 +427,7 @@
 
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1 && !UpdateExports)
     return Changed; // Already has a single return block
 
@@ -391,11 +436,10 @@
   // Unify returning blocks. If we are going to insert the export it is also
   // necessary to include blocks that are uniformly reached, because in
   // addition to inserting the export the "done" bits will be cleared on the
   // original exports, and we do not want to end up with the normal export
   // in a non-unified, uniformly reached block with the "done" bit cleared.
   auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
+  if (UpdateExports) {
     llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
   }
 
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, BlocksToUnify, UpdateExports);
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -9,10 +9,9 @@
 ; tests.
 
 ; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} done vm
+; CHECK-NOT: exp
+; CHECK: s_endpgm
 define amdgpu_ps void @return_void(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
@@ -27,11 +26,10 @@
   ret void
 }
 
-; Check that we also remove the done bit from compressed exports correctly.
 ; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off done compr vm
+; CHECK-NOT: exp
+; CHECK: s_endpgm
 define amdgpu_ps void @return_void_compr(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -718,9 +718,8 @@
   unreachable
 }
 
-; Test that there is an extra export inserted after the normal export,
-; if the normal export is inside a uniformly reached block and there is
-; an infinite loop in the pixel shader.
+; Test that there is an extra export inserted for the infinite loop
+; exit path of the pixel shader.
 
 ; IR-LABEL: @uniformly_reached_export
 ; IR-NEXT: .entry:
@@ -728,20 +727,23 @@
 
 ; IR: [[FLOW]]:
 ; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %[[FLOW1:.*]]
 
 ; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR-NEXT: br i1 false, label %UnifiedExportBlock, label %[[LOOP]]
 
 ; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
 ; IR-NEXT: br label %[[FLOW]]
 
 ; IR: [[FLOW1]]:
 ; IR-NEXT: br label %UnifiedReturnBlock
 
-; IR: UnifiedReturnBlock:
+; IR: UnifiedExportBlock:
 ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: br label %[[FLOW1]]
+
+; IR: UnifiedReturnBlock:
 ; IR-NEXT: ret void
 
 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
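
For reference, here is a minimal hand-written LLVM IR sketch of the exit structure the updated unifyReturnBlockSet is intended to produce when only some return paths lack a reachable "done" export. It mirrors the IR checks in multi-divergent-exit-region.ll above; the function, block, and value names other than UnifiedExportBlock and UnifiedReturnBlock are invented for illustration, and this is not output captured from the pass.

    declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)

    define amdgpu_ps void @unified_exit_sketch(float %r) {
    entry:
      %cc = fcmp olt float %r, 0.000000e+00
      br i1 %cc, label %no.export.path, label %export.path

    export.path:
      ; The original export keeps its "done" bit and branches straight to the
      ; unified return block; it is not routed through the null-export block.
      call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 true)
      br label %UnifiedReturnBlock

    no.export.path:
      ; A return path with no reachable "done" export is routed through the
      ; inserted export block instead.
      br label %UnifiedExportBlock

    UnifiedExportBlock:
      ; exp null (target 9), no channels enabled, done = true, valid mask = true.
      call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
      br label %UnifiedReturnBlock

    UnifiedReturnBlock:
      ret void
    }

When every return path lacks a reachable "done" export, no separate UnifiedExportBlock is created; the null export is emitted directly in the unified return block instead.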