diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -66,9 +66,11 @@
   // We can preserve non-critical-edgeness when we unify function exit nodes
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  void unifyExports(Function &F, ArrayRef<BasicBlock *> ReturningBlocks,
+                    IRBuilder<> &B);
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  bool UpdateExports, StringRef Name);
 
   bool runOnFunction(Function &F) override;
 };
@@ -133,36 +135,118 @@
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
+// Search a block in reverse to find a "done" export
+static IntrinsicInst *findExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
+    if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&*I)) {
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        if (Intrin->getArgOperand(6) == BoolTrue)
+          return Intrin;
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        if (Intrin->getArgOperand(4) == BoolTrue)
+          return Intrin;
       }
     }
   }
+  return nullptr;
 }
 
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
-    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
-  // Otherwise, we need to insert a new basic block into the function, add a PHI
-  // nodes (if the function returns values), and convert all of the return
-  // instructions into unconditional branches.
-  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
+// Search a block and its predecessors to find a "done" export
+static IntrinsicInst *findExportDone(BasicBlock *BB,
+                                     DenseSet<BasicBlock *> &Visited,
+                                     ConstantInt *BoolTrue) {
+  SmallVector<BasicBlock *> Stack;
+
+  Stack.push_back(BB);
+  do {
+    BasicBlock *Top = Stack.pop_back_val();
+    IntrinsicInst *Export = findExportDone(Top, BoolTrue);
+    if (Export)
+      return Export;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  } while (!Stack.empty());
+
+  return nullptr;
+}
+
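+// The operands of llvm.amdgcn.exp are (tgt, en, src0, src1, src2, src3,
+// done, vm); llvm.amdgcn.exp.compr takes (tgt, en, src0, src1, done, vm)
+// with <2 x half> sources. In the "en" mask each bit enables one 32-bit
+// channel for the normal form, while each pair of bits covers one packed
+// source for the compressed form.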
+static bool isChannelEnabled(unsigned Channel, uint64_t EnBits, bool IsCompr) {
+  return (!IsCompr && (EnBits & (1 << Channel))) ||
+         (IsCompr && (EnBits & (0x3 << (2 * Channel))));
+}
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
 
+static StringRef unifiedChannelName(unsigned Channel) {
+  switch (Channel) {
+  case 0:
+    return "UnifiedExportVSRC0";
+  case 1:
+    return "UnifiedExportVSRC1";
+  case 2:
+    return "UnifiedExportVSRC2";
+  case 3:
+    return "UnifiedExportVSRC3";
+  default:
+    return "";
+  }
+}
+
+void AMDGPUUnifyDivergentExitNodes::unifyExports(
+    Function &F, ArrayRef<BasicBlock *> ReturningBlocks, IRBuilder<> &B) {
+  // Ensure that there's only one "done" export in the shader by unifying
+  // all "done" exports into a single instruction in the unified return block.
+  // More than one "done" export can lead to undefined behavior.
+
+  // Find exports to unify
+  ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+  DenseMap<BasicBlock *, IntrinsicInst *> Exports;
+  DenseSet<BasicBlock *> Visited;
+  Type *ExportType = nullptr;
+  uint64_t EnBits = 0;
+  uint64_t Target = 0;
+  bool IsCompr = false;
+  bool MatchingExports = true;
+  for (BasicBlock *BB : ReturningBlocks) {
+    IntrinsicInst *Intrin = findExportDone(BB, Visited, BoolTrue);
+    if (!Intrin)
+      continue;
+
+    // Check that all exports we are unifying are consistent
+    uint64_t TargetVal =
+        cast<ConstantInt>(Intrin->getArgOperand(0))->getZExtValue();
+    uint64_t EnBitsVal =
+        cast<ConstantInt>(Intrin->getArgOperand(1))->getZExtValue();
+    Type *TypeVal = Intrin->getFunctionType()->getParamType(2);
+    bool ComprVal = Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
+    if (Exports.size()) {
+      MatchingExports = MatchingExports && (Target == TargetVal) &&
+                        (EnBits == EnBitsVal) && (ExportType == TypeVal) &&
+                        (IsCompr == ComprVal);
+    } else {
+      Target = TargetVal;
+      EnBits = EnBitsVal;
+      ExportType = TypeVal;
+      IsCompr = ComprVal;
+    }
+    Exports[BB] = Intrin;
+  }
+
+  if (!Exports.size() || !MatchingExports) {
+    LLVM_DEBUG(dbgs() << "Unifying exports using null export\n");
+
+    // Clear "done" bit from exports
+    ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+    for (auto Export : Exports) {
+      IntrinsicInst *Intrin = Export.second;
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        Intrin->setArgOperand(6, BoolFalse); // done
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        Intrin->setArgOperand(4, BoolFalse); // done
+      }
+    }
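+
+    // The demoted exports still write their values; the single export with
+    // the "done" bit set is now the null export emitted below into the
+    // unified return block.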
+    // Insert null export for unified return block
     Value *Undef = UndefValue::get(B.getFloatTy());
     B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                       {
@@ -172,7 +256,70 @@
                         B.getTrue(), // done
                         B.getTrue(), // valid mask
                       });
+  } else {
+    LLVM_DEBUG(dbgs() << "Unifying exports by merging\n");
+
+    // Create one phi node per enabled export channel
+    const unsigned Channels = IsCompr ? 2 : 4;
+    PHINode *Phis[4] = {nullptr};
+    for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+      if (isChannelEnabled(Idx, EnBits, IsCompr))
+        Phis[Idx] = B.CreatePHI(ExportType, ReturningBlocks.size(),
+                                unifiedChannelName(Idx));
+    }
+
+    // Add export values to phi nodes and remove old export
+    Value *Undef = UndefValue::get(ExportType);
+    for (BasicBlock *BB : ReturningBlocks) {
+      if (Exports.count(BB)) {
+        IntrinsicInst *Intrin = Exports[BB];
+        for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+          if (isChannelEnabled(Idx, EnBits, IsCompr))
+            Phis[Idx]->addIncoming(Intrin->getOperand(2 + Idx), BB);
+        }
+        Intrin->eraseFromParent();
+      } else {
+        for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+          if (isChannelEnabled(Idx, EnBits, IsCompr))
+            Phis[Idx]->addIncoming(Undef, BB);
+        }
+      }
+    }
+
+    // Insert new unified export based on phi nodes
+    if (IsCompr) {
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp_compr, {ExportType},
+                        {
+                            B.getInt32(Target), B.getInt32(EnBits),
+                            (EnBits & 0x3) ? Phis[0] : Undef,
+                            (EnBits & 0xc) ? Phis[1] : Undef,
+                            B.getTrue(), // done
+                            B.getTrue(), // valid mask
+                        });
+    } else {
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp, {ExportType},
+                        {
+                            B.getInt32(Target), B.getInt32(EnBits),
+                            (EnBits & 0x1) ? Phis[0] : Undef,
+                            (EnBits & 0x2) ? Phis[1] : Undef,
+                            (EnBits & 0x4) ? Phis[2] : Undef,
+                            (EnBits & 0x8) ? Phis[3] : Undef,
+                            B.getTrue(), // done
+                            B.getTrue(), // valid mask
+                        });
+    }
   }
+}
+
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+    bool UpdateExports, StringRef Name) {
+  // Otherwise, we need to insert a new basic block into the function, add PHI
+  // nodes (if the function returns values), and convert all of the return
+  // instructions into unconditional branches.
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (UpdateExports)
+    unifyExports(F, ReturningBlocks, B);
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
@@ -181,7 +328,7 @@
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
+    assert(!UpdateExports);
     B.CreateRet(PN);
   }
 
@@ -239,7 +386,7 @@
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
+  bool UpdateExports = false;
 
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
@@ -265,11 +412,16 @@
       // For pixel shaders, the producer guarantees that an export is
      // executed before each return instruction. However, if there is an
       // infinite loop and we insert a return ourselves, we need to uphold
-      // that guarantee by inserting a null export. This can happen e.g. in
-      // an infinite loop with kill instructions, which is supposed to
-      // terminate. However, we don't need to do this if there is a non-void
-      // return value, since then there is an epilog afterwards which will
-      // still export.
+      // that guarantee by inserting a null export, or moving an existing
+      // export. This can happen e.g. in an infinite loop with kill
+      // instructions, which is supposed to terminate. However, we don't need
+      // to do this if there is a non-void return value, since then there is
+      // an epilog afterwards which will still export.
+      //
+      // We try not to insert a new export: where possible, an existing
+      // export is moved to the unified return block. When that is not
+      // possible, because the final exports are incompatible, the "done"
+      // flag is cleared on all of them and a final null export is added.
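+      // Both strategies are implemented by unifyExports, which runs when the
+      // return blocks are unified below.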
       //
       // Note: In the case where only some threads enter the infinite loop,
       // this can result in the null export happening redundantly after the
@@ -281,14 +433,14 @@
       // Therefore, unless there's an actual infinite loop, which can have
       // invalid results, or there's a kill after the last export, which we
       // assume the frontend won't do, this export will have the same exec
       // mask as the last "real" export, and therefore the valid mask will be
       // overwritten with the same value and will still be correct. Also,
       // even though this forces an extra unnecessary export wait, we assume
       // that this happens rare enough in practice to that we don't have to
       // worry about performance.
       if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-        InsertExport = true;
+        UpdateExports = true;
       }
 
       ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
@@ -382,7 +534,7 @@
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1 && !UpdateExports)
     return Changed; // Already has a single return block
 
   // Unify returning blocks. If we are going to insert the export it is also
@@ -391,11 +543,11 @@
   // and we do not want to end up with the normal export in a non-unified,
   // uniformly reached block with the "done" bit cleared.
   auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
+  if (UpdateExports) {
     llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
   }
 
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
+  unifyReturnBlockSet(F, DTU, BlocksToUnify, UpdateExports,
                       "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -4,13 +4,14 @@
 ; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
 ; it with "true". In case it's called in a provably infinite loop, we still
 ; need to successfully exit and export something, even if we can't know where
-; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
-; this case right before the s_endpgm to avoid GPU hangs, which is what this
-; tests.
+; to jump to in the LLVM IR. Therefore we move the final export to the end of
+; the shader, or insert a null export if no suitable export exists.
 
 ; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
+; Make sure the shader ends with a single "done" export
+; CHECK-NOT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} done vm
+; CHECK-NEXT: s_endpgm
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @return_void(float %0) #0 {
@@ -29,7 +30,10 @@
-; Check that we also remove the done bit from compressed exports correctly.
+; Check that compressed exports are also moved to the end of the shader.
 ; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
+; CHECK-NOT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off done compr vm
+; CHECK-NEXT: s_endpgm
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @return_void_compr(float %0) #0 {
@@ -48,9 +52,10 @@
 ; test the case where there's only a kill in an infinite loop
 ; CHECK-LABEL: only_kill
+; CHECK-NOT: s_endpgm
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
-; SILateBranchLowering inserts an extra null export here, but it should be harmless.
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @only_kill() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -718,41 +718,97 @@
   unreachable
 }
 
-; Test that there is an extra export inserted after the normal export,
-; if the normal export is inside a uniformly reached block and there is
-; an infinite loop in the pixel shader.
+; Test that the divergent exit is unified with the uniform export exit.
 
 ; IR-LABEL: @uniformly_reached_export
 ; IR-NEXT: .entry:
-; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+
+; IR: [[LOOP]]:
+; IR-NEXT: br i1 false, label %[[FLOW:.*]], label %[[LOOP]]
 
 ; IR: [[FLOW]]:
-; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR-NEXT: br label %UnifiedReturnBlock
 
-; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 true, i1 true)
+; IR-NEXT: ret void
 
-; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
-; IR-NEXT: br label %[[FLOW]]
+define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp26, label %loop, label %bb27
+
+loop: ; preds = %loop, %.entry
+  br label %loop
+
+bb27: ; preds = %.entry
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+}
+
+; Test that an early uniform export is moved to the unified exit block.
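+; The export in the entry block is found by searching the predecessors of the
+; returning blocks and is then sunk into the unified return block.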
+
+; IR-LABEL: @uniformly_reached_early_export
+; IR-NEXT: .entry:
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+
+; IR: [[LOOP]]:
+; IR-NEXT: br i1 false, label %[[FLOW:.*]], label %[[LOOP]]
 
-; IR: [[FLOW1]]:
+; IR: [[FLOW]]:
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 ; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 true, i1 true)
 ; IR-NEXT: ret void
 
-define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+define amdgpu_ps void @uniformly_reached_early_export(float inreg %tmp25) {
 .entry:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
   %tmp26 = fcmp olt float %tmp25, 0.000000e+00
   br i1 %tmp26, label %loop, label %bb27
 
-loop:                                             ; preds = %loop, %.entry
+loop: ; preds = %loop, %.entry
   br label %loop
 
 bb27:                                             ; preds = %.entry
+  ret void
+}
+
+; Test that incompatible divergent exports are not unified.
+
+; IR-LABEL: @incompatible_divergent_exports
+
+; IR: bb26:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 1, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br
+
+; IR: bb27:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: ret void
+
+define amdgpu_ps void @incompatible_divergent_exports(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 1.000000e+00
+  br i1 %tmp26, label %loop, label %bb25
+
+loop: ; preds = %loop, %.entry
+  br label %loop
+
+bb25: ; preds = %.entry
+  %tmp27 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp27, label %bb26, label %bb27
+
+bb26: ; preds = %bb25
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 1, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+
+bb27: ; preds = %bb25
   call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
   ret void
 }