diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -119,22 +120,36 @@
 
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+      Value *Undef = UndefValue::get(B.getFloatTy());
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                        {
+                          B.getInt32(9), // target, SQ_EXP_NULL
+                          B.getInt32(0), // enabled channels
+                          Undef, Undef, Undef, Undef, // values
+                          B.getTrue(), // done
+                          B.getTrue(), // valid mask
+                        });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +188,8 @@
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +205,20 @@
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -260,6 +291,6 @@
   const TargetTransformInfo &TTI
     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; Although it's modeled without any control flow in order to get better code
+; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
+it with "false". In case it's called in a provably infinite loop, we still
+; need to successfully exit and export something, even if we can't know where
+; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
+; this case right before the s_endpgm to avoid GPU hangs, which is what this
+; tests.
+
+; CHECK-LABEL: return_void
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @return_void(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
+  ret void
+}
+
+; In case there's an epilog, we shouldn't have to do this.
+; CHECK-LABEL: return_nonvoid
+; CHECK-NOT: exp null off, off, off, off done vm
+define amdgpu_ps float @return_nonvoid(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  ret float 0.
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }