diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -195,7 +195,12 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - if (PDT.getRoots().size() <= 1) + + // If there's only one exit, we don't need to do anything, unless this is a + // pixel shader and that exit is an infinite loop, since we still have to + // insert an export in that case. + if (PDT.getRoots().size() <= 1 && + F.getCallingConv() != CallingConv::AMDGPU_PS) return false; LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>(); @@ -321,7 +326,7 @@ if (ReturningBlocks.empty()) return false; // No blocks return - if (ReturningBlocks.size() == 1) + if (ReturningBlocks.size() == 1 && !InsertExport) return false; // Already has a single return block const TargetTransformInfo &TTI diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -45,6 +45,22 @@ ret void } +; test the case where there's only a kill in an infinite loop +; CHECK-LABEL: only_kill +; CHECK: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm +; SIInsertSkips inserts an extra null export here, but it should be harmless. +; CHECK: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @only_kill() #0 { +main_body: + br label %loop + +loop: + call void @llvm.amdgcn.kill(i1 false) #3 + br label %loop +} + ; In case there's an epilog, we shouldn't have to do this. 
; CHECK-LABEL: return_nonvoid ; CHECK-NOT: exp null off, off, off, off done vm diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -14,12 +14,13 @@ ; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] ; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 ; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]] ; IR: TransitionBlock: ; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]] ; IR: n31: ; IR-NEXT: ret void -; IR: DummyReturnBlock: +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) ; IR-NEXT: ret void ; .entry: