diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -208,6 +208,7 @@
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
   SmallVector<BasicBlock *, 4> UnreachableBlocks;
 
   // Dummy return block for infinite loop.
@@ -219,6 +220,8 @@
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         ReturningBlocks.push_back(BB);
+      else
+        UniformlyReachedRetBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         UnreachableBlocks.push_back(BB);
@@ -332,6 +335,18 @@
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
+  // Unify returning blocks. If we are going to insert the export it is also
+  // necessary to include blocks that are uniformly reached, because in addition
+  // to inserting the export the "done" bits on existing exports will be cleared
+  // and we do not want to end up with the normal export in a non-unified,
+  // uniformly reached block with the "done" bit cleared.
+  auto BlocksToUnify = std::move(ReturningBlocks);
+  if (InsertExport) {
+    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
+                         UniformlyReachedRetBlocks.end());
+  }
+
+  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+                      "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -719,6 +719,46 @@
   unreachable
 }
 
+; Test that there is an extra export inserted after the normal export,
+; if the normal export is inside a uniformly reached block and there is
+; an infinite loop in the pixel shader.
+
+; IR-LABEL: @uniformly_reached_export
+; IR-NEXT: .entry:
+; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
+
+; IR: [[FLOW]]:
+; IR-NEXT: phi
+; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]]
+
+; IR: [[FLOW2]]:
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: [[EXP]]:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg false, i1 immarg true)
+; IR-NEXT: br label %[[FLOW]]
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: ret void
+
+define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp26, label %.preheader.1, label %bb27
+
+.preheader.1:                                     ; preds = %.entry
+  br label %bb
+
+bb:                                               ; preds = %bb, %.preheader.1
+  br label %bb
+
+bb27:                                             ; preds = %.entry
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll
--- a/llvm/test/CodeGen/AMDGPU/update-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -17,8 +17,6 @@
 ; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
 ; IR: TransitionBlock:
 ; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
-; IR: n31:
-; IR-NEXT: ret void
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
 ; IR-NEXT: ret void
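
Context for the "done" bit reasoning in the comment added to AMDGPUUnifyDivergentExitNodes.cpp above: in the AMDGPU export intrinsics the second-to-last i1 operand is the "done" flag, and a pixel shader's last export must be the one with "done" set. When the pass has to insert the null export (target 9, as checked in UnifiedReturnBlock above), it clears "done" on existing exports, so any uniformly reached return block holding the original export must also be folded into the unified return block. The sketch below is illustrative only and not part of the patch; the function name @done_bit_sketch is made up, while the @llvm.amdgcn.exp.f32 signature matches the one exercised by the tests.

; Sketch: a trivial pixel shader whose only export carries done=true.
; After this patch, if an infinite loop forces the extra export, the call
; below ends up in UnifiedReturnBlock territory with done=false and a null
; export (target 9) with done=true is emitted as the final export instead.
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)

define amdgpu_ps void @done_bit_sketch(float %r, float %g, float %b, float %a) {
entry:
  ; Color export to MRT0 (target 0, enable mask 15); operands 7 and 8 are
  ; the "done" and "vm" flags.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g, float %b, float %a, i1 true, i1 true)
  ret void
}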