diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -68,7 +68,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  bool UpdateExports);
   bool runOnFunction(Function &F) override;
 };
@@ -133,46 +133,94 @@
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
+// Reverse search a block to find a "done" export.
+static IntrinsicInst *findExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  for (auto &I : reverse(*BB)) {
+    if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        if (Intrin->getArgOperand(6) == BoolTrue)
+          return Intrin;
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        if (Intrin->getArgOperand(4) == BoolTrue)
+          return Intrin;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Search a block and its predecessors to see if there is a reachable
+// "done" export.
+// Note: this assumes the IR is already well-formed.
+static bool hasExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  SmallPtrSet<BasicBlock *, 8> Visited;
+  SmallVector<BasicBlock *, 8> Stack;
+
+  Stack.push_back(BB);
+  do {
+    BasicBlock *Top = Stack.pop_back_val();
+    if (findExportDone(Top, BoolTrue))
+      return true;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  } while (!Stack.empty());
+
+  return false;
+}
 
 BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
     Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
-  // Otherwise, we need to insert a new basic block into the function, add a PHI
-  // nodes (if the function returns values), and convert all of the return
-  // instructions into unconditional branches.
-  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+    bool UpdateExports) {
+  const StringRef ExportBlockName = "UnifiedExportBlock";
+  const StringRef ReturnBlockName = "UnifiedReturnBlock";
+
+  // Process exports first so that any new blocks are inserted in order.
+  BasicBlock *ExportBlock = nullptr;
+  SmallPtrSet<BasicBlock *, 8> NeedsExport;
+
+  if (UpdateExports) {
+    // Determine whether return blocks have a reachable "done" export.
+    // Note: this assumes any reachable export is uniform; it is up to
+    // frontends to ensure this is true.
+    for (BasicBlock *BB : ReturningBlocks) {
+      if (!hasExportDone(BB, ConstantInt::getTrue(F.getContext())))
+        NeedsExport.insert(BB);
+    }
+    if (!NeedsExport.empty()) {
+      // Insert an export block containing a null export.
+      // If every return needs an export, this will become the return block.
+      ExportBlock = BasicBlock::Create(
+          F.getContext(),
+          NeedsExport.size() == ReturningBlocks.size() ? ReturnBlockName
+                                                       : ExportBlockName,
+          &F);
+      IRBuilder<> B(ExportBlock);
+      Value *Undef = UndefValue::get(B.getFloatTy());
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp, {B.getFloatTy()},
+                        {
+                            B.getInt32(AMDGPU::Exp::ET_NULL),
+                            B.getInt32(0),              // enabled channels
+                            Undef, Undef, Undef, Undef, // values
+                            B.getTrue(),                // done
+                            B.getTrue(),                // valid mask
+                        });
+    }
+  }
+
+  // We need to insert a new basic block into the function,
+  // add a PHI node (if the function returns values),
+  // and convert all of the return instructions into unconditional branches.
+  BasicBlock *NewRetBlock =
+      NeedsExport.size() == ReturningBlocks.size()
+          ? ExportBlock
+          : BasicBlock::Create(F.getContext(), ReturnBlockName, &F);
   IRBuilder<> B(NewRetBlock);
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(AMDGPU::Exp::ET_NULL),
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
+  if (ExportBlock && ExportBlock != NewRetBlock)
+    BranchInst::Create(NewRetBlock, ExportBlock);
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
@@ -181,14 +229,16 @@
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
+    assert(!UpdateExports);
    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
   // unconditional branch.
   std::vector<DominatorTree::UpdateType> Updates;
-  Updates.reserve(ReturningBlocks.size());
+  Updates.reserve(ReturningBlocks.size() + 1);
+  if (ExportBlock && ExportBlock != NewRetBlock)
+    Updates.push_back({DominatorTree::Insert, ExportBlock, NewRetBlock});
   for (BasicBlock *BB : ReturningBlocks) {
     // Add an incoming element to the PHI node for every return instruction that
     // is merging into this new block...
@@ -197,8 +247,13 @@
 
     // Remove and delete the return inst.
     BB->getTerminator()->eraseFromParent();
-    BranchInst::Create(NewRetBlock, BB);
-    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
+    if (NeedsExport.count(BB)) {
+      BranchInst::Create(ExportBlock, BB);
+      Updates.push_back({DominatorTree::Insert, BB, ExportBlock});
+    } else {
+      BranchInst::Create(NewRetBlock, BB);
+      Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
+    }
   }
 
   if (RequireAndPreserveDomTree)
@@ -239,7 +294,7 @@
 
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
+  bool UpdateExports = false;
 
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
@@ -271,24 +326,14 @@
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
+        // Note: we only insert a null export in the case where an existing
+        // "done" export is not reachable from a return block.
+        // This avoids double "done" exports for well-formed IR generated by
+        // existing frontends (which ensure there is only one such export,
+        // occurring in uniform control flow).
        if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-          InsertExport = true;
+          UpdateExports = true;
        }
 
        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
@@ -382,7 +427,7 @@
 
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1 && !UpdateExports)
     return Changed; // Already has a single return block
 
@@ -391,11 +436,10 @@
   // Unify returning blocks. If we are going to insert the export it is also
   // necessary to include blocks that are uniformly reached, because in
   // addition to inserting the export the "done" bits will be cleared on the
   // original exports, and we do not want to end up with the normal export
   // in a non-unified, uniformly reached block with the "done" bit cleared.
   auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
+  if (UpdateExports) {
     llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
   }
 
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, BlocksToUnify, UpdateExports);
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -9,10 +9,9 @@
 ; tests.
 
 ; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} done vm
+; CHECK-NOT: exp
+; CHECK: s_endpgm
 define amdgpu_ps void @return_void(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
@@ -27,11 +26,10 @@
   ret void
 }
 
-; Check that we also remove the done bit from compressed exports correctly.
 ; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off done compr vm
+; CHECK-NOT: exp
+; CHECK: s_endpgm
 define amdgpu_ps void @return_void_compr(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -718,9 +718,8 @@
   unreachable
 }
 
-; Test that there is an extra export inserted after the normal export,
-; if the normal export is inside a uniformly reached block and there is
-; an infinite loop in the pixel shader.
+; Test that there is an extra export inserted for the infinite loop
+; exit path of the pixel shader.
 
 ; IR-LABEL: @uniformly_reached_export
 ; IR-NEXT: .entry:
@@ -728,20 +727,23 @@
 
 ; IR: [[FLOW]]:
 ; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %[[FLOW1:.*]]
 
 ; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR-NEXT: br i1 false, label %UnifiedExportBlock, label %[[LOOP]]
 
 ; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
 ; IR-NEXT: br label %[[FLOW]]
 
 ; IR: [[FLOW1]]:
 ; IR-NEXT: br label %UnifiedReturnBlock
 
-; IR: UnifiedReturnBlock:
+; IR: UnifiedExportBlock:
 ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: br label %[[FLOW1]]
+
+; IR: UnifiedReturnBlock:
 ; IR-NEXT: ret void
 
 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
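
For reference, here is a minimal hand-written LLVM IR sketch of the exit structure the updated unifyReturnBlockSet is intended to produce when only some return paths lack a reachable "done" export. It mirrors the IR checks in multi-divergent-exit-region.ll above; the function, block, and value names other than UnifiedExportBlock and UnifiedReturnBlock are invented for illustration, and this is not output captured from the pass.

    declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)

    define amdgpu_ps void @unified_exit_sketch(float %r) {
    entry:
      %cc = fcmp olt float %r, 0.000000e+00
      br i1 %cc, label %no.export.path, label %export.path

    export.path:
      ; The original export keeps its "done" bit and branches straight to the
      ; unified return block; it is not routed through the null-export block.
      call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 true)
      br label %UnifiedReturnBlock

    no.export.path:
      ; A return path with no reachable "done" export is routed through the
      ; inserted export block instead.
      br label %UnifiedExportBlock

    UnifiedExportBlock:
      ; exp null (target 9), no channels enabled, done = true, valid mask = true.
      call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
      br label %UnifiedReturnBlock

    UnifiedReturnBlock:
      ret void
    }

When every return path lacks a reachable "done" export, no separate UnifiedExportBlock is created; the null export is emitted directly in the unified return block instead.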