diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -68,7 +68,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  StringRef Name);
   bool runOnFunction(Function &F) override;
 };
 
@@ -133,47 +133,15 @@
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
 BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
     Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
+    StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
   IRBuilder<> B(NewRetBlock);
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(AMDGPU::Exp::ET_NULL),
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
-
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
     B.CreateRetVoid();
@@ -181,7 +149,6 @@
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
     B.CreateRet(PN);
   }
 
@@ -221,10 +188,8 @@
 
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
 
-  // If there's only one exit, we don't need to do anything, unless this is a
-  // pixel shader and that exit is an infinite loop, since we still have to
-  // insert an export in that case.
-  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+  // If there's only one exit, we don't need to do anything.
+  if (PDT.root_size() <= 1)
     return false;
 
   LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
@@ -233,14 +198,11 @@
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
-  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
   SmallVector<BasicBlock *, 4> UnreachableBlocks;
 
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
 
@@ -248,8 +210,6 @@
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         ReturningBlocks.push_back(BB);
-      else
-        UniformlyReachedRetBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         UnreachableBlocks.push_back(BB);
@@ -261,36 +221,6 @@
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-        // For pixel shaders, the producer guarantees that an export is
-        // executed before each return instruction. However, if there is an
-        // infinite loop and we insert a return ourselves, we need to uphold
-        // that guarantee by inserting a null export. This can happen e.g. in
-        // an infinite loop with kill instructions, which is supposed to
-        // terminate. However, we don't need to do this if there is a non-void
-        // return value, since then there is an epilog afterwards which will
-        // still export.
-        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
-        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-            RetTy->isVoidTy()) {
-          InsertExport = true;
-        }
-
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -382,20 +312,9 @@
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1)
     return Changed; // Already has a single return block
 
-  // Unify returning blocks. If we are going to insert the export it is also
-  // necessary to include blocks that are uniformly reached, because in addition
-  // to inserting the export the "done" bits on existing exports will be cleared
-  // and we do not want to end up with the normal export in a non-unified,
-  // uniformly reached block with the "done" bit cleared.
-  auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
-    llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
-  }
-
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -34,11 +34,8 @@
 ; CHECK-NEXT: ; %bb.4: ; %end
 ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
 ; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: exp mrt0 v1, v1, v1, v0 vm
+; CHECK-NEXT: exp mrt0 v1, v1, v1, v0 done vm
 ; CHECK-NEXT: BB0_5: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: BB0_6:
 ; CHECK-NEXT: s_mov_b64 exec, 0
@@ -81,11 +78,8 @@
 ; CHECK-NEXT: s_cbranch_execz BB1_5
 ; CHECK-NEXT: ; %bb.4: ; %end
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: exp mrt0 v0, off, v0, off compr vm
+; CHECK-NEXT: exp mrt0 v0, off, v0, off done compr vm
 ; CHECK-NEXT: BB1_5: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: BB1_6:
 ; CHECK-NEXT: s_mov_b64 exec, 0
@@ -112,16 +106,12 @@
 ; CHECK-NEXT: BB2_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 BB2_4
+; CHECK-NEXT: s_cbranch_scc0 BB2_3
 ; CHECK-NEXT: ; %bb.2: ; %loop
 ; CHECK-NEXT: ; in Loop: Header=BB2_1 Depth=1
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_mov_b64 vcc, exec
-; CHECK-NEXT: s_cbranch_execnz BB2_1
-; CHECK-NEXT: ; %bb.3: ; %UnifiedReturnBlock
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB2_4:
+; CHECK-NEXT: s_branch BB2_1
+; CHECK-NEXT: BB2_3:
 ; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -724,24 +724,16 @@
 
 ; IR-LABEL: @uniformly_reached_export
 ; IR-NEXT: .entry:
-; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
-
-; IR: [[FLOW]]:
-; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]
 
 ; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]
 
 ; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg false, i1 immarg true)
-; IR-NEXT: br label %[[FLOW]]
-
-; IR: [[FLOW1]]:
-; IR-NEXT: br label %UnifiedReturnBlock
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true)
+; IR-NEXT: ret void
 
-; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR: DummyReturnBlock:
 ; IR-NEXT: ret void
 
 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll
--- a/llvm/test/CodeGen/AMDGPU/update-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -14,11 +14,12 @@
 ; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ]
 ; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00
 ; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]]
 ; IR: TransitionBlock:
 ; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
-; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR: n31:
+; IR-NEXT: ret void
+; IR: DummyReturnBlock:
 ; IR-NEXT: ret void
 ;
 .entry:
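
Note (illustration, not part of the patch; function and block names below are made up): after this change, AMDGPUUnifyDivergentExitNodes still funnels divergently reached returns into a single UnifiedReturnBlock, but it no longer clears "done" bits on existing exports or emits a null export there. A minimal sketch of that remaining rewrite:

; Input: an amdgpu_ps function with two divergently reached exits
; (%v is a VGPR argument, so %cc is divergent).
define amdgpu_ps void @divergent_exits_sketch(float %v) {
entry:
  %cc = fcmp olt float %v, 0.000000e+00
  br i1 %cc, label %a, label %b

a:
  ret void

b:
  ret void
}

; Conceptual output of the pass: both returns branch to one unified exit,
; and no llvm.amdgcn.exp call is inserted by this pass anymore.
define amdgpu_ps void @divergent_exits_sketch.unified(float %v) {
entry:
  %cc = fcmp olt float %v, 0.000000e+00
  br i1 %cc, label %a, label %b

a:
  br label %UnifiedReturnBlock

b:
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ret void
}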