diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -66,9 +66,11 @@
   // We can preserve non-critical-edgeness when we unify function exit nodes
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  void unifyExports(Function &F, ArrayRef<BasicBlock *> ReturningBlocks,
+                    IRBuilder<> &B);
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  bool UpdateExports, StringRef Name);
 
   bool runOnFunction(Function &F) override;
 };
@@ -133,36 +135,118 @@
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
+// Search a block in reverse to find a "done" export
+static IntrinsicInst *findExportDone(BasicBlock *BB, ConstantInt *BoolTrue) {
+  for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
+    if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&*I)) {
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        if (Intrin->getArgOperand(6) == BoolTrue)
+          return Intrin;
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        if (Intrin->getArgOperand(4) == BoolTrue)
+          return Intrin;
       }
     }
   }
+  return nullptr;
 }
 
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
-    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
-  // Otherwise, we need to insert a new basic block into the function, add a PHI
-  // nodes (if the function returns values), and convert all of the return
-  // instructions into unconditional branches.
-  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
+// Search a block and its predecessors to find a "done" export
+static IntrinsicInst *findExportDone(BasicBlock *BB,
+                                     DenseSet<BasicBlock *> &Visited,
+                                     ConstantInt *BoolTrue) {
+  SmallVector<BasicBlock *> Stack;
+
+  Stack.push_back(BB);
+  do {
+    BasicBlock *Top = Stack.pop_back_val();
+    IntrinsicInst *Export = findExportDone(Top, BoolTrue);
+    if (Export)
+      return Export;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  } while (!Stack.empty());
+
+  return nullptr;
+}
+
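+// The operands of llvm.amdgcn.exp are (tgt, en, src0, src1, src2, src3,
+// done, vm); llvm.amdgcn.exp.compr takes (tgt, en, src0, src1, done, vm)
+// with <2 x half> sources. In the "en" mask each bit enables one 32-bit
+// channel for the normal form, while each pair of bits covers one packed
+// source for the compressed form.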
+static bool isChannelEnabled(unsigned Channel, uint64_t EnBits, bool IsCompr) {
+  return (!IsCompr && (EnBits & (1 << Channel))) ||
+         (IsCompr && (EnBits & (0x3 << (2 * Channel))));
+}
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
 
+static StringRef unifiedChannelName(unsigned Channel) {
+  switch (Channel) {
+  case 0:
+    return "UnifiedExportVSRC0";
+  case 1:
+    return "UnifiedExportVSRC1";
+  case 2:
+    return "UnifiedExportVSRC2";
+  case 3:
+    return "UnifiedExportVSRC3";
+  default:
+    return "";
+  }
+}
+
+void AMDGPUUnifyDivergentExitNodes::unifyExports(
+    Function &F, ArrayRef<BasicBlock *> ReturningBlocks, IRBuilder<> &B) {
+  // Ensure that there's only one "done" export in the shader by unifying
+  // all "done" exports into a single instruction in the unified return block.
+  // More than one "done" export can lead to undefined behavior.
+
+  // Find exports to unify
+  ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+  DenseMap<BasicBlock *, IntrinsicInst *> Exports;
+  DenseSet<BasicBlock *> Visited;
+  Type *ExportType = nullptr;
+  uint64_t EnBits = 0;
+  uint64_t Target = 0;
+  bool IsCompr = false;
+  bool MatchingExports = true;
+  for (BasicBlock *BB : ReturningBlocks) {
+    IntrinsicInst *Intrin = findExportDone(BB, Visited, BoolTrue);
+    if (!Intrin)
+      continue;
+
+    // Check that all exports we are unifying are consistent
+    uint64_t TargetVal =
+        cast<ConstantInt>(Intrin->getArgOperand(0))->getZExtValue();
+    uint64_t EnBitsVal =
+        cast<ConstantInt>(Intrin->getArgOperand(1))->getZExtValue();
+    Type *TypeVal = Intrin->getFunctionType()->getParamType(2);
+    bool ComprVal = Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
+    if (Exports.size()) {
+      MatchingExports = MatchingExports && (Target == TargetVal) &&
+                        (EnBits == EnBitsVal) && (ExportType == TypeVal) &&
+                        (IsCompr == ComprVal);
+    } else {
+      Target = TargetVal;
+      EnBits = EnBitsVal;
+      ExportType = TypeVal;
+      IsCompr = ComprVal;
+    }
+    Exports[BB] = Intrin;
+  }
+
+  if (!Exports.size() || !MatchingExports) {
+    LLVM_DEBUG(dbgs() << "Unifying exports using null export\n");
+
+    // Clear "done" bit from exports
+    ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+    for (auto Export : Exports) {
+      IntrinsicInst *Intrin = Export.second;
+      if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+        Intrin->setArgOperand(6, BoolFalse); // done
+      } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+        Intrin->setArgOperand(4, BoolFalse); // done
+      }
+    }
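+
+    // The demoted exports still write their values; the single export with
+    // the "done" bit set is now the null export emitted below into the
+    // unified return block.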
+    // Insert null export for unified return block
     Value *Undef = UndefValue::get(B.getFloatTy());
     B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                       {
@@ -172,7 +256,70 @@
                         B.getTrue(), // done
                         B.getTrue(), // valid mask
                       });
+  } else {
+    LLVM_DEBUG(dbgs() << "Unifying exports by merging\n");
+
+    // Create one phi node per enabled export channel
+    const unsigned Channels = IsCompr ? 2 : 4;
+    PHINode *Phis[4] = {nullptr};
+    for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+      if (isChannelEnabled(Idx, EnBits, IsCompr))
+        Phis[Idx] = B.CreatePHI(ExportType, ReturningBlocks.size(),
+                                unifiedChannelName(Idx));
+    }
+
+    // Add export values to phi nodes and remove old export
+    Value *Undef = UndefValue::get(ExportType);
+    for (BasicBlock *BB : ReturningBlocks) {
+      if (Exports.count(BB)) {
+        IntrinsicInst *Intrin = Exports[BB];
+        for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+          if (isChannelEnabled(Idx, EnBits, IsCompr))
+            Phis[Idx]->addIncoming(Intrin->getOperand(2 + Idx), BB);
+        }
+        Intrin->eraseFromParent();
+      } else {
+        for (unsigned Idx = 0; Idx < Channels; ++Idx) {
+          if (isChannelEnabled(Idx, EnBits, IsCompr))
+            Phis[Idx]->addIncoming(Undef, BB);
+        }
+      }
+    }
+
+    // Insert new unified export based on phi nodes
+    if (IsCompr) {
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp_compr, {ExportType},
+                        {
+                            B.getInt32(Target), B.getInt32(EnBits),
+                            (EnBits & 0x3) ? Phis[0] : Undef,
+                            (EnBits & 0xc) ? Phis[1] : Undef,
+                            B.getTrue(), // done
+                            B.getTrue(), // valid mask
+                        });
+    } else {
+      B.CreateIntrinsic(Intrinsic::amdgcn_exp, {ExportType},
+                        {
+                            B.getInt32(Target), B.getInt32(EnBits),
+                            (EnBits & 0x1) ? Phis[0] : Undef,
+                            (EnBits & 0x2) ? Phis[1] : Undef,
+                            (EnBits & 0x4) ? Phis[2] : Undef,
+                            (EnBits & 0x8) ? Phis[3] : Undef,
+                            B.getTrue(), // done
+                            B.getTrue(), // valid mask
+                        });
+    }
   }
+}
+
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+    bool UpdateExports, StringRef Name) {
+  // Otherwise, we need to insert a new basic block into the function, add PHI
+  // nodes (if the function returns values), and convert all of the return
+  // instructions into unconditional branches.
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (UpdateExports)
+    unifyExports(F, ReturningBlocks, B);
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
@@ -181,7 +328,7 @@
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
+    assert(!UpdateExports);
     B.CreateRet(PN);
   }
 
@@ -239,7 +386,7 @@
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
+  bool UpdateExports = false;
 
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
@@ -265,11 +412,16 @@
       // For pixel shaders, the producer guarantees that an export is
      // executed before each return instruction. However, if there is an
       // infinite loop and we insert a return ourselves, we need to uphold
-      // that guarantee by inserting a null export. This can happen e.g. in
-      // an infinite loop with kill instructions, which is supposed to
-      // terminate. However, we don't need to do this if there is a non-void
-      // return value, since then there is an epilog afterwards which will
-      // still export.
+      // that guarantee by inserting a null export, or moving an existing
+      // export. This can happen e.g. in an infinite loop with kill
+      // instructions, which is supposed to terminate. However, we don't need
+      // to do this if there is a non-void return value, since then there is
+      // an epilog afterwards which will still export.
+      //
+      // We try not to insert a new export: where possible, an existing
+      // export is moved to the unified return block. When that is not
+      // possible, because the final exports are incompatible, the "done"
+      // flag is cleared on all of them and a final null export is added.
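+      // Both strategies are implemented by unifyExports, which runs when the
+      // return blocks are unified below.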
       //
       // Note: In the case where only some threads enter the infinite loop,
       // this can result in the null export happening redundantly after the
@@ -281,14 +433,14 @@
       // Therefore, unless there's an actual infinite loop, which can have
       // invalid results, or there's a kill after the last export, which we
       // assume the frontend won't do, this export will have the same exec
       // mask as the last "real" export, and therefore the valid mask will be
       // overwritten with the same value and will still be correct. Also,
       // even though this forces an extra unnecessary export wait, we assume
       // that this happens rare enough in practice to that we don't have to
       // worry about performance.
       if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-        InsertExport = true;
+        UpdateExports = true;
       }
 
       ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
@@ -382,7 +534,7 @@
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1 && !UpdateExports)
     return Changed; // Already has a single return block
 
   // Unify returning blocks. If we are going to insert the export it is also
@@ -391,11 +543,11 @@
   // and we do not want to end up with the normal export in a non-unified,
   // uniformly reached block with the "done" bit cleared.
   auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
+  if (UpdateExports) {
     llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
   }
 
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
+  unifyReturnBlockSet(F, DTU, BlocksToUnify, UpdateExports,
                       "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -4,13 +4,14 @@
 ; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
 ; it with "true". In case it's called in a provably infinite loop, we still
 ; need to successfully exit and export something, even if we can't know where
-; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
-; this case right before the s_endpgm to avoid GPU hangs, which is what this
-; tests.
+; to jump to in the LLVM IR. Therefore we move the final export to the end of
+; the shader, or insert a null export if no suitable export exists.
 
 ; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
+; Make sure the shader ends with a single "done" export
+; CHECK-NOT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} done vm
+; CHECK-NEXT: s_endpgm
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @return_void(float %0) #0 {
@@ -29,7 +30,10 @@
-; Check that we also remove the done bit from compressed exports correctly.
+; Check that compressed exports are also moved to the end of the shader.
 ; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
+; CHECK-NOT: s_endpgm
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off done compr vm
+; CHECK-NEXT: s_endpgm
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @return_void_compr(float %0) #0 {
@@ -48,9 +52,10 @@
 ; test the case where there's only a kill in an infinite loop
 ; CHECK-LABEL: only_kill
+; CHECK-NOT: s_endpgm
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
-; SILateBranchLowering inserts an extra null export here, but it should be harmless.
+; Early termination block follows
 ; CHECK: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @only_kill() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -718,41 +718,97 @@
   unreachable
 }
 
-; Test that there is an extra export inserted after the normal export,
-; if the normal export is inside a uniformly reached block and there is
-; an infinite loop in the pixel shader.
+; Test that the divergent exit is unified with the uniform export exit.
 
 ; IR-LABEL: @uniformly_reached_export
 ; IR-NEXT: .entry:
-; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+
+; IR: [[LOOP]]:
+; IR-NEXT: br i1 false, label %[[FLOW:.*]], label %[[LOOP]]
 
 ; IR: [[FLOW]]:
-; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR-NEXT: br label %UnifiedReturnBlock
 
-; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 true, i1 true)
+; IR-NEXT: ret void
 
-; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
-; IR-NEXT: br label %[[FLOW]]
+define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp26, label %loop, label %bb27
+
+loop: ; preds = %loop, %.entry
+  br label %loop
+
+bb27: ; preds = %.entry
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+}
+
+; Test that an early uniform export is moved to the unified exit block.
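+; The export in the entry block is found by searching the predecessors of the
+; returning blocks and is then sunk into the unified return block.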
+
+; IR-LABEL: @uniformly_reached_early_export
+; IR-NEXT: .entry:
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+
+; IR: [[LOOP]]:
+; IR-NEXT: br i1 false, label %[[FLOW:.*]], label %[[LOOP]]
 
-; IR: [[FLOW1]]:
+; IR: [[FLOW]]:
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 ; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 true, i1 true)
 ; IR-NEXT: ret void
 
-define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+define amdgpu_ps void @uniformly_reached_early_export(float inreg %tmp25) {
 .entry:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
   %tmp26 = fcmp olt float %tmp25, 0.000000e+00
   br i1 %tmp26, label %loop, label %bb27
 
-loop:                                             ; preds = %loop, %.entry
+loop: ; preds = %loop, %.entry
   br label %loop
 
 bb27:                                             ; preds = %.entry
+  ret void
+}
+
+; Test that incompatible divergent exports are not unified.
+
+; IR-LABEL: @incompatible_divergent_exports
+
+; IR: bb26:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 1, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br
+
+; IR: bb27:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: ret void
+
+define amdgpu_ps void @incompatible_divergent_exports(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 1.000000e+00
+  br i1 %tmp26, label %loop, label %bb25
+
+loop: ; preds = %loop, %.entry
+  br label %loop
+
+bb25: ; preds = %.entry
+  %tmp27 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp27, label %bb26, label %bb27
+
+bb26: ; preds = %bb25
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 1, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+
+bb27: ; preds = %bb25
   call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true)
   ret void
 }